diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index d3a9f247d6..40cf60f3dd 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: "\U0001F41B Bug report" about: Create a bug report to help us improve Milvus -title: "[BUG]" +title: '' labels: '' assignees: '' diff --git a/.github/ISSUE_TEMPLATE/documentation-request.md b/.github/ISSUE_TEMPLATE/documentation-request.md index 1e3193f9f3..133fb9e1e9 100644 --- a/.github/ISSUE_TEMPLATE/documentation-request.md +++ b/.github/ISSUE_TEMPLATE/documentation-request.md @@ -1,7 +1,7 @@ --- name: "\U0001F4DD Documentation request" about: Report incorrect or needed documentation -title: "[DOC]" +title: '' labels: '' assignees: '' diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 24de651b12..01bceb3321 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,7 +1,7 @@ --- name: "\U0001F680 Feature request" about: Suggest an idea for Milvus -title: "[FEATURE]" +title: '' labels: '' assignees: '' diff --git a/.github/ISSUE_TEMPLATE/general-question.md b/.github/ISSUE_TEMPLATE/general-question.md index 32ce5dd701..d49fce1817 100644 --- a/.github/ISSUE_TEMPLATE/general-question.md +++ b/.github/ISSUE_TEMPLATE/general-question.md @@ -1,7 +1,7 @@ --- name: "\U0001F914 General question" about: Ask a general question about Milvus -title: "[QUESTION]" +title: '' labels: '' assignees: '' diff --git a/CHANGELOG.md b/CHANGELOG.md index 12d679c646..c298f3431d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,13 +21,26 @@ Please mark all change in change log and use the ticket from JIRA. - \#440 - Server cannot startup with gpu_resource_config.enable=false in GPU version - \#458 - Index data is not compatible between 0.5 and 0.6 - \#465 - Server hang caused by searching with nsg index +- \#485 - Increase code coverage rate - \#486 - gpu no usage during index building +- \#497 - CPU-version search performance decreased +- \#504 - The code coverage rate of core/src/scheduler/optimizer is too low - \#509 - IVF_PQ index build trapped into dead loop caused by invalid params - \#513 - Unittest DELETE_BY_RANGE sometimes failed +- \#523 - Erase file data from cache once the file is marked as deleted - \#527 - faiss benchmark not compatible with faiss 1.6.0 - \#530 - BuildIndex stop when do build index and search simultaneously +- \#532 - assign value to `table_name` from conftest shell - \#533 - NSG build failed with MetricType Inner Product +- \#543 - client raises exception in shards when search results are empty +- \#545 - Avoid dead loop of build index thread when error occurs +- \#547 - NSG build failed using GPU-edition when gpu_enable is set to false - \#548 - NSG search accuracy is too low +- \#552 - Server down while building index_type: IVF_PQ using GPU-edition +- \#561 - Milvus server should report exception/error message or terminate on mysql metadata backend error +- \#599 - Build index log is incorrect +- \#602 - Optimizer specifies wrong gpu_id +- \#606 - No log generated during building index with CPU ## Feature - \#12 - Pure CPU version for Milvus @@ -36,25 +49,32 @@ Please mark all change in change log and use the ticket from JIRA. 
- \#226 - Experimental shards middleware for Milvus - \#227 - Support new index types SPTAG-KDT and SPTAG-BKT - \#346 - Support build index with multiple gpu +- \#420 - Update shards merge part to match v0.5.3 - \#488 - Add log in scheduler/optimizer - \#502 - C++ SDK support IVFPQ and SPTAG +- \#560 - Add version in server config file +- \#605 - Print more messages when server starts ## Improvement - \#255 - Add ivfsq8 test report detailed version - \#260 - C++ SDK README - \#266 - Rpc request source code refactor +- \#274 - Log the time cost during preloading data - \#275 - Rename C++ SDK IndexType - \#284 - Change C++ SDK to shared library - \#306 - Use int64 for all config integer - \#310 - Add Q&A for 'protocol https not supported or disable in libcurl' issue - \#314 - add Find FAISS in CMake - \#322 - Add option to enable / disable prometheus +- \#354 - Build migration scripts into milvus docker image - \#358 - Add more information in build.sh and install.md - \#404 - Add virtual method Init() in Pass abstract class - \#409 - Add a Fallback pass in optimizer - \#433 - C++ SDK query result is not easy to use - \#449 - Add ShowPartitions example for C++ SDK - \#470 - Small raw files should not be build index +- \#584 - Integrate internal FAISS +- \#611 - Remove MILVUS_CPU_VERSION ## Task diff --git a/README.md b/README.md index c06074277c..47f8bb6c4e 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ Below is a list of Milvus contributors. We greatly appreciate your contributions - [Milvus test reports](https://github.com/milvus-io/milvus/tree/master/docs) +- [Milvus FAQ](https://www.milvus.io/docs/en/faq/operational_faq/) + - [Milvus Medium](https://medium.com/@milvusio) - [Milvus CSDN](https://zilliz.blog.csdn.net/) @@ -79,4 +81,4 @@ Below is a list of Milvus contributors. 
We greatly appreciate your contributions ## License -[Apache License 2.0](LICENSE) \ No newline at end of file +[Apache License 2.0](LICENSE) diff --git a/README_CN.md b/README_CN.md index 1c554ac0d8..4eb360f5fd 100644 --- a/README_CN.md +++ b/README_CN.md @@ -69,6 +69,8 @@ Milvus 提供稳定的 [Python](https://github.com/milvus-io/pymilvus)、[Java]( - [Milvus 测试报告](https://github.com/milvus-io/milvus/tree/master/docs) +- [Milvus 常见问题](https://www.milvus.io/docs/zh-CN/faq/operational_faq/) + - [Milvus Medium](https://medium.com/@milvusio) - [Milvus CSDN](https://zilliz.blog.csdn.net/) diff --git a/README_JP.md b/README_JP.md index 65e68c7bec..6b11da7a62 100644 --- a/README_JP.md +++ b/README_JP.md @@ -61,6 +61,8 @@ C++サンプルコードを実行するために、次のコマンドをつか - [Milvus テストレポート](https://github.com/milvus-io/milvus/tree/master/docs) +- [Milvusのよくある質問](https://www.milvus.io/docs/en/faq/operational_faq/) + - [Milvus Medium](https://medium.com/@milvusio) - [Milvus CSDN](https://zilliz.blog.csdn.net/) @@ -72,4 +74,4 @@ C++サンプルコードを実行するために、次のコマンドをつか ## ライセンス -[Apache 2.0ライセンス](LICENSE) \ No newline at end of file +[Apache 2.0ライセンス](LICENSE) diff --git a/ci/jenkins/Jenkinsfile b/ci/jenkins/Jenkinsfile index 0eed167fb4..10c86a3cca 100644 --- a/ci/jenkins/Jenkinsfile +++ b/ci/jenkins/Jenkinsfile @@ -17,7 +17,7 @@ pipeline { } parameters{ - choice choices: ['Release', 'Debug'], description: '', name: 'BUILD_TYPE' + choice choices: ['Release', 'Debug'], description: 'Build Type', name: 'BUILD_TYPE' string defaultValue: 'registry.zilliz.com', description: 'DOCKER REGISTRY URL', name: 'DOKCER_REGISTRY_URL', trim: true string defaultValue: 'ba070c98-c8cc-4f7c-b657-897715f359fc', description: 'DOCKER CREDENTIALS ID', name: 'DOCKER_CREDENTIALS_ID', trim: true string defaultValue: 'http://192.168.1.202/artifactory/milvus', description: 'JFROG ARTFACTORY URL', name: 'JFROG_ARTFACTORY_URL', trim: true @@ -27,9 +27,8 @@ pipeline { environment { PROJECT_NAME = "milvus" LOWER_BUILD_TYPE = params.BUILD_TYPE.toLowerCase() - SEMVER = "${BRANCH_NAME}" - JOBNAMES = env.JOB_NAME.split('/') - PIPELINE_NAME = "${JOBNAMES[0]}" + SEMVER = "${BRANCH_NAME.contains('/') ? BRANCH_NAME.substring(BRANCH_NAME.lastIndexOf('/') + 1) : BRANCH_NAME}" + PIPELINE_NAME = "${env.JOB_NAME.contains('/') ? 
env.JOB_NAME.getAt(0..(env.JOB_NAME.indexOf('/') - 1)) : env.JOB_NAME}" } stages { @@ -102,7 +101,7 @@ pipeline { stages { stage('Publish') { steps { - container('publish-images'){ + container('publish-images') { script { load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy" } diff --git a/ci/jenkins/internalJenkinsfile.groovy b/ci/jenkins/internalJenkinsfile.groovy new file mode 100644 index 0000000000..4e3b6f963d --- /dev/null +++ b/ci/jenkins/internalJenkinsfile.groovy @@ -0,0 +1,477 @@ +#!/usr/bin/env groovy + +pipeline { + agent none + + options { + timestamps() + } + + parameters{ + choice choices: ['Release', 'Debug'], description: 'Build Type', name: 'BUILD_TYPE' + string defaultValue: 'registry.zilliz.com', description: 'DOCKER REGISTRY URL', name: 'DOKCER_REGISTRY_URL', trim: true + string defaultValue: 'a54e38ef-c424-4ea9-9224-b25fc20e3924', description: 'DOCKER CREDENTIALS ID', name: 'DOCKER_CREDENTIALS_ID', trim: true + string defaultValue: 'http://192.168.1.201/artifactory/milvus', description: 'JFROG ARTFACTORY URL', name: 'JFROG_ARTFACTORY_URL', trim: true + string defaultValue: '76fd48ab-2b8e-4eed-834d-2eefd23bb3a6', description: 'JFROG CREDENTIALS ID', name: 'JFROG_CREDENTIALS_ID', trim: true + } + + environment { + PROJECT_NAME = "milvus" + LOWER_BUILD_TYPE = params.BUILD_TYPE.toLowerCase() + SEMVER = "${BRANCH_NAME.contains('/') ? BRANCH_NAME.substring(BRANCH_NAME.lastIndexOf('/') + 1) : BRANCH_NAME}" + PIPELINE_NAME = "${env.JOB_NAME.contains('/') ? env.JOB_NAME.getAt(0..(env.JOB_NAME.indexOf('/') - 1)) : env.JOB_NAME}" + } + + stages { + stage("Ubuntu 18.04 x86_64") { + environment { + OS_NAME = "ubuntu18.04" + CPU_ARCH = "amd64" + } + + parallel { + stage ("GPU Version") { + environment { + BINRARY_VERSION = "gpu" + PACKAGE_VERSION = VersionNumber([ + versionNumberString : '${SEMVER}-gpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}' + ]); + DOCKER_VERSION = "${SEMVER}-gpu-${OS_NAME}-${LOWER_BUILD_TYPE}" + } + + stages { + stage("Run Build") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-build" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + name: milvus-gpu-build-env + labels: + app: milvus + componet: gpu-build-env +spec: + containers: + - name: milvus-gpu-build-env + image: registry.zilliz.com/milvus/milvus-gpu-build-env:v0.6.0-ubuntu18.04 + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: BUILD_ENV_IMAGE_ID + value: "da9023b0f858f072672f86483a869aa87e90a5140864f89e5a012ec766d96dea" + command: + - cat + tty: true + resources: + limits: + memory: "24Gi" + cpu: "8.0" + nvidia.com/gpu: 1 + requests: + memory: "16Gi" + cpu: "4.0" + - name: milvus-mysql + image: mysql:5.6 + env: + - name: MYSQL_ROOT_PASSWORD + value: 123456 + ports: + - containerPort: 3306 + name: mysql + """ + } + } + + stages { + stage('Build') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/build.groovy" + } + } + } + } + stage('Code Coverage') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/internalCoverage.groovy" + } + } + } + } + stage('Upload Package') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/package.groovy" + } + } + } + } + } + } + + stage("Publish docker images") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-publish" + 
defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: publish + componet: docker +spec: + containers: + - name: publish-images + image: registry.zilliz.com/library/docker:v1.0.0 + securityContext: + privileged: true + command: + - cat + tty: true + volumeMounts: + - name: docker-sock + mountPath: /var/run/docker.sock + volumes: + - name: docker-sock + hostPath: + path: /var/run/docker.sock +""" + } + } + + stages { + stage('Publish') { + steps { + container('publish-images') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy" + } + } + } + } + } + } + + stage("Deploy to Development") { + environment { + FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-") + HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase() + } + + agent { + kubernetes { + label "${env.BINRARY_VERSION}-dev-test" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: milvus + componet: test-env +spec: + containers: + - name: milvus-test-env + image: registry.zilliz.com/milvus/milvus-test-env:v0.1 + command: + - cat + tty: true + volumeMounts: + - name: kubeconf + mountPath: /root/.kube/ + readOnly: true + volumes: + - name: kubeconf + secret: + secretName: test-cluster-config +""" + } + } + + stages { + stage("Deploy to Dev") { + steps { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/deploySingle2Dev.groovy" + } + } + } + } + + stage("Dev Test") { + steps { + container('milvus-test-env') { + script { + boolean isNightlyTest = isTimeTriggeredBuild() + if (isNightlyTest) { + load "${env.WORKSPACE}/ci/jenkins/step/singleDevNightlyTest.groovy" + } else { + load "${env.WORKSPACE}/ci/jenkins/step/singleDevTest.groovy" + } + } + } + } + } + + stage ("Cleanup Dev") { + steps { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + post { + unsuccessful { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + } + + stage ("CPU Version") { + environment { + BINRARY_VERSION = "cpu" + PACKAGE_VERSION = VersionNumber([ + versionNumberString : '${SEMVER}-cpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}' + ]); + DOCKER_VERSION = "${SEMVER}-cpu-${OS_NAME}-${LOWER_BUILD_TYPE}" + } + + stages { + stage("Run Build") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-build" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + name: milvus-cpu-build-env + labels: + app: milvus + componet: cpu-build-env +spec: + containers: + - name: milvus-cpu-build-env + image: registry.zilliz.com/milvus/milvus-cpu-build-env:v0.6.0-ubuntu18.04 + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: BUILD_ENV_IMAGE_ID + value: "23476391bec80c64f10d44a6370c73c71f011a6b95114b10ff82a60e771e11c7" + command: + - cat + tty: true + resources: + limits: + memory: "24Gi" + cpu: "8.0" + requests: + memory: "16Gi" + cpu: "4.0" + - name: milvus-mysql + image: mysql:5.6 + env: + - name: MYSQL_ROOT_PASSWORD + value: 123456 + ports: + - containerPort: 3306 + name: mysql + """ + } + } + + stages { + stage('Build') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/build.groovy" + } + } + } + } + stage('Code Coverage') { + steps { + 
container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/internalCoverage.groovy" + } + } + } + } + stage('Upload Package') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/package.groovy" + } + } + } + } + } + } + + stage("Publish docker images") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-publish" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: publish + componet: docker +spec: + containers: + - name: publish-images + image: registry.zilliz.com/library/docker:v1.0.0 + securityContext: + privileged: true + command: + - cat + tty: true + volumeMounts: + - name: docker-sock + mountPath: /var/run/docker.sock + volumes: + - name: docker-sock + hostPath: + path: /var/run/docker.sock +""" + } + } + + stages { + stage('Publish') { + steps { + container('publish-images'){ + script { + load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy" + } + } + } + } + } + } + + stage("Deploy to Development") { + environment { + FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-") + HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase() + } + + agent { + kubernetes { + label "${env.BINRARY_VERSION}-dev-test" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: milvus + componet: test-env +spec: + containers: + - name: milvus-test-env + image: registry.zilliz.com/milvus/milvus-test-env:v0.1 + command: + - cat + tty: true + volumeMounts: + - name: kubeconf + mountPath: /root/.kube/ + readOnly: true + volumes: + - name: kubeconf + secret: + secretName: test-cluster-config +""" + } + } + + stages { + stage("Deploy to Dev") { + steps { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/deploySingle2Dev.groovy" + } + } + } + } + + stage("Dev Test") { + steps { + container('milvus-test-env') { + script { + boolean isNightlyTest = isTimeTriggeredBuild() + if (isNightlyTest) { + load "${env.WORKSPACE}/ci/jenkins/step/singleDevNightlyTest.groovy" + } else { + load "${env.WORKSPACE}/ci/jenkins/step/singleDevTest.groovy" + } + } + } + } + } + + stage ("Cleanup Dev") { + steps { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + post { + unsuccessful { + container('milvus-test-env') { + script { + load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + } + } + } + } +} + +boolean isTimeTriggeredBuild() { + if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause').size() != 0) { + return true + } + return false +} diff --git a/ci/jenkins/pod/milvus-cpu-version-build-env-pod.yaml b/ci/jenkins/pod/milvus-cpu-version-build-env-pod.yaml index 58eae39061..4631c26e09 100644 --- a/ci/jenkins/pod/milvus-cpu-version-build-env-pod.yaml +++ b/ci/jenkins/pod/milvus-cpu-version-build-env-pod.yaml @@ -21,10 +21,10 @@ spec: tty: true resources: limits: - memory: "32Gi" + memory: "12Gi" cpu: "8.0" requests: - memory: "16Gi" + memory: "8Gi" cpu: "4.0" - name: milvus-mysql image: mysql:5.6 diff --git a/ci/jenkins/pod/milvus-gpu-version-build-env-pod.yaml b/ci/jenkins/pod/milvus-gpu-version-build-env-pod.yaml index bd321a87ae..d4eff370a8 100644 --- a/ci/jenkins/pod/milvus-gpu-version-build-env-pod.yaml +++ b/ci/jenkins/pod/milvus-gpu-version-build-env-pod.yaml @@ -21,11 +21,11 
@@ spec: tty: true resources: limits: - memory: "32Gi" + memory: "12Gi" cpu: "8.0" nvidia.com/gpu: 1 requests: - memory: "16Gi" + memory: "8Gi" cpu: "4.0" - name: milvus-mysql image: mysql:5.6 diff --git a/ci/jenkins/step/build.groovy b/ci/jenkins/step/build.groovy index d39f104a17..e65c6caa23 100644 --- a/ci/jenkins/step/build.groovy +++ b/ci/jenkins/step/build.groovy @@ -3,9 +3,9 @@ timeout(time: 60, unit: 'MINUTES') { withCredentials([usernamePassword(credentialsId: "${params.JFROG_CREDENTIALS_ID}", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def checkResult = sh(script: "./check_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache", returnStatus: true) if ("${env.BINRARY_VERSION}" == "gpu") { - sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -g -u -c" + sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -g -x -u -c" } else { - sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -m -u -c" + sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -u -c" } sh "./update_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache -u ${USERNAME} -p ${PASSWORD}" } diff --git a/ci/jenkins/step/internalCoverage.groovy b/ci/jenkins/step/internalCoverage.groovy new file mode 100644 index 0000000000..762c2cdc8f --- /dev/null +++ b/ci/jenkins/step/internalCoverage.groovy @@ -0,0 +1,6 @@ +timeout(time: 30, unit: 'MINUTES') { + dir ("ci/scripts") { + sh "./coverage.sh -o /opt/milvus -u root -p 123456 -t \$POD_IP" + } +} + diff --git a/ci/scripts/check_ccache.sh b/ci/scripts/check_ccache.sh index 7e7b79f542..2350bd558d 100755 --- a/ci/scripts/check_ccache.sh +++ b/ci/scripts/check_ccache.sh @@ -41,12 +41,12 @@ if [[ -z "${ARTIFACTORY_URL}" || "${ARTIFACTORY_URL}" == "" ]];then exit 1 fi -for BRANCH_NAME in ${BRANCH_NAMES} -do - echo "fetching ${BRANCH_NAME}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" - wget -q --method HEAD "${ARTIFACTORY_URL}/${BRANCH_NAME}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" +check_ccache() { + BRANCH=$1 + echo "fetching ${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" + wget -q --method HEAD "${ARTIFACTORY_URL}/${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" if [[ $? 
== 0 ]];then - wget "${ARTIFACTORY_URL}/${BRANCH_NAME}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" && \ + wget -q "${ARTIFACTORY_URL}/${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" && \ mkdir -p ${CCACHE_DIRECTORY} && \ tar zxf ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz -C ${CCACHE_DIRECTORY} && \ rm ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz @@ -55,6 +55,18 @@ do exit 0 fi fi +} + +if [[ -n "${CHANGE_BRANCH}" && "${BRANCH_NAME}" =~ "PR-" ]];then + check_ccache ${CHANGE_BRANCH} + check_ccache ${BRANCH_NAME} +fi + +for CURRENT_BRANCH in ${BRANCH_NAMES} +do + if [[ "${CURRENT_BRANCH}" != "HEAD" ]];then + check_ccache ${CURRENT_BRANCH} + fi done echo "could not download cache" && exit 1 diff --git a/ci/scripts/update_ccache.sh b/ci/scripts/update_ccache.sh index 4f3243e6a1..f4afc29d1e 100755 --- a/ci/scripts/update_ccache.sh +++ b/ci/scripts/update_ccache.sh @@ -54,14 +54,18 @@ fi PACKAGE_FILE="ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" REMOTE_PACKAGE_PATH="${ARTIFACTORY_URL}/${BRANCH_NAME}" -echo "Updating ccache package file: ${PACKAGE_FILE}" -tar zcf ./${PACKAGE_FILE} -C ${HOME}/.ccache . -echo "Uploading ccache package file ${PACKAGE_FILE} to ${REMOTE_PACKAGE_PATH}" -curl -u${ARTIFACTORY_USER}:${ARTIFACTORY_PASSWORD} -T ${PACKAGE_FILE} ${REMOTE_PACKAGE_PATH}/${PACKAGE_FILE} -if [[ $? == 0 ]];then - echo "Uploading ccache package file success !" - exit 0 -else - echo "Uploading ccache package file fault !" - exit 1 +ccache --show-stats + +if [[ "${BRANCH_NAME}" != "HEAD" ]];then + echo "Updating ccache package file: ${PACKAGE_FILE}" + tar zcf ./${PACKAGE_FILE} -C ${HOME}/.ccache . + echo "Uploading ccache package file ${PACKAGE_FILE} to ${REMOTE_PACKAGE_PATH}" + curl -u${ARTIFACTORY_USER}:${ARTIFACTORY_PASSWORD} -T ${PACKAGE_FILE} ${REMOTE_PACKAGE_PATH}/${PACKAGE_FILE} + if [[ $? == 0 ]];then + echo "Uploading ccache package file success !" + exit 0 + else + echo "Uploading ccache package file fault !" 
+ exit 1 + fi fi diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 01d0e1b9f2..dd482f6464 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -35,15 +35,15 @@ if (NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.") endif () -set (GIT_BRANCH_NAME_REGEX "[0-9]+\\.[0-9]+\\.[0-9]") +set(GIT_BRANCH_NAME_REGEX "[0-9]+\\.[0-9]+\\.[0-9]") MACRO(GET_GIT_BRANCH_NAME GIT_BRANCH_NAME) execute_process(COMMAND sh "-c" "git log --decorate | head -n 1 | sed 's/.*(\\(.*\\))/\\1/' | sed 's/.*, //' | sed 's=[a-zA-Z]*\/==g'" OUTPUT_VARIABLE ${GIT_BRANCH_NAME}) - if(NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}") + if (NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}") execute_process(COMMAND "git" rev-parse --abbrev-ref HEAD OUTPUT_VARIABLE ${GIT_BRANCH_NAME}) endif () - if(NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}") + if (NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}") execute_process(COMMAND "git" symbolic-ref --short -q HEAD HEAD OUTPUT_VARIABLE ${GIT_BRANCH_NAME}) endif () ENDMACRO(GET_GIT_BRANCH_NAME) @@ -79,7 +79,7 @@ if (MILVUS_VERSION_MAJOR STREQUAL "" OR MILVUS_VERSION_PATCH STREQUAL "") message(WARNING "Failed to determine Milvus version from git branch name") set(MILVUS_VERSION "0.6.0") -endif() +endif () message(STATUS "Build version = ${MILVUS_VERSION}") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/version.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/version.h @ONLY) @@ -141,7 +141,11 @@ if (MILVUS_USE_CCACHE) endif (CCACHE_FOUND) endif () -set(MILVUS_CPU_VERSION false) +if (CUSTOMIZATION) + set(MILVUS_GPU_VERSION ON) + add_compile_definitions(CUSTOMIZATION) +endif () + if (MILVUS_GPU_VERSION) message(STATUS "Building Milvus GPU version") add_compile_definitions("MILVUS_GPU_VERSION") @@ -150,8 +154,6 @@ if (MILVUS_GPU_VERSION) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fPIC -std=c++11 -D_FORCE_INLINES --expt-extended-lambda") else () message(STATUS "Building Milvus CPU version") - set(MILVUS_CPU_VERSION true) - add_compile_definitions("MILVUS_CPU_VERSION") endif () if (MILVUS_WITH_PROMETHEUS) @@ -170,10 +172,6 @@ else () endif () endif () -if (CUSTOMIZATION) - add_definitions(-DCUSTOMIZATION) -endif (CUSTOMIZATION) - config_summary() add_subdirectory(src) @@ -187,7 +185,7 @@ endif () add_custom_target(Clean-All COMMAND ${CMAKE_BUILD_TOOL} clean) if ("${MILVUS_DB_PATH}" STREQUAL "") - set(MILVUS_DB_PATH "/tmp/milvus") + set(MILVUS_DB_PATH "${CMAKE_INSTALL_PREFIX}") endif () if (MILVUS_GPU_VERSION) @@ -204,6 +202,11 @@ install(DIRECTORY scripts/ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ FILES_MATCHING PATTERN "*.sh") +install(DIRECTORY scripts/migration + DESTINATION scripts + FILE_PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + GROUP_EXECUTE GROUP_READ + WORLD_EXECUTE WORLD_READ) install(FILES conf/server_config.yaml conf/log_config.conf diff --git a/core/cmake/DefineOptions.cmake b/core/cmake/DefineOptions.cmake index 4f4c8b17b5..394d4d0893 100644 --- a/core/cmake/DefineOptions.cmake +++ b/core/cmake/DefineOptions.cmake @@ -41,10 +41,12 @@ macro(define_option_string name description default) endmacro() #---------------------------------------------------------------------- -set_option_category("GPU version") +set_option_category("Milvus Build Option") define_option(MILVUS_GPU_VERSION "Build GPU version" OFF) +define_option(CUSTOMIZATION "Build with customized FAISS library" OFF) + #---------------------------------------------------------------------- 
set_option_category("Thirdparty") diff --git a/core/conf/server_cpu_config.template b/core/conf/server_cpu_config.template index 41889f5cef..40b8107d39 100644 --- a/core/conf/server_cpu_config.template +++ b/core/conf/server_cpu_config.template @@ -1,5 +1,7 @@ # Default values are used when you make no changes to the following parameters. +version: 0.1 # config version + server_config: address: 0.0.0.0 # milvus server ip address (IPv4) port: 19530 # milvus server port, must in range [1025, 65534] diff --git a/core/conf/server_gpu_config.template b/core/conf/server_gpu_config.template index 531c633da7..678915ca9d 100644 --- a/core/conf/server_gpu_config.template +++ b/core/conf/server_gpu_config.template @@ -1,5 +1,7 @@ # Default values are used when you make no changes to the following parameters. +version: 0.1 # config version + server_config: address: 0.0.0.0 # milvus server ip address (IPv4) port: 19530 # milvus server port, must in range [1025, 65534] diff --git a/core/src/cache/Cache.inl b/core/src/cache/Cache.inl index 9ac7ff21e6..9ebec7cfdd 100644 --- a/core/src/cache/Cache.inl +++ b/core/src/cache/Cache.inl @@ -99,8 +99,8 @@ Cache::insert(const std::string& key, const ItemObj& item) { std::lock_guard lock(mutex_); lru_.put(key, item); - SERVER_LOG_DEBUG << "Insert " << key << " size:" << item->Size() << " bytes into cache, usage: " << usage_ - << " bytes"; + SERVER_LOG_DEBUG << "Insert " << key << " size: " << item->Size() << " bytes into cache, usage: " << usage_ + << " bytes," << " capacity: " << capacity_ << " bytes"; } } @@ -115,7 +115,8 @@ Cache::erase(const std::string& key) { const ItemObj& old_item = lru_.get(key); usage_ -= old_item->Size(); - SERVER_LOG_DEBUG << "Erase " << key << " size: " << old_item->Size(); + SERVER_LOG_DEBUG << "Erase " << key << " size: " << old_item->Size() << " bytes from cache, usage: " << usage_ + << " bytes," << " capacity: " << capacity_ << " bytes"; lru_.erase(key); } diff --git a/core/src/db/DBImpl.cpp b/core/src/db/DBImpl.cpp index 1c18c22409..67769717c4 100644 --- a/core/src/db/DBImpl.cpp +++ b/core/src/db/DBImpl.cpp @@ -41,6 +41,7 @@ #include #include #include +#include namespace milvus { namespace engine { @@ -51,6 +52,8 @@ constexpr uint64_t METRIC_ACTION_INTERVAL = 1; constexpr uint64_t COMPACT_ACTION_INTERVAL = 1; constexpr uint64_t INDEX_ACTION_INTERVAL = 1; +constexpr uint64_t INDEX_FAILED_RETRY_TIME = 1; + static const Status SHUTDOWN_ERROR = Status(DB_ERROR, "Milsvus server is shutdown!"); void @@ -112,7 +115,7 @@ DBImpl::Stop() { bg_timer_thread_.join(); if (options_.mode_ != DBOptions::MODE::CLUSTER_READONLY) { - meta_ptr_->CleanUp(); + meta_ptr_->CleanUpShadowFiles(); } // ENGINE_LOG_TRACE << "DB service stop"; @@ -179,7 +182,7 @@ DBImpl::PreloadTable(const std::string& table_id) { return SHUTDOWN_ERROR; } - // get all table files from parent table + // step 1: get all table files from parent table meta::DatesT dates; std::vector ids; meta::TableFilesSchema files_array; @@ -188,7 +191,7 @@ DBImpl::PreloadTable(const std::string& table_id) { return status; } - // get files from partition tables + // step 2: get files from partition tables std::vector partiton_array; status = meta_ptr_->ShowPartitions(table_id, partiton_array); for (auto& schema : partiton_array) { @@ -200,6 +203,10 @@ DBImpl::PreloadTable(const std::string& table_id) { int64_t cache_usage = cache::CpuCacheMgr::GetInstance()->CacheUsage(); int64_t available_size = cache_total - cache_usage; + // step 3: load file one by one + ENGINE_LOG_DEBUG << "Begin pre-load 
table:" + table_id + ", totally " << files_array.size() + << " files need to be pre-loaded"; + TimeRecorderAuto rc("Pre-load table:" + table_id); for (auto& file : files_array) { ExecutionEnginePtr engine = EngineFactory::Build(file.dimension_, file.location_, (EngineType)file.engine_type_, (MetricType)file.metric_type_, file.nlist_); @@ -210,10 +217,12 @@ DBImpl::PreloadTable(const std::string& table_id) { size += engine->PhysicalSize(); if (size > available_size) { + ENGINE_LOG_DEBUG << "Pre-load canceled since cache almost full"; return Status(SERVER_CACHE_FULL, "Cache is full"); } else { try { - // step 1: load index + std::string msg = "Pre-loaded file: " + file.file_id_ + " size: " + std::to_string(file.file_size_); + TimeRecorderAuto rc_1(msg); engine->Load(true); } catch (std::exception& ex) { std::string msg = "Pre-load table encounter exception: " + std::string(ex.what()); @@ -361,6 +370,7 @@ DBImpl::CreateIndex(const std::string& table_id, const TableIndex& index) { WaitMergeFileFinish(); // step 4: wait and build index + status = CleanFailedIndexFileOfTable(table_id); status = BuildTableIndexRecursively(table_id, index); return status; @@ -777,11 +787,18 @@ DBImpl::BackgroundCompaction(std::set table_ids) { meta_ptr_->Archive(); - int ttl = 5 * meta::M_SEC; // default: file will be deleted after 5 minutes - if (options_.mode_ == DBOptions::MODE::CLUSTER_WRITABLE) { - ttl = meta::D_SEC; + { + uint64_t ttl = 10 * meta::SECOND; // default: file data will be erase from cache after few seconds + meta_ptr_->CleanUpCacheWithTTL(ttl); + } + + { + uint64_t ttl = 5 * meta::M_SEC; // default: file will be deleted after few minutes + if (options_.mode_ == DBOptions::MODE::CLUSTER_WRITABLE) { + ttl = meta::D_SEC; + } + meta_ptr_->CleanUpFilesWithTTL(ttl); } - meta_ptr_->CleanUpFilesWithTTL(ttl); // ENGINE_LOG_TRACE << " Background compaction thread exit"; } @@ -821,22 +838,35 @@ DBImpl::BackgroundBuildIndex() { std::unique_lock lock(build_index_mutex_); meta::TableFilesSchema to_index_files; meta_ptr_->FilesToIndex(to_index_files); - Status status; + Status status = IgnoreFailedIndexFiles(to_index_files); if (!to_index_files.empty()) { - scheduler::BuildIndexJobPtr job = std::make_shared(meta_ptr_, options_); - // step 2: put build index task to scheduler + std::vector> job2file_map; for (auto& file : to_index_files) { + scheduler::BuildIndexJobPtr job = std::make_shared(meta_ptr_, options_); scheduler::TableFileSchemaPtr file_ptr = std::make_shared(file); job->AddToIndexFiles(file_ptr); + scheduler::JobMgrInst::GetInstance()->Put(job); + job2file_map.push_back(std::make_pair(job, file_ptr)); } - scheduler::JobMgrInst::GetInstance()->Put(job); - job->WaitBuildIndexFinish(); - if (!job->GetStatus().ok()) { - Status status = job->GetStatus(); - ENGINE_LOG_ERROR << "Building index failed: " << status.ToString(); + + for (auto iter = job2file_map.begin(); iter != job2file_map.end(); ++iter) { + scheduler::BuildIndexJobPtr job = iter->first; + meta::TableFileSchema& file_schema = *(iter->second.get()); + job->WaitBuildIndexFinish(); + if (!job->GetStatus().ok()) { + Status status = job->GetStatus(); + ENGINE_LOG_ERROR << "Building index job " << job->id() << " failed: " << status.ToString(); + + MarkFailedIndexFile(file_schema); + } else { + MarkSucceedIndexFile(file_schema); + ENGINE_LOG_DEBUG << "Building index job " << job->id() << " succeed."; + } } + + ENGINE_LOG_DEBUG << "Background build index thread finished"; } // ENGINE_LOG_TRACE << "Background build index thread exit"; @@ -904,6 
+934,7 @@ DBImpl::DropTableRecursively(const std::string& table_id, const meta::DatesT& da if (dates.empty()) { status = mem_mgr_->EraseMemVector(table_id); // not allow insert status = meta_ptr_->DropTable(table_id); // soft delete table + CleanFailedIndexFileOfTable(table_id); // scheduler will determine when to delete table files auto nres = scheduler::ResMgrInst::GetInstance()->GetNumOfComputeResource(); @@ -982,6 +1013,8 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex std::this_thread::sleep_for(std::chrono::milliseconds(std::min(10 * 1000, times * 100))); GetFilesToBuildIndex(table_id, file_types, table_files); times++; + + IgnoreFailedIndexFiles(table_files); } // build index for partition @@ -994,12 +1027,27 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex } } + // failed to build index for some files, return error + std::vector failed_files; + GetFailedIndexFileOfTable(table_id, failed_files); + if (!failed_files.empty()) { + std::string msg = "Failed to build index for " + std::to_string(failed_files.size()) + + ((failed_files.size() == 1) ? " file" : " files"); +#ifdef MILVUS_GPU_VERSION + msg += ", file size is too large or gpu memory is not enough."; +#else + msg += ", please double check index parameters."; +#endif + return Status(DB_ERROR, msg); + } + return Status::OK(); } Status DBImpl::DropTableIndexRecursively(const std::string& table_id) { ENGINE_LOG_DEBUG << "Drop index for table: " << table_id; + CleanFailedIndexFileOfTable(table_id); auto status = meta_ptr_->DropTableIndex(table_id); if (!status.ok()) { return status; @@ -1042,5 +1090,86 @@ DBImpl::GetTableRowCountRecursively(const std::string& table_id, uint64_t& row_c return Status::OK(); } +Status +DBImpl::CleanFailedIndexFileOfTable(const std::string& table_id) { + std::lock_guard lck(index_failed_mutex_); + index_failed_files_.erase(table_id); // rebuild failed index files for this table + + return Status::OK(); +} + +Status +DBImpl::GetFailedIndexFileOfTable(const std::string& table_id, std::vector& failed_files) { + failed_files.clear(); + std::lock_guard lck(index_failed_mutex_); + auto iter = index_failed_files_.find(table_id); + if (iter != index_failed_files_.end()) { + FileID2FailedTimes& failed_map = iter->second; + for (auto it_file = failed_map.begin(); it_file != failed_map.end(); ++it_file) { + failed_files.push_back(it_file->first); + } + } + + return Status::OK(); +} + +Status +DBImpl::MarkFailedIndexFile(const meta::TableFileSchema& file) { + std::lock_guard lck(index_failed_mutex_); + + auto iter = index_failed_files_.find(file.table_id_); + if (iter == index_failed_files_.end()) { + FileID2FailedTimes failed_files; + failed_files.insert(std::make_pair(file.file_id_, 1)); + index_failed_files_.insert(std::make_pair(file.table_id_, failed_files)); + } else { + auto it_failed_files = iter->second.find(file.file_id_); + if (it_failed_files != iter->second.end()) { + it_failed_files->second++; + } else { + iter->second.insert(std::make_pair(file.file_id_, 1)); + } + } + + return Status::OK(); +} + +Status +DBImpl::MarkSucceedIndexFile(const meta::TableFileSchema& file) { + std::lock_guard lck(index_failed_mutex_); + + auto iter = index_failed_files_.find(file.table_id_); + if (iter != index_failed_files_.end()) { + iter->second.erase(file.file_id_); + } + + return Status::OK(); +} + +Status +DBImpl::IgnoreFailedIndexFiles(meta::TableFilesSchema& table_files) { + std::lock_guard lck(index_failed_mutex_); + + // there could be some 
failed files belonging to different tables. + // some files may have failed several times, no need to build index for these files. + // thus we can avoid a dead loop in the build index operation + for (auto it_file = table_files.begin(); it_file != table_files.end();) { + auto it_failed_files = index_failed_files_.find((*it_file).table_id_); + if (it_failed_files != index_failed_files_.end()) { + auto it_failed_file = it_failed_files->second.find((*it_file).file_id_); + if (it_failed_file != it_failed_files->second.end()) { + if (it_failed_file->second >= INDEX_FAILED_RETRY_TIME) { + it_file = table_files.erase(it_file); + continue; + } + } + } + + ++it_file; + } + + return Status::OK(); +} + } // namespace engine } // namespace milvus diff --git a/core/src/db/DBImpl.h b/core/src/db/DBImpl.h index 82a5d3096b..3baac92c0a 100644 --- a/core/src/db/DBImpl.h +++ b/core/src/db/DBImpl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,6 @@ namespace milvus { namespace engine { -class Env; - namespace meta { class Meta; } @@ -179,6 +178,21 @@ class DBImpl : public DB { Status GetTableRowCountRecursively(const std::string& table_id, uint64_t& row_count); + Status + CleanFailedIndexFileOfTable(const std::string& table_id); + + Status + GetFailedIndexFileOfTable(const std::string& table_id, std::vector& failed_files); + + Status + MarkFailedIndexFile(const meta::TableFileSchema& file); + + Status + MarkSucceedIndexFile(const meta::TableFileSchema& file); + + Status + IgnoreFailedIndexFiles(meta::TableFilesSchema& table_files); + private: const DBOptions options_; @@ -200,7 +214,11 @@ class DBImpl : public DB { std::list> index_thread_results_; std::mutex build_index_mutex_; -}; // DBImpl + std::mutex index_failed_mutex_; + using FileID2FailedTimes = std::map; + using Table2FailedFiles = std::map; + Table2FailedFiles index_failed_files_; // file id mapping to failed times +}; // DBImpl } // namespace engine } // namespace milvus diff --git a/core/src/db/Utils.cpp b/core/src/db/Utils.cpp index 0ddf03568a..9689f496cf 100644 --- a/core/src/db/Utils.cpp +++ b/core/src/db/Utils.cpp @@ -154,7 +154,9 @@ GetTableFilePath(const DBMetaOptions& options, meta::TableFileSchema& table_file } std::string msg = "Table file doesn't exist: " + file_path; - ENGINE_LOG_ERROR << msg << " in path: " << options.path_ << " for table: " << table_file.table_id_; + if (table_file.file_size_ > 0) { // no need to report error for empty file + ENGINE_LOG_ERROR << msg << " in path: " << options.path_ << " for table: " << table_file.table_id_; + } return Status(DB_ERROR, msg); } diff --git a/core/src/db/engine/ExecutionEngine.h b/core/src/db/engine/ExecutionEngine.h index c8784e8a90..9f3e6a86db 100644 --- a/core/src/db/engine/ExecutionEngine.h +++ b/core/src/db/engine/ExecutionEngine.h @@ -77,8 +77,8 @@ class ExecutionEngine { virtual Status CopyToCpu() = 0; - virtual std::shared_ptr - Clone() = 0; + // virtual std::shared_ptr + // Clone() = 0; virtual Status Merge(const std::string& location) = 0; diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index c0ab4e829e..00bc548c06 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -93,18 +93,18 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) { break; } case EngineType::FAISS_IVFFLAT: { -#ifdef MILVUS_CPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_CPU); -#else +#ifdef MILVUS_GPU_VERSION index = 
GetVecIndexFactory(IndexType::FAISS_IVFFLAT_MIX); +#else + index = GetVecIndexFactory(IndexType::FAISS_IVFFLAT_CPU); #endif break; } case EngineType::FAISS_IVFSQ8: { -#ifdef MILVUS_CPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_CPU); -#else +#ifdef MILVUS_GPU_VERSION index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_MIX); +#else + index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_CPU); #endif break; } @@ -112,15 +112,17 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) { index = GetVecIndexFactory(IndexType::NSG_MIX); break; } +#ifdef CUSTOMIZATION case EngineType::FAISS_IVFSQ8H: { index = GetVecIndexFactory(IndexType::FAISS_IVFSQ8_HYBRID); break; } +#endif case EngineType::FAISS_PQ: { -#ifdef MILVUS_CPU_VERSION - index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_CPU); -#else +#ifdef MILVUS_GPU_VERSION index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_MIX); +#else + index = GetVecIndexFactory(IndexType::FAISS_IVFPQ_CPU); #endif break; } @@ -257,6 +259,11 @@ ExecutionEngineImpl::PhysicalSize() const { Status ExecutionEngineImpl::Serialize() { auto status = write_index(index_, location_); + + // here we reset index size by file size, + // since some index type(such as SQ8) data size become smaller after serialized + index_->set_size(PhysicalSize()); + return status; } @@ -410,18 +417,18 @@ ExecutionEngineImpl::CopyToCpu() { return Status::OK(); } -ExecutionEnginePtr -ExecutionEngineImpl::Clone() { - if (index_ == nullptr) { - ENGINE_LOG_ERROR << "ExecutionEngineImpl: index is null, failed to clone"; - return nullptr; - } - - auto ret = std::make_shared(dim_, location_, index_type_, metric_type_, nlist_); - ret->Init(); - ret->index_ = index_->Clone(); - return ret; -} +// ExecutionEnginePtr +// ExecutionEngineImpl::Clone() { +// if (index_ == nullptr) { +// ENGINE_LOG_ERROR << "ExecutionEngineImpl: index is null, failed to clone"; +// return nullptr; +// } +// +// auto ret = std::make_shared(dim_, location_, index_type_, metric_type_, nlist_); +// ret->Init(); +// ret->index_ = index_->Clone(); +// return ret; +//} Status ExecutionEngineImpl::Merge(const std::string& location) { @@ -604,6 +611,9 @@ ExecutionEngineImpl::Init() { server::Config& config = server::Config::GetInstance(); std::vector gpu_ids; Status s = config.GetGpuResourceConfigBuildIndexResources(gpu_ids); + if (!s.ok()) { + gpu_num_ = knowhere::INVALID_VALUE; + } for (auto id : gpu_ids) { if (gpu_num_ == id) { return Status::OK(); diff --git a/core/src/db/engine/ExecutionEngineImpl.h b/core/src/db/engine/ExecutionEngineImpl.h index da0e7cfb64..84604f519e 100644 --- a/core/src/db/engine/ExecutionEngineImpl.h +++ b/core/src/db/engine/ExecutionEngineImpl.h @@ -64,8 +64,8 @@ class ExecutionEngineImpl : public ExecutionEngine { Status CopyToCpu() override; - ExecutionEnginePtr - Clone() override; + // ExecutionEnginePtr + // Clone() override; Status Merge(const std::string& location) override; diff --git a/core/src/db/meta/Meta.h b/core/src/db/meta/Meta.h index 52fe86fe69..bf46f02fea 100644 --- a/core/src/db/meta/Meta.h +++ b/core/src/db/meta/Meta.h @@ -118,9 +118,13 @@ class Meta { Archive() = 0; virtual Status - CleanUp() = 0; + CleanUpShadowFiles() = 0; - virtual Status CleanUpFilesWithTTL(uint16_t) = 0; + virtual Status + CleanUpCacheWithTTL(uint64_t seconds) = 0; + + virtual Status + CleanUpFilesWithTTL(uint64_t seconds) = 0; virtual Status DropAll() = 0; diff --git a/core/src/db/meta/MySQLMetaImpl.cpp b/core/src/db/meta/MySQLMetaImpl.cpp index 6d13cad248..dcf3824fe1 100644 --- 
a/core/src/db/meta/MySQLMetaImpl.cpp +++ b/core/src/db/meta/MySQLMetaImpl.cpp @@ -20,6 +20,7 @@ #include "db/IDGenerator.h" #include "db/Utils.h" #include "metrics/Metrics.h" +#include "utils/CommonUtil.h" #include "utils/Exception.h" #include "utils/Log.h" #include "utils/StringHelpFunctions.h" @@ -289,45 +290,50 @@ MySQLMetaImpl::Initialize() { // step 4: validate to avoid open old version schema ValidateMetaSchema(); - // step 5: create meta tables - try { - if (mode_ != DBOptions::MODE::CLUSTER_READONLY) { - CleanUp(); - } + // step 5: clean shadow files + if (mode_ != DBOptions::MODE::CLUSTER_READONLY) { + CleanUpShadowFiles(); + } - { - mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_); + // step 6: try connect mysql server + mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_); - if (connectionPtr == nullptr) { - return Status(DB_ERROR, "Failed to connect to meta server(mysql)"); - } + if (connectionPtr == nullptr) { + std::string msg = "Failed to connect MySQL meta server: " + uri; + ENGINE_LOG_ERROR << msg; + throw Exception(DB_INVALID_META_URI, msg); + } - if (!connectionPtr->thread_aware()) { - ENGINE_LOG_ERROR << "MySQL++ wasn't built with thread awareness! Can't run without it."; - return Status(DB_ERROR, "MySQL++ wasn't built with thread awareness! Can't run without it."); - } - mysqlpp::Query InitializeQuery = connectionPtr->query(); + if (!connectionPtr->thread_aware()) { + std::string msg = + "Failed to initialize MySQL meta backend: MySQL client component wasn't built with thread awareness"; + ENGINE_LOG_ERROR << msg; + throw Exception(DB_INVALID_META_URI, msg); + } - InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLES_SCHEMA.name() << " (" - << TABLES_SCHEMA.ToString() + ");"; + // step 7: create meta table Tables + mysqlpp::Query InitializeQuery = connectionPtr->query(); - ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str(); + InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLES_SCHEMA.name() << " (" << TABLES_SCHEMA.ToString() + ");"; - if (!InitializeQuery.exec()) { - return HandleException("Initialization Error", InitializeQuery.error()); - } + ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str(); - InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLEFILES_SCHEMA.name() << " (" - << TABLEFILES_SCHEMA.ToString() + ");"; + if (!InitializeQuery.exec()) { + std::string msg = "Failed to create meta table 'Tables' in MySQL"; + ENGINE_LOG_ERROR << msg; + throw Exception(DB_META_TRANSACTION_FAILED, msg); + } - ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str(); + // step 8: create meta table TableFiles + InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLEFILES_SCHEMA.name() << " (" + << TABLEFILES_SCHEMA.ToString() + ");"; - if (!InitializeQuery.exec()) { - return HandleException("Initialization Error", InitializeQuery.error()); - } - } // Scoped Connection - } catch (std::exception& e) { - return HandleException("GENERAL ERROR DURING INITIALIZATION", e.what()); + ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str(); + + if (!InitializeQuery.exec()) { + std::string msg = "Failed to create meta table 'TableFiles' in MySQL"; + ENGINE_LOG_ERROR << msg; + throw Exception(DB_META_TRANSACTION_FAILED, msg); } return Status::OK(); @@ -1609,10 +1615,35 @@ MySQLMetaImpl::FilesByType(const std::string& table_id, const std::vector& } } - ENGINE_LOG_DEBUG << "Table " << table_id << " currently has raw files:" << raw_count - 
<< " new files:" << new_count << " new_merge files:" << new_merge_count - << " new_index files:" << new_index_count << " to_index files:" << to_index_count - << " index files:" << index_count << " backup files:" << backup_count; + std::string msg = "Get table files by type."; + for (int file_type : file_types) { + switch (file_type) { + case (int)TableFileSchema::RAW: + msg = msg + " raw files:" + std::to_string(raw_count); + break; + case (int)TableFileSchema::NEW: + msg = msg + " new files:" + std::to_string(new_count); + break; + case (int)TableFileSchema::NEW_MERGE: + msg = msg + " new_merge files:" + std::to_string(new_merge_count); + break; + case (int)TableFileSchema::NEW_INDEX: + msg = msg + " new_index files:" + std::to_string(new_index_count); + break; + case (int)TableFileSchema::TO_INDEX: + msg = msg + " to_index files:" + std::to_string(to_index_count); + break; + case (int)TableFileSchema::INDEX: + msg = msg + " index files:" + std::to_string(index_count); + break; + case (int)TableFileSchema::BACKUP: + msg = msg + " backup files:" + std::to_string(backup_count); + break; + default: + break; + } + } + ENGINE_LOG_DEBUG << msg; } } catch (std::exception& e) { return HandleException("GENERAL ERROR WHEN GET FILE BY TYPE", e.what()); @@ -1710,7 +1741,7 @@ MySQLMetaImpl::Size(uint64_t& result) { } Status -MySQLMetaImpl::CleanUp() { +MySQLMetaImpl::CleanUpShadowFiles() { try { mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_); @@ -1752,7 +1783,49 @@ MySQLMetaImpl::CleanUp() { } Status -MySQLMetaImpl::CleanUpFilesWithTTL(uint16_t seconds) { +MySQLMetaImpl::CleanUpCacheWithTTL(uint64_t seconds) { + auto now = utils::GetMicroSecTimeStamp(); + + // erase deleted/backup files from cache + try { + server::MetricCollector metric; + + mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_); + + if (connectionPtr == nullptr) { + return Status(DB_ERROR, "Failed to connect to meta server(mysql)"); + } + + mysqlpp::Query cleanUpFilesWithTTLQuery = connectionPtr->query(); + cleanUpFilesWithTTLQuery << "SELECT id, table_id, file_id, date" + << " FROM " << META_TABLEFILES << " WHERE file_type IN (" + << std::to_string(TableFileSchema::TO_DELETE) << "," + << std::to_string(TableFileSchema::BACKUP) << ")" + << " AND updated_time < " << std::to_string(now - seconds * US_PS) << ";"; + + mysqlpp::StoreQueryResult res = cleanUpFilesWithTTLQuery.store(); + + TableFileSchema table_file; + std::vector idsToDelete; + + for (auto& resRow : res) { + table_file.id_ = resRow["id"]; // implicit conversion + resRow["table_id"].to_string(table_file.table_id_); + resRow["file_id"].to_string(table_file.file_id_); + table_file.date_ = resRow["date"]; + + utils::GetTableFilePath(options_, table_file); + server::CommonUtil::EraseFromCache(table_file.location_); + } + } catch (std::exception& e) { + return HandleException("GENERAL ERROR WHEN CLEANING UP FILES WITH TTL", e.what()); + } + + return Status::OK(); +} + +Status +MySQLMetaImpl::CleanUpFilesWithTTL(uint64_t seconds) { auto now = utils::GetMicroSecTimeStamp(); std::set table_ids; diff --git a/core/src/db/meta/MySQLMetaImpl.h b/core/src/db/meta/MySQLMetaImpl.h index dd882fca2e..e7697316af 100644 --- a/core/src/db/meta/MySQLMetaImpl.h +++ b/core/src/db/meta/MySQLMetaImpl.h @@ -117,10 +117,13 @@ class MySQLMetaImpl : public Meta { Size(uint64_t& result) override; Status - CleanUp() override; + CleanUpShadowFiles() override; Status - CleanUpFilesWithTTL(uint16_t seconds) override; + CleanUpCacheWithTTL(uint64_t 
seconds) override; + + Status + CleanUpFilesWithTTL(uint64_t seconds) override; Status DropAll() override; diff --git a/core/src/db/meta/SqliteMetaImpl.cpp b/core/src/db/meta/SqliteMetaImpl.cpp index 74460c1b4d..07f890d50a 100644 --- a/core/src/db/meta/SqliteMetaImpl.cpp +++ b/core/src/db/meta/SqliteMetaImpl.cpp @@ -20,6 +20,7 @@ #include "db/IDGenerator.h" #include "db/Utils.h" #include "metrics/Metrics.h" +#include "utils/CommonUtil.h" #include "utils/Exception.h" #include "utils/Log.h" #include "utils/StringHelpFunctions.h" @@ -154,7 +155,7 @@ SqliteMetaImpl::Initialize() { ConnectorPtr->open_forever(); // thread safe option ConnectorPtr->pragma.journal_mode(journal_mode::WAL); // WAL => write ahead log - CleanUp(); + CleanUpShadowFiles(); return Status::OK(); } @@ -1156,10 +1157,34 @@ SqliteMetaImpl::FilesByType(const std::string& table_id, table_files.emplace_back(file_schema); } - ENGINE_LOG_DEBUG << "Table " << table_id << " currently has raw files:" << raw_count - << " new files:" << new_count << " new_merge files:" << new_merge_count - << " new_index files:" << new_index_count << " to_index files:" << to_index_count - << " index files:" << index_count << " backup files:" << backup_count; + std::string msg = "Get table files by type."; + for (int file_type : file_types) { + switch (file_type) { + case (int)TableFileSchema::RAW: + msg = msg + " raw files:" + std::to_string(raw_count); + break; + case (int)TableFileSchema::NEW: + msg = msg + " new files:" + std::to_string(new_count); + break; + case (int)TableFileSchema::NEW_MERGE: + msg = msg + " new_merge files:" + std::to_string(new_merge_count); + break; + case (int)TableFileSchema::NEW_INDEX: + msg = msg + " new_index files:" + std::to_string(new_index_count); + break; + case (int)TableFileSchema::TO_INDEX: + msg = msg + " to_index files:" + std::to_string(to_index_count); + break; + case (int)TableFileSchema::INDEX: + msg = msg + " index files:" + std::to_string(index_count); + break; + case (int)TableFileSchema::BACKUP: + msg = msg + " backup files:" + std::to_string(backup_count); + break; + default:break; + } + } + ENGINE_LOG_DEBUG << msg; } } catch (std::exception& e) { return HandleException("Encounter exception when check non index files", e.what()); @@ -1231,7 +1256,7 @@ SqliteMetaImpl::Size(uint64_t& result) { } Status -SqliteMetaImpl::CleanUp() { +SqliteMetaImpl::CleanUpShadowFiles() { try { server::MetricCollector metric; @@ -1269,7 +1294,51 @@ SqliteMetaImpl::CleanUp() { } Status -SqliteMetaImpl::CleanUpFilesWithTTL(uint16_t seconds) { +SqliteMetaImpl::CleanUpCacheWithTTL(uint64_t seconds) { + auto now = utils::GetMicroSecTimeStamp(); + + // erase deleted/backup files from cache + try { + server::MetricCollector metric; + + // multi-threads call sqlite update may get exception('bad logic', etc), so we add a lock here + std::lock_guard meta_lock(meta_mutex_); + + std::vector file_types = { + (int)TableFileSchema::TO_DELETE, + (int)TableFileSchema::BACKUP, + }; + + auto files = ConnectorPtr->select(columns(&TableFileSchema::id_, + &TableFileSchema::table_id_, + &TableFileSchema::file_id_, + &TableFileSchema::date_), + where( + in(&TableFileSchema::file_type_, file_types) + and + c(&TableFileSchema::updated_time_) + < now - seconds * US_PS)); + + for (auto& file : files) { + TableFileSchema table_file; + table_file.id_ = std::get<0>(file); + table_file.table_id_ = std::get<1>(file); + table_file.file_id_ = std::get<2>(file); + table_file.date_ = std::get<3>(file); + + utils::GetTableFilePath(options_, table_file); + 
server::CommonUtil::EraseFromCache(table_file.location_); + } + + } catch (std::exception& e) { + return HandleException("Encounter exception when clean cache", e.what()); + } + + return Status::OK(); +} + +Status +SqliteMetaImpl::CleanUpFilesWithTTL(uint64_t seconds) { auto now = utils::GetMicroSecTimeStamp(); std::set table_ids; diff --git a/core/src/db/meta/SqliteMetaImpl.h b/core/src/db/meta/SqliteMetaImpl.h index 8e821d81de..5581efe361 100644 --- a/core/src/db/meta/SqliteMetaImpl.h +++ b/core/src/db/meta/SqliteMetaImpl.h @@ -117,10 +117,13 @@ class SqliteMetaImpl : public Meta { Archive() override; Status - CleanUp() override; + CleanUpShadowFiles() override; Status - CleanUpFilesWithTTL(uint16_t seconds) override; + CleanUpCacheWithTTL(uint64_t seconds) override; + + Status + CleanUpFilesWithTTL(uint64_t seconds) override; Status DropAll() override; diff --git a/core/src/grpc/README.md b/core/src/grpc/README.md deleted file mode 100644 index 44c4e90841..0000000000 --- a/core/src/grpc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -We manually change two APIs in "milvus.pd.h": - add_vector_data() - add_row_id_array() - add_ids() - add_distances() -If proto files need be generated again, remember to re-change above APIs. \ No newline at end of file diff --git a/core/src/index/CMakeLists.txt b/core/src/index/CMakeLists.txt index 53453d53aa..6e58566dd0 100644 --- a/core/src/index/CMakeLists.txt +++ b/core/src/index/CMakeLists.txt @@ -72,6 +72,11 @@ include(ExternalProject) include(DefineOptionsCore) include(BuildUtilsCore) +if (CUSTOMIZATION) + set(MILVUS_GPU_VERSION ON) + add_compile_definitions(CUSTOMIZATION) +endif () + set(KNOWHERE_CPU_VERSION false) if (MILVUS_GPU_VERSION OR KNOWHERE_GPU_VERSION) message(STATUS "Building Knowhere GPU version") diff --git a/core/src/index/cmake/DefineOptionsCore.cmake b/core/src/index/cmake/DefineOptionsCore.cmake index 787a9c484f..ccf0048e46 100644 --- a/core/src/index/cmake/DefineOptionsCore.cmake +++ b/core/src/index/cmake/DefineOptionsCore.cmake @@ -49,6 +49,8 @@ else () define_option(KNOWHERE_GPU_VERSION "Build GPU version" OFF) endif () +define_option(CUSTOMIZATION "Build with customized FAISS library" OFF) + #---------------------------------------------------------------------- set_option_category("Thirdparty") diff --git a/core/src/index/cmake/ThirdPartyPackagesCore.cmake b/core/src/index/cmake/ThirdPartyPackagesCore.cmake index 1a22a9d2be..624f1f422e 100644 --- a/core/src/index/cmake/ThirdPartyPackagesCore.cmake +++ b/core/src/index/cmake/ThirdPartyPackagesCore.cmake @@ -225,11 +225,11 @@ foreach (_VERSION_ENTRY ${TOOLCHAIN_VERSIONS_TXT}) set(${_LIB_NAME} "${_LIB_VERSION}") endforeach () +set(FAISS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/faiss) if (DEFINED ENV{FAISS_SOURCE_URL}) set(FAISS_SOURCE_URL "$ENV{FAISS_SOURCE_URL}") else () set(FAISS_SOURCE_URL "https://github.com/JinHai-CN/faiss/archive/${FAISS_VERSION}.tar.gz") - set(FAISS_MD5 "b02c1a53234f5acc9bea1b0c55524f50") endif () if (DEFINED ENV{KNOWHERE_ARROW_URL}) @@ -708,7 +708,7 @@ macro(build_faiss) set(FAISS_CONFIGURE_ARGS "--prefix=${FAISS_PREFIX}" "CFLAGS=${EP_C_FLAGS}" - "CXXFLAGS=${EP_CXX_FLAGS}" + "CXXFLAGS=${EP_CXX_FLAGS} -mavx2 -mf16c" --without-python) if (FAISS_WITH_MKL) @@ -737,12 +737,12 @@ macro(build_faiss) set(FAISS_COMPUTE_TYPE "gpu") else () set(FAISS_COMPUTE_TYPE "cpu") - endif() + endif () if (FAISS_WITH_MKL) set(FAISS_CACHE_PACKAGE_NAME "faiss_${FAISS_COMPUTE_TYPE}_mkl_${FAISS_COMBINE_MD5}.tar.gz") else () set(FAISS_CACHE_PACKAGE_NAME 
"faiss_${FAISS_COMPUTE_TYPE}_openblas_${FAISS_COMBINE_MD5}.tar.gz") - endif() + endif () set(FAISS_CACHE_URL "${JFROG_ARTFACTORY_CACHE_URL}/${FAISS_CACHE_PACKAGE_NAME}") set(FAISS_CACHE_PACKAGE_PATH "${THIRDPARTY_PACKAGE_CACHE}/${FAISS_CACHE_PACKAGE_NAME}") @@ -779,21 +779,41 @@ macro(build_faiss) endif () endif () else () - externalproject_add(faiss_ep - URL - ${FAISS_SOURCE_URL} - ${EP_LOG_OPTIONS} - CONFIGURE_COMMAND - "./configure" - ${FAISS_CONFIGURE_ARGS} - BUILD_COMMAND - ${MAKE} ${MAKE_BUILD_ARGS} all - BUILD_IN_SOURCE - 1 - INSTALL_COMMAND - ${MAKE} install - BUILD_BYPRODUCTS - ${FAISS_STATIC_LIB}) + if (CUSTOMIZATION) + externalproject_add(faiss_ep + DOWNLOAD_COMMAND + "" + SOURCE_DIR + ${FAISS_SOURCE_DIR} + ${EP_LOG_OPTIONS} + CONFIGURE_COMMAND + "./configure" + ${FAISS_CONFIGURE_ARGS} + BUILD_COMMAND + ${MAKE} ${MAKE_BUILD_ARGS} all + BUILD_IN_SOURCE + 1 + INSTALL_COMMAND + ${MAKE} install + BUILD_BYPRODUCTS + ${FAISS_STATIC_LIB}) + else () + externalproject_add(faiss_ep + URL + ${FAISS_SOURCE_URL} + ${EP_LOG_OPTIONS} + CONFIGURE_COMMAND + "./configure" + ${FAISS_CONFIGURE_ARGS} + BUILD_COMMAND + ${MAKE} ${MAKE_BUILD_ARGS} all + BUILD_IN_SOURCE + 1 + INSTALL_COMMAND + ${MAKE} install + BUILD_BYPRODUCTS + ${FAISS_STATIC_LIB}) + endif () if (NOT FAISS_WITH_MKL) ExternalProject_Add_StepDependencies(faiss_ep build openblas_ep lapack_ep) diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.cpp index edf42abc8d..3982921b9a 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.cpp @@ -47,16 +47,16 @@ GPUIDMAP::CopyGpuToCpu(const Config& config) { return std::make_shared(new_index); } -VectorIndexPtr -GPUIDMAP::Clone() { - auto cpu_idx = CopyGpuToCpu(Config()); - - if (auto idmap = std::dynamic_pointer_cast(cpu_idx)) { - return idmap->CopyCpuToGpu(gpu_id_, Config()); - } else { - KNOWHERE_THROW_MSG("IndexType not Support GpuClone"); - } -} +// VectorIndexPtr +// GPUIDMAP::Clone() { +// auto cpu_idx = CopyGpuToCpu(Config()); +// +// if (auto idmap = std::dynamic_pointer_cast(cpu_idx)) { +// return idmap->CopyCpuToGpu(gpu_id_, Config()); +// } else { +// KNOWHERE_THROW_MSG("IndexType not Support GpuClone"); +// } +//} BinarySet GPUIDMAP::SerializeImpl() { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.h index b9325a9cc1..d538f2d0da 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIDMAP.h @@ -41,8 +41,8 @@ class GPUIDMAP : public IDMAP, public GPUIndex { int64_t* GetRawIds() override; - VectorIndexPtr - Clone() override; + // VectorIndexPtr + // Clone() override; VectorIndexPtr CopyGpuToGpu(const int64_t& device_id, const Config& config) override; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp index d69f87a061..923ca0db56 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.cpp @@ -158,11 +158,11 @@ GPUIVF::CopyGpuToCpu(const Config& config) { } } -VectorIndexPtr -GPUIVF::Clone() { - auto cpu_idx = CopyGpuToCpu(Config()); - return knowhere::cloner::CopyCpuToGpu(cpu_idx, gpu_id_, Config()); -} +// 
VectorIndexPtr +// GPUIVF::Clone() { +// auto cpu_idx = CopyGpuToCpu(Config()); +// return knowhere::cloner::CopyCpuToGpu(cpu_idx, gpu_id_, Config()); +//} VectorIndexPtr GPUIVF::CopyGpuToGpu(const int64_t& device_id, const Config& config) { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.h index fa9a206c48..a27f61d693 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexGPUIVF.h @@ -75,8 +75,8 @@ class GPUIVF : public IVF, public GPUIndex { VectorIndexPtr CopyGpuToGpu(const int64_t& device_id, const Config& config) override; - VectorIndexPtr - Clone() final; + // VectorIndexPtr + // Clone() final; protected: void diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp index 351209c10f..96cb76683a 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.cpp @@ -184,15 +184,15 @@ IDMAP::Train(const Config& config) { index_.reset(index); } -VectorIndexPtr -IDMAP::Clone() { - std::lock_guard lk(mutex_); - - auto clone_index = faiss::clone_index(index_.get()); - std::shared_ptr new_index; - new_index.reset(clone_index); - return std::make_shared(new_index); -} +// VectorIndexPtr +// IDMAP::Clone() { +// std::lock_guard lk(mutex_); +// +// auto clone_index = faiss::clone_index(index_.get()); +// std::shared_ptr new_index; +// new_index.reset(clone_index); +// return std::make_shared(new_index); +//} VectorIndexPtr IDMAP::CopyCpuToGpu(const int64_t& device_id, const Config& config) { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.h index 9f1369c7d3..8ae6839f65 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIDMAP.h @@ -47,8 +47,8 @@ class IDMAP : public VectorIndex, public FaissBaseIndex { int64_t Count() override; - VectorIndexPtr - Clone() override; + // VectorIndexPtr + // Clone() override; int64_t Dimension() override; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp index a74bddc94c..04baf42aa5 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.cpp @@ -256,20 +256,20 @@ IVF::CopyCpuToGpu(const int64_t& device_id, const Config& config) { #endif } -VectorIndexPtr -IVF::Clone() { - std::lock_guard lk(mutex_); - - auto clone_index = faiss::clone_index(index_.get()); - std::shared_ptr new_index; - new_index.reset(clone_index); - return Clone_impl(new_index); -} - -VectorIndexPtr -IVF::Clone_impl(const std::shared_ptr& index) { - return std::make_shared(index); -} +// VectorIndexPtr +// IVF::Clone() { +// std::lock_guard lk(mutex_); +// +// auto clone_index = faiss::clone_index(index_.get()); +// std::shared_ptr new_index; +// new_index.reset(clone_index); +// return Clone_impl(new_index); +//} +// +// VectorIndexPtr +// IVF::Clone_impl(const std::shared_ptr& index) { +// return std::make_shared(index); +//} void IVF::Seal() { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h 
b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h index 24b006a565..9742bea40b 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVF.h @@ -38,8 +38,8 @@ class IVF : public VectorIndex, public FaissBaseIndex { explicit IVF(std::shared_ptr index) : FaissBaseIndex(std::move(index)) { } - VectorIndexPtr - Clone() override; + // VectorIndexPtr + // Clone() override; IndexModelPtr Train(const DatasetPtr& dataset, const Config& config) override; @@ -81,8 +81,8 @@ class IVF : public VectorIndex, public FaissBaseIndex { virtual std::shared_ptr GenParams(const Config& config); - virtual VectorIndexPtr - Clone_impl(const std::shared_ptr& index); + // virtual VectorIndexPtr + // Clone_impl(const std::shared_ptr& index); virtual void search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg); diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.cpp index 841ef63245..5622e31a11 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.cpp @@ -63,10 +63,10 @@ IVFPQ::GenParams(const Config& config) { return params; } -VectorIndexPtr -IVFPQ::Clone_impl(const std::shared_ptr& index) { - return std::make_shared(index); -} +// VectorIndexPtr +// IVFPQ::Clone_impl(const std::shared_ptr& index) { +// return std::make_shared(index); +//} VectorIndexPtr IVFPQ::CopyCpuToGpu(const int64_t& device_id, const Config& config) { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.h index fc50c68389..2e3192e2ca 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFPQ.h @@ -41,8 +41,8 @@ class IVFPQ : public IVF { std::shared_ptr GenParams(const Config& config) override; - VectorIndexPtr - Clone_impl(const std::shared_ptr& index) override; + // VectorIndexPtr + // Clone_impl(const std::shared_ptr& index) override; }; } // namespace knowhere diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp index 7c86cd4dbd..273061d20d 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp @@ -54,10 +54,10 @@ IVFSQ::Train(const DatasetPtr& dataset, const Config& config) { return std::make_shared(ret_index); } -VectorIndexPtr -IVFSQ::Clone_impl(const std::shared_ptr& index) { - return std::make_shared(index); -} +// VectorIndexPtr +// IVFSQ::Clone_impl(const std::shared_ptr& index) { +// return std::make_shared(index); +//} VectorIndexPtr IVFSQ::CopyCpuToGpu(const int64_t& device_id, const Config& config) { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.h index cac95faebf..0dcb6555f6 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexIVFSQ.h @@ -38,8 +38,8 @@ class IVFSQ : public IVF { CopyCpuToGpu(const int64_t& device_id, const Config& config) override; protected: - VectorIndexPtr - Clone_impl(const std::shared_ptr& index) override; + // VectorIndexPtr + // 
Clone_impl(const std::shared_ptr& index) override; }; } // namespace knowhere diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 9571571945..661bde3bf3 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -20,12 +20,13 @@ #include "knowhere/common/Exception.h" #include "knowhere/common/Timer.h" #ifdef MILVUS_GPU_VERSION -#include "knowhere/index/vector_index/IndexGPUIVF.h" #include "knowhere/index/vector_index/IndexGPUIDMAP.h" +#include "knowhere/index/vector_index/IndexGPUIVF.h" #include "knowhere/index/vector_index/helpers/Cloner.h" #endif #include "knowhere/index/vector_index/IndexIVF.h" +#include "knowhere/index/vector_index/IndexIDMAP.h" #include "knowhere/index/vector_index/nsg/NSG.h" #include "knowhere/index/vector_index/nsg/NSGIO.h" @@ -118,23 +119,32 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { build_cfg->CheckValid(); // throw exception } - // TODO(linxj): dev IndexFactory, support more IndexType + auto idmap = std::make_shared(); + idmap->Train(config); + idmap->AddWithoutId(dataset, config); + Graph knng; + float* raw_data = idmap->GetRawVectors(); #ifdef MILVUS_GPU_VERSION -// auto preprocess_index = std::make_shared(build_cfg->gpu_id); + if (build_cfg->gpu_id == knowhere::INVALID_VALUE) { + auto preprocess_index = std::make_shared(); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->Add(dataset, config); + preprocess_index->GenGraph(raw_data, build_cfg->knng, knng, config); + } else { + // TODO(linxj): use ivf instead? + auto gpu_idx = cloner::CopyCpuToGpu(idmap, build_cfg->gpu_id, config); + auto gpu_idmap = std::dynamic_pointer_cast(gpu_idx); + gpu_idmap->GenGraph(raw_data, build_cfg->knng, knng, config); + } #else auto preprocess_index = std::make_shared(); + auto model = preprocess_index->Train(dataset, config); + preprocess_index->set_index_model(model); + preprocess_index->AddWithoutIds(dataset, config); + preprocess_index->GenGraph(raw_data, build_cfg->knng, knng, config); #endif - auto preprocess_index = std::make_shared(); - preprocess_index->Train(config); - preprocess_index->AddWithoutId(dataset, config); - float* raw_data = preprocess_index->GetRawVectors(); - auto xx = cloner::CopyCpuToGpu(preprocess_index, 0, config); - auto ss = std::dynamic_pointer_cast(xx); - Graph knng; - ss->GenGraph(raw_data, build_cfg->knng, knng, config); - - GETTENSOR(dataset) algo::BuildParams b_params; b_params.candidate_pool_size = build_cfg->candidate_pool_size; b_params.out_degree = build_cfg->out_degree; @@ -143,6 +153,7 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) { auto array = dataset->array()[0]; auto p_ids = array->data()->GetValues(1, 0); + GETTENSOR(dataset) index_ = std::make_shared(dim, rows); index_->SetKnnGraph(knng); index_->Build_with_ids(rows, (float*)p_data, (int64_t*)p_ids, b_params); @@ -164,10 +175,10 @@ NSG::Dimension() { return index_->dimension; } -VectorIndexPtr -NSG::Clone() { - KNOWHERE_THROW_MSG("not support"); -} +// VectorIndexPtr +// NSG::Clone() { +// KNOWHERE_THROW_MSG("not support"); +//} void NSG::Seal() { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.h index 04a146d58a..1af12b2ae1 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.h 
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.h @@ -49,8 +49,8 @@ class NSG : public VectorIndex { Count() override; int64_t Dimension() override; - VectorIndexPtr - Clone() override; + // VectorIndexPtr + // Clone() override; void Seal() override; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.cpp index 17a93fdcc7..9d1d693c14 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.cpp @@ -210,6 +210,9 @@ CPUSPTAGRNG::Load(const BinarySet& binary_set) { IndexModelPtr CPUSPTAGRNG::Train(const DatasetPtr& origin, const Config& train_config) { SetParameters(train_config); + if (train_config != nullptr) { + train_config->CheckValid(); // throw exception + } DatasetPtr dataset = origin->Clone(); // if (index_ptr_->GetDistCalcMethod() == SPTAG::DistCalcMethod::Cosine @@ -295,6 +298,9 @@ CPUSPTAGRNG::SetParameters(const Config& config) { DatasetPtr CPUSPTAGRNG::Search(const DatasetPtr& dataset, const Config& config) { SetParameters(config); + if (config != nullptr) { + config->CheckValid(); // throw exception + } auto tensor = dataset->tensor()[0]; auto p = (float*)tensor->raw_mutable_data(); for (auto i = 0; i < 10; ++i) { @@ -325,10 +331,10 @@ CPUSPTAGRNG::Dimension() { return index_ptr_->GetFeatureDim(); } -VectorIndexPtr -CPUSPTAGRNG::Clone() { - KNOWHERE_THROW_MSG("not support"); -} +// VectorIndexPtr +// CPUSPTAGRNG::Clone() { +// KNOWHERE_THROW_MSG("not support"); +//} void CPUSPTAGRNG::Seal() { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.h index 01380ce943..92942dd24f 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexSPTAG.h @@ -36,8 +36,8 @@ class CPUSPTAGRNG : public VectorIndex { BinarySet Serialize() override; - VectorIndexPtr - Clone() override; + // VectorIndexPtr + // Clone() override; void Load(const BinarySet& index_array) override; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h b/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h index 6509458b7b..6626d9d94d 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/VectorIndex.h @@ -49,8 +49,8 @@ class VectorIndex : public Index { Seal() = 0; // TODO(linxj): Deprecated - virtual VectorIndexPtr - Clone() = 0; + // virtual VectorIndexPtr + // Clone() = 0; virtual int64_t Count() = 0; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h b/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h index e30088ecdf..2a274d474f 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h @@ -180,10 +180,10 @@ struct SPTAGCfg : public Cfg { SPTAGCfg() = default; - bool - CheckValid() override { - return true; - }; + // bool + // CheckValid() override { + // return true; + // }; }; using SPTAGConfig = std::shared_ptr; diff --git a/core/src/index/thirdparty/faiss/.dockerignore b/core/src/index/thirdparty/faiss/.dockerignore new file mode 100644 index 0000000000..7763a51dc3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/.dockerignore @@ -0,0 +1 @@ 
+sift1M \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/.gitignore b/core/src/index/thirdparty/faiss/.gitignore new file mode 100644 index 0000000000..a25bc7f112 --- /dev/null +++ b/core/src/index/thirdparty/faiss/.gitignore @@ -0,0 +1,21 @@ +*.swp +*.swo +*.o +*.a +*.dSYM +*.so +*.dylib +*.pyc +*~ +.DS_Store +depend +/config.* +/aclocal.m4 +/autom4te.cache/ +/makefile.inc +/bin/ +/c_api/bin/ +/c_api/gpu/bin/ +/tests/test +/tests/gtest/ +include/ diff --git a/core/src/index/thirdparty/faiss/AutoTune.cpp b/core/src/index/thirdparty/faiss/AutoTune.cpp new file mode 100644 index 0000000000..a90a6f53ea --- /dev/null +++ b/core/src/index/thirdparty/faiss/AutoTune.cpp @@ -0,0 +1,719 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * implementation of Hyper-parameter auto-tuning + */ + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + + +AutoTuneCriterion::AutoTuneCriterion (idx_t nq, idx_t nnn): + nq (nq), nnn (nnn), gt_nnn (0) +{} + + +void AutoTuneCriterion::set_groundtruth ( + int gt_nnn, const float *gt_D_in, const idx_t *gt_I_in) +{ + this->gt_nnn = gt_nnn; + if (gt_D_in) { // allow null for this, as it is often not used + gt_D.resize (nq * gt_nnn); + memcpy (gt_D.data(), gt_D_in, sizeof (gt_D[0]) * nq * gt_nnn); + } + gt_I.resize (nq * gt_nnn); + memcpy (gt_I.data(), gt_I_in, sizeof (gt_I[0]) * nq * gt_nnn); +} + + + +OneRecallAtRCriterion::OneRecallAtRCriterion (idx_t nq, idx_t R): + AutoTuneCriterion(nq, R), R(R) +{} + +double OneRecallAtRCriterion::evaluate(const float* /*D*/, const idx_t* I) + const { + FAISS_THROW_IF_NOT_MSG( + (gt_I.size() == gt_nnn * nq && gt_nnn >= 1 && nnn >= R), + "ground truth not initialized"); + idx_t n_ok = 0; + for (idx_t q = 0; q < nq; q++) { + idx_t gt_nn = gt_I[q * gt_nnn]; + const idx_t* I_line = I + q * nnn; + for (int i = 0; i < R; i++) { + if (I_line[i] == gt_nn) { + n_ok++; + break; + } + } + } + return n_ok / double(nq); +} + + +IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R): + AutoTuneCriterion(nq, R), R(R) +{} + +double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I) + const { + FAISS_THROW_IF_NOT_MSG( + (gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R), + "ground truth not initialized"); + int64_t n_ok = 0; +#pragma omp parallel for reduction(+: n_ok) + for (idx_t q = 0; q < nq; q++) { + n_ok += ranklist_intersection_size ( + R, >_I [q * gt_nnn], + R, I + q * nnn); + } + return n_ok / double (nq * R); +} + +/*************************************************************** + * OperatingPoints + ***************************************************************/ + +OperatingPoints::OperatingPoints () +{ + clear(); +} + +void OperatingPoints::clear () +{ + all_pts.clear(); + optimal_pts.clear(); + /// default point: doing nothing gives 0 performance and takes 0 time + OperatingPoint op = {0, 0, "", -1}; + optimal_pts.push_back(op); +} + +/// add a performance measure +bool OperatingPoints::add (double perf, double t, const std::string & key, + size_t cno) +{ + OperatingPoint op = {perf, t, key, int64_t(cno)}; + all_pts.push_back (op); + if (perf == 0) { + return false; // no method for 0 accuracy is faster than doing nothing + } + 
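+    // from here on, optimal_pts is maintained as a Pareto frontier sorted by
+    // increasing perf; points that are both slower and no more accurate are erased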
std::vector & a = optimal_pts; + if (perf > a.back().perf) { + // keep unconditionally + a.push_back (op); + } else if (perf == a.back().perf) { + if (t < a.back ().t) { + a.back() = op; + } else { + return false; + } + } else { + int i; + // stricto sensu this should be a bissection + for (i = 0; i < a.size(); i++) { + if (a[i].perf >= perf) break; + } + assert (i < a.size()); + if (t < a[i].t) { + if (a[i].perf == perf) { + a[i] = op; + } else { + a.insert (a.begin() + i, op); + } + } else { + return false; + } + } + { // remove non-optimal points from array + int i = a.size() - 1; + while (i > 0) { + if (a[i].t < a[i - 1].t) + a.erase (a.begin() + (i - 1)); + i--; + } + } + return true; +} + + +int OperatingPoints::merge_with (const OperatingPoints &other, + const std::string & prefix) +{ + int n_add = 0; + for (int i = 0; i < other.all_pts.size(); i++) { + const OperatingPoint & op = other.all_pts[i]; + if (add (op.perf, op.t, prefix + op.key, op.cno)) + n_add++; + } + return n_add; +} + + + +/// get time required to obtain a given performance measure +double OperatingPoints::t_for_perf (double perf) const +{ + const std::vector & a = optimal_pts; + if (perf > a.back().perf) return 1e50; + int i0 = -1, i1 = a.size() - 1; + while (i0 + 1 < i1) { + int imed = (i0 + i1 + 1) / 2; + if (a[imed].perf < perf) i0 = imed; + else i1 = imed; + } + return a[i1].t; +} + + +void OperatingPoints::all_to_gnuplot (const char *fname) const +{ + FILE *f = fopen(fname, "w"); + if (!f) { + fprintf (stderr, "cannot open %s", fname); + perror(""); + abort(); + } + for (int i = 0; i < all_pts.size(); i++) { + const OperatingPoint & op = all_pts[i]; + fprintf (f, "%g %g %s\n", op.perf, op.t, op.key.c_str()); + } + fclose(f); +} + +void OperatingPoints::optimal_to_gnuplot (const char *fname) const +{ + FILE *f = fopen(fname, "w"); + if (!f) { + fprintf (stderr, "cannot open %s", fname); + perror(""); + abort(); + } + double prev_perf = 0.0; + for (int i = 0; i < optimal_pts.size(); i++) { + const OperatingPoint & op = optimal_pts[i]; + fprintf (f, "%g %g\n", prev_perf, op.t); + fprintf (f, "%g %g %s\n", op.perf, op.t, op.key.c_str()); + prev_perf = op.perf; + } + fclose(f); +} + +void OperatingPoints::display (bool only_optimal) const +{ + const std::vector &pts = + only_optimal ? 
optimal_pts : all_pts; + printf("Tested %ld operating points, %ld ones are optimal:\n", + all_pts.size(), optimal_pts.size()); + + for (int i = 0; i < pts.size(); i++) { + const OperatingPoint & op = pts[i]; + const char *star = ""; + if (!only_optimal) { + for (int j = 0; j < optimal_pts.size(); j++) { + if (op.cno == optimal_pts[j].cno) { + star = "*"; + break; + } + } + } + printf ("cno=%ld key=%s perf=%.4f t=%.3f %s\n", + op.cno, op.key.c_str(), op.perf, op.t, star); + } + +} + +/*************************************************************** + * ParameterSpace + ***************************************************************/ + +ParameterSpace::ParameterSpace (): + verbose (1), n_experiments (500), + batchsize (1<<30), thread_over_batches (false), + min_test_duration (0) +{ +} + +/* not keeping this constructor as inheritors will call the parent + initialize() + */ + +#if 0 +ParameterSpace::ParameterSpace (Index *index): + verbose (1), n_experiments (500), + batchsize (1<<30), thread_over_batches (false) + +{ + initialize(index); +} +#endif + +size_t ParameterSpace::n_combinations () const +{ + size_t n = 1; + for (int i = 0; i < parameter_ranges.size(); i++) + n *= parameter_ranges[i].values.size(); + return n; +} + +/// get string representation of the combination +std::string ParameterSpace::combination_name (size_t cno) const { + char buf[1000], *wp = buf; + *wp = 0; + for (int i = 0; i < parameter_ranges.size(); i++) { + const ParameterRange & pr = parameter_ranges[i]; + size_t j = cno % pr.values.size(); + cno /= pr.values.size(); + wp += snprintf ( + wp, buf + 1000 - wp, "%s%s=%g", i == 0 ? "" : ",", + pr.name.c_str(), pr.values[j]); + } + return std::string (buf); +} + + +bool ParameterSpace::combination_ge (size_t c1, size_t c2) const +{ + for (int i = 0; i < parameter_ranges.size(); i++) { + int nval = parameter_ranges[i].values.size(); + size_t j1 = c1 % nval; + size_t j2 = c2 % nval; + if (!(j1 >= j2)) return false; + c1 /= nval; + c2 /= nval; + } + return true; +} + + + +#define DC(classname) \ + const classname *ix = dynamic_cast(index) + +static void init_pq_ParameterRange (const ProductQuantizer & pq, + ParameterRange & pr) +{ + if (pq.code_size % 4 == 0) { + // Polysemous not supported for code sizes that are not a + // multiple of 4 + for (int i = 2; i <= pq.code_size * 8 / 2; i+= 2) + pr.values.push_back(i); + } + pr.values.push_back (pq.code_size * 8); +} + +ParameterRange &ParameterSpace::add_range(const char * name) +{ + for (auto & pr : parameter_ranges) { + if (pr.name == name) { + return pr; + } + } + parameter_ranges.push_back (ParameterRange ()); + parameter_ranges.back ().name = name; + return parameter_ranges.back (); +} + + +/// initialize with reasonable parameters for the index +void ParameterSpace::initialize (const Index * index) +{ + if (DC (IndexPreTransform)) { + index = ix->index; + } + if (DC (IndexRefineFlat)) { + ParameterRange & pr = add_range("k_factor_rf"); + for (int i = 0; i <= 6; i++) { + pr.values.push_back (1 << i); + } + index = ix->base_index; + } + if (DC (IndexPreTransform)) { + index = ix->index; + } + + if (DC (IndexIVF)) { + { + ParameterRange & pr = add_range("nprobe"); + for (int i = 0; i < 13; i++) { + size_t nprobe = 1 << i; + if (nprobe >= ix->nlist) break; + pr.values.push_back (nprobe); + } + } + if (dynamic_cast(ix->quantizer)) { + ParameterRange & pr = add_range("efSearch"); + for (int i = 2; i <= 9; i++) { + pr.values.push_back (1 << i); + } + } + } + if (DC (IndexPQ)) { + ParameterRange & pr = add_range("ht"); + 
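+        // candidate Hamming thresholds (ht) for polysemous filtering, derived from pq.code_size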
init_pq_ParameterRange (ix->pq, pr); + } + if (DC (IndexIVFPQ)) { + ParameterRange & pr = add_range("ht"); + init_pq_ParameterRange (ix->pq, pr); + } + + if (DC (IndexIVF)) { + const MultiIndexQuantizer *miq = + dynamic_cast (ix->quantizer); + if (miq) { + ParameterRange & pr_max_codes = add_range("max_codes"); + for (int i = 8; i < 20; i++) { + pr_max_codes.values.push_back (1 << i); + } + pr_max_codes.values.push_back ( + std::numeric_limits::infinity() + ); + } + } + if (DC (IndexIVFPQR)) { + ParameterRange & pr = add_range("k_factor"); + for (int i = 0; i <= 6; i++) { + pr.values.push_back (1 << i); + } + } + if (dynamic_cast(index)) { + ParameterRange & pr = add_range("efSearch"); + for (int i = 2; i <= 9; i++) { + pr.values.push_back (1 << i); + } + } +} + +#undef DC + +// non-const version +#define DC(classname) classname *ix = dynamic_cast(index) + + +/// set a combination of parameters on an index +void ParameterSpace::set_index_parameters (Index *index, size_t cno) const +{ + + for (int i = 0; i < parameter_ranges.size(); i++) { + const ParameterRange & pr = parameter_ranges[i]; + size_t j = cno % pr.values.size(); + cno /= pr.values.size(); + double val = pr.values [j]; + set_index_parameter (index, pr.name, val); + } +} + +/// set a combination of parameters on an index +void ParameterSpace::set_index_parameters ( + Index *index, const char *description_in) const +{ + char description[strlen(description_in) + 1]; + char *ptr; + memcpy (description, description_in, strlen(description_in) + 1); + + for (char *tok = strtok_r (description, " ,", &ptr); + tok; + tok = strtok_r (nullptr, " ,", &ptr)) { + char name[100]; + double val; + int ret = sscanf (tok, "%100[^=]=%lf", name, &val); + FAISS_THROW_IF_NOT_FMT ( + ret == 2, "could not interpret parameters %s", tok); + set_index_parameter (index, name, val); + } + +} + +void ParameterSpace::set_index_parameter ( + Index * index, const std::string & name, double val) const +{ + if (verbose > 1) + printf(" set %s=%g\n", name.c_str(), val); + + if (name == "verbose") { + index->verbose = int(val); + // and fall through to also enable it on sub-indexes + } + if (DC (IndexPreTransform)) { + set_index_parameter (ix->index, name, val); + return; + } + if (DC (IndexShards)) { + // call on all sub-indexes + auto fn = + [this, name, val](int, Index* subIndex) { + set_index_parameter(subIndex, name, val); + }; + + ix->runOnIndex(fn); + return; + } + if (DC (IndexReplicas)) { + // call on all sub-indexes + auto fn = + [this, name, val](int, Index* subIndex) { + set_index_parameter(subIndex, name, val); + }; + + ix->runOnIndex(fn); + return; + } + if (DC (IndexRefineFlat)) { + if (name == "k_factor_rf") { + ix->k_factor = int(val); + return; + } + // otherwise it is for the sub-index + set_index_parameter (&ix->refine_index, name, val); + return; + } + + if (name == "verbose") { + index->verbose = int(val); + return; // last verbose that we could find + } + + if (name == "nprobe") { + if (DC (IndexIDMap)) { + set_index_parameter (ix->index, name, val); + return; + } else if (DC (IndexIVF)) { + ix->nprobe = int(val); + return; + } + } + + if (name == "ht") { + if (DC (IndexPQ)) { + if (val >= ix->pq.code_size * 8) { + ix->search_type = IndexPQ::ST_PQ; + } else { + ix->search_type = IndexPQ::ST_polysemous; + ix->polysemous_ht = int(val); + } + return; + } else if (DC (IndexIVFPQ)) { + if (val >= ix->pq.code_size * 8) { + ix->polysemous_ht = 0; + } else { + ix->polysemous_ht = int(val); + } + return; + } + } + + if (name == "k_factor") { + if (DC 
(IndexIVFPQR)) { + ix->k_factor = val; + return; + } + } + if (name == "max_codes") { + if (DC (IndexIVF)) { + ix->max_codes = std::isfinite(val) ? size_t(val) : 0; + return; + } + } + + if (name == "efSearch") { + if (DC (IndexHNSW)) { + ix->hnsw.efSearch = int(val); + return; + } + if (DC (IndexIVF)) { + if (IndexHNSW *cq = + dynamic_cast(ix->quantizer)) { + cq->hnsw.efSearch = int(val); + return; + } + } + } + + FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:" + "could not set parameter %s", + name.c_str()); +} + +void ParameterSpace::display () const +{ + printf ("ParameterSpace, %ld parameters, %ld combinations:\n", + parameter_ranges.size (), n_combinations ()); + for (int i = 0; i < parameter_ranges.size(); i++) { + const ParameterRange & pr = parameter_ranges[i]; + printf (" %s: ", pr.name.c_str ()); + char sep = '['; + for (int j = 0; j < pr.values.size(); j++) { + printf ("%c %g", sep, pr.values [j]); + sep = ','; + } + printf ("]\n"); + } +} + + + +void ParameterSpace::update_bounds (size_t cno, const OperatingPoint & op, + double *upper_bound_perf, + double *lower_bound_t) const +{ + if (combination_ge (cno, op.cno)) { + if (op.t > *lower_bound_t) *lower_bound_t = op.t; + } + if (combination_ge (op.cno, cno)) { + if (op.perf < *upper_bound_perf) *upper_bound_perf = op.perf; + } +} + + + +void ParameterSpace::explore (Index *index, + size_t nq, const float *xq, + const AutoTuneCriterion & crit, + OperatingPoints * ops) const +{ + FAISS_THROW_IF_NOT_MSG (nq == crit.nq, + "criterion does not have the same nb of queries"); + + size_t n_comb = n_combinations (); + + if (n_experiments == 0) { + + for (size_t cno = 0; cno < n_comb; cno++) { + set_index_parameters (index, cno); + std::vector I(nq * crit.nnn); + std::vector D(nq * crit.nnn); + + double t0 = getmillisecs (); + index->search (nq, xq, crit.nnn, D.data(), I.data()); + double t_search = (getmillisecs() - t0) / 1e3; + + double perf = crit.evaluate (D.data(), I.data()); + + bool keep = ops->add (perf, t_search, combination_name (cno), cno); + + if (verbose) + printf(" %ld/%ld: %s perf=%.3f t=%.3f s %s\n", cno, n_comb, + combination_name (cno).c_str(), perf, t_search, + keep ? "*" : ""); + } + return; + } + + int n_exp = n_experiments; + + if (n_exp > n_comb) n_exp = n_comb; + FAISS_THROW_IF_NOT (n_comb == 1 || n_exp > 2); + std::vector perm (n_comb); + // make sure the slowest and fastest experiment are run + perm[0] = 0; + if (n_comb > 1) { + perm[1] = n_comb - 1; + rand_perm (&perm[2], n_comb - 2, 1234); + for (int i = 2; i < perm.size(); i++) perm[i] ++; + } + + for (size_t xp = 0; xp < n_exp; xp++) { + size_t cno = perm[xp]; + + if (verbose) + printf(" %ld/%d: cno=%ld %s ", xp, n_exp, cno, + combination_name (cno).c_str()); + + { + double lower_bound_t = 0.0; + double upper_bound_perf = 1.0; + for (int i = 0; i < ops->all_pts.size(); i++) { + update_bounds (cno, ops->all_pts[i], + &upper_bound_perf, &lower_bound_t); + } + double best_t = ops->t_for_perf (upper_bound_perf); + if (verbose) + printf ("bounds [perf<=%.3f t>=%.3f] %s", + upper_bound_perf, lower_bound_t, + best_t <= lower_bound_t ? 
"skip\n" : ""); + if (best_t <= lower_bound_t) continue; + } + + set_index_parameters (index, cno); + std::vector I(nq * crit.nnn); + std::vector D(nq * crit.nnn); + + double t0 = getmillisecs (); + + int nrun = 0; + double t_search; + + do { + + if (thread_over_batches) { +#pragma omp parallel for + for (size_t q0 = 0; q0 < nq; q0 += batchsize) { + size_t q1 = q0 + batchsize; + if (q1 > nq) q1 = nq; + index->search (q1 - q0, xq + q0 * index->d, + crit.nnn, + D.data() + q0 * crit.nnn, + I.data() + q0 * crit.nnn); + } + } else { + for (size_t q0 = 0; q0 < nq; q0 += batchsize) { + size_t q1 = q0 + batchsize; + if (q1 > nq) q1 = nq; + index->search (q1 - q0, xq + q0 * index->d, + crit.nnn, + D.data() + q0 * crit.nnn, + I.data() + q0 * crit.nnn); + } + } + nrun ++; + t_search = (getmillisecs() - t0) / 1e3; + + } while (t_search < min_test_duration); + + t_search /= nrun; + + double perf = crit.evaluate (D.data(), I.data()); + + bool keep = ops->add (perf, t_search, combination_name (cno), cno); + + if (verbose) + printf(" perf %.3f t %.3f (%d runs) %s\n", + perf, t_search, nrun, + keep ? "*" : ""); + } +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/AutoTune.h b/core/src/index/thirdparty/faiss/AutoTune.h new file mode 100644 index 0000000000..d7eff14e64 --- /dev/null +++ b/core/src/index/thirdparty/faiss/AutoTune.h @@ -0,0 +1,212 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_AUTO_TUNE_H +#define FAISS_AUTO_TUNE_H + +#include +#include +#include + +#include +#include + +namespace faiss { + + +/** + * Evaluation criterion. Returns a performance measure in [0,1], + * higher is better. + */ +struct AutoTuneCriterion { + typedef Index::idx_t idx_t; + idx_t nq; ///< nb of queries this criterion is evaluated on + idx_t nnn; ///< nb of NNs that the query should request + idx_t gt_nnn; ///< nb of GT NNs required to evaluate crterion + + std::vector gt_D; ///< Ground-truth distances (size nq * gt_nnn) + std::vector gt_I; ///< Ground-truth indexes (size nq * gt_nnn) + + AutoTuneCriterion (idx_t nq, idx_t nnn); + + /** Intitializes the gt_D and gt_I vectors. Must be called before evaluating + * + * @param gt_D_in size nq * gt_nnn + * @param gt_I_in size nq * gt_nnn + */ + void set_groundtruth (int gt_nnn, const float *gt_D_in, + const idx_t *gt_I_in); + + /** Evaluate the criterion. + * + * @param D size nq * nnn + * @param I size nq * nnn + * @return the criterion, between 0 and 1. Larger is better. + */ + virtual double evaluate (const float *D, const idx_t *I) const = 0; + + virtual ~AutoTuneCriterion () {} + +}; + +struct OneRecallAtRCriterion: AutoTuneCriterion { + + idx_t R; + + OneRecallAtRCriterion (idx_t nq, idx_t R); + + double evaluate(const float* D, const idx_t* I) const override; + + ~OneRecallAtRCriterion() override {} +}; + + +struct IntersectionCriterion: AutoTuneCriterion { + + idx_t R; + + IntersectionCriterion (idx_t nq, idx_t R); + + double evaluate(const float* D, const idx_t* I) const override; + + ~IntersectionCriterion() override {} +}; + +/** + * Maintains a list of experimental results. Each operating point is a + * (perf, t, key) triplet, where higher perf and lower t is + * better. 
The key field is an arbitrary identifier for the operating point + */ + +struct OperatingPoint { + double perf; ///< performance measure (output of a Criterion) + double t; ///< corresponding execution time (ms) + std::string key; ///< key that identifies this op pt + int64_t cno; ///< integer identifer +}; + +struct OperatingPoints { + /// all operating points + std::vector all_pts; + + /// optimal operating points, sorted by perf + std::vector optimal_pts; + + // begins with a single operating point: t=0, perf=0 + OperatingPoints (); + + /// add operating points from other to this, with a prefix to the keys + int merge_with (const OperatingPoints &other, + const std::string & prefix = ""); + + void clear (); + + /// add a performance measure. Return whether it is an optimal point + bool add (double perf, double t, const std::string & key, size_t cno = 0); + + /// get time required to obtain a given performance measure + double t_for_perf (double perf) const; + + /// easy-to-read output + void display (bool only_optimal = true) const; + + /// output to a format easy to digest by gnuplot + void all_to_gnuplot (const char *fname) const; + void optimal_to_gnuplot (const char *fname) const; + +}; + +/// possible values of a parameter, sorted from least to most expensive/accurate +struct ParameterRange { + std::string name; + std::vector values; +}; + +/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters. + */ +struct ParameterSpace { + /// all tunable parameters + std::vector parameter_ranges; + + // exploration parameters + + /// verbosity during exploration + int verbose; + + /// nb of experiments during optimization (0 = try all combinations) + int n_experiments; + + /// maximum number of queries to submit at a time. + size_t batchsize; + + /// use multithreading over batches (useful to benchmark + /// independent single-searches) + bool thread_over_batches; + + /// run tests several times until they reach at least this + /// duration (to avoid jittering in MT mode) + double min_test_duration; + + ParameterSpace (); + + /// nb of combinations, = product of values sizes + size_t n_combinations () const; + + /// returns whether combinations c1 >= c2 in the tuple sense + bool combination_ge (size_t c1, size_t c2) const; + + /// get string representation of the combination + std::string combination_name (size_t cno) const; + + /// print a description on stdout + void display () const; + + /// add a new parameter (or return it if it exists) + ParameterRange &add_range(const char * name); + + /// initialize with reasonable parameters for the index + virtual void initialize (const Index * index); + + /// set a combination of parameters on an index + void set_index_parameters (Index *index, size_t cno) const; + + /// set a combination of parameters described by a string + void set_index_parameters (Index *index, const char *param_string) const; + + /// set one of the parameters + virtual void set_index_parameter ( + Index * index, const std::string & name, double val) const; + + /** find an upper bound on the performance and a lower bound on t + * for configuration cno given another operating point op */ + void update_bounds (size_t cno, const OperatingPoint & op, + double *upper_bound_perf, + double *lower_bound_t) const; + + /** explore operating points + * @param index index to run on + * @param xq query vectors (size nq * index.d) + * @param crit selection criterion + * @param ops resulting operating points + */ + void explore (Index *index, + size_t nq, const float 
*xq, + const AutoTuneCriterion & crit, + OperatingPoints * ops) const; + + virtual ~ParameterSpace () {} +}; + + + +} // namespace faiss + + + +#endif diff --git a/core/src/index/thirdparty/faiss/CODE_OF_CONDUCT.md b/core/src/index/thirdparty/faiss/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..ac27d8a51b --- /dev/null +++ b/core/src/index/thirdparty/faiss/CODE_OF_CONDUCT.md @@ -0,0 +1,2 @@ +# Code of Conduct +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated. \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/CONTRIBUTING.md b/core/src/index/thirdparty/faiss/CONTRIBUTING.md new file mode 100644 index 0000000000..a93141be47 --- /dev/null +++ b/core/src/index/thirdparty/faiss/CONTRIBUTING.md @@ -0,0 +1,53 @@ +# Contributing to Faiss + +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process + +We mainly develop Faiss within Facebook. Sometimes, we will sync the +github version of Faiss with the internal state. + +## Pull Requests + +We welcome pull requests that add significant value to Faiss. If you plan to do +a major development and contribute it back to Faiss, please contact us first before +putting too much effort into it. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +There is a Facebook internal test suite for Faiss, and we need to run +all changes to Faiss through it. + +## Contributor License Agreement ("CLA") + +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues + +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style + +* 4 or 2 spaces for indentation in C++ (no tabs) +* 80 character line length (both for C++ and Python) +* C++ language level: C++11 + +## License + +By contributing to Faiss, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. + diff --git a/core/src/index/thirdparty/faiss/Clustering.cpp b/core/src/index/thirdparty/faiss/Clustering.cpp new file mode 100644 index 0000000000..6864b98e26 --- /dev/null +++ b/core/src/index/thirdparty/faiss/Clustering.cpp @@ -0,0 +1,261 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include +#include + + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +ClusteringParameters::ClusteringParameters (): + niter(25), + nredo(1), + verbose(false), + spherical(false), + int_centroids(false), + update_index(false), + frozen_centroids(false), + min_points_per_centroid(39), + max_points_per_centroid(256), + seed(1234) +{} +// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k + + +Clustering::Clustering (int d, int k): + d(d), k(k) {} + +Clustering::Clustering (int d, int k, const ClusteringParameters &cp): + ClusteringParameters (cp), d(d), k(k) {} + + + +static double imbalance_factor (int n, int k, int64_t *assign) { + std::vector hist(k, 0); + for (int i = 0; i < n; i++) + hist[assign[i]]++; + + double tot = 0, uf = 0; + + for (int i = 0 ; i < k ; i++) { + tot += hist[i]; + uf += hist[i] * (double) hist[i]; + } + uf = uf * k / (tot * tot); + + return uf; +} + +void Clustering::post_process_centroids () +{ + + if (spherical) { + fvec_renorm_L2 (d, k, centroids.data()); + } + + if (int_centroids) { + for (size_t i = 0; i < centroids.size(); i++) + centroids[i] = roundf (centroids[i]); + } +} + + +void Clustering::train (idx_t nx, const float *x_in, Index & index) { + FAISS_THROW_IF_NOT_FMT (nx >= k, + "Number of training points (%ld) should be at least " + "as large as number of clusters (%ld)", nx, k); + + double t0 = getmillisecs(); + + // yes it is the user's responsibility, but it may spare us some + // hard-to-debug reports. + for (size_t i = 0; i < nx * d; i++) { + FAISS_THROW_IF_NOT_MSG (finite (x_in[i]), + "input contains NaN's or Inf's"); + } + + const float *x = x_in; + ScopeDeleter del1; + + if (nx > k * max_points_per_centroid) { + if (verbose) + printf("Sampling a subset of %ld / %ld for training\n", + k * max_points_per_centroid, nx); + std::vector perm (nx); + rand_perm (perm.data (), nx, seed); + nx = k * max_points_per_centroid; + float * x_new = new float [nx * d]; + for (idx_t i = 0; i < nx; i++) + memcpy (x_new + i * d, x + perm[i] * d, sizeof(x_new[0]) * d); + x = x_new; + del1.set (x); + } else if (nx < k * min_points_per_centroid) { + fprintf (stderr, + "WARNING clustering %ld points to %ld centroids: " + "please provide at least %ld training points\n", + nx, k, idx_t(k) * min_points_per_centroid); + } + + + if (nx == k) { + if (verbose) { + printf("Number of training points (%ld) same as number of " + "clusters, just copying\n", nx); + } + // this is a corner case, just copy training set to clusters + centroids.resize (d * k); + memcpy (centroids.data(), x_in, sizeof (*x_in) * d * k); + index.reset(); + index.add(k, x_in); + return; + } + + + if (verbose) + printf("Clustering %d points in %ldD to %ld clusters, " + "redo %d times, %d iterations\n", + int(nx), d, k, nredo, niter); + + idx_t * assign = new idx_t[nx]; + ScopeDeleter del (assign); + float * dis = new float[nx]; + ScopeDeleter del2(dis); + + // for redo + float best_err = HUGE_VALF; + std::vector best_obj; + std::vector best_centroids; + + // support input centroids + + FAISS_THROW_IF_NOT_MSG ( + centroids.size() % d == 0, + "size of provided input centroids not a multiple of dimension"); + + size_t n_input_centroids = centroids.size() / d; + + if (verbose && n_input_centroids > 0) { + printf (" Using %zd centroids provided as input (%sfrozen)\n", + n_input_centroids, frozen_centroids ? 
"" : "not "); + } + + double t_search_tot = 0; + if (verbose) { + printf(" Preprocessing in %.2f s\n", + (getmillisecs() - t0) / 1000.); + } + t0 = getmillisecs(); + + for (int redo = 0; redo < nredo; redo++) { + + if (verbose && nredo > 1) { + printf("Outer iteration %d / %d\n", redo, nredo); + } + + // initialize remaining centroids with random points from the dataset + centroids.resize (d * k); + std::vector perm (nx); + + rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L); + for (int i = n_input_centroids; i < k ; i++) + memcpy (¢roids[i * d], x + perm[i] * d, + d * sizeof (float)); + + post_process_centroids (); + + if (index.ntotal != 0) { + index.reset(); + } + + if (!index.is_trained) { + index.train (k, centroids.data()); + } + + index.add (k, centroids.data()); + float err = 0; + for (int i = 0; i < niter; i++) { + double t0s = getmillisecs(); + index.search (nx, x, 1, dis, assign); + InterruptCallback::check(); + t_search_tot += getmillisecs() - t0s; + + err = 0; + for (int j = 0; j < nx; j++) + err += dis[j]; + obj.push_back (err); + + int nsplit = km_update_centroids ( + x, centroids.data(), + assign, d, k, nx, frozen_centroids ? n_input_centroids : 0); + + if (verbose) { + printf (" Iteration %d (%.2f s, search %.2f s): " + "objective=%g imbalance=%.3f nsplit=%d \r", + i, (getmillisecs() - t0) / 1000.0, + t_search_tot / 1000, + err, imbalance_factor (nx, k, assign), + nsplit); + fflush (stdout); + } + + post_process_centroids (); + + index.reset (); + if (update_index) + index.train (k, centroids.data()); + + assert (index.ntotal == 0); + index.add (k, centroids.data()); + InterruptCallback::check (); + } + if (verbose) printf("\n"); + if (nredo > 1) { + if (err < best_err) { + if (verbose) + printf ("Objective improved: keep new clusters\n"); + best_centroids = centroids; + best_obj = obj; + best_err = err; + } + index.reset (); + } + } + if (nredo > 1) { + centroids = best_centroids; + obj = best_obj; + index.reset(); + index.add(k, best_centroids.data()); + } + +} + +float kmeans_clustering (size_t d, size_t n, size_t k, + const float *x, + float *centroids) +{ + Clustering clus (d, k); + clus.verbose = d * n * k > (1L << 30); + // display logs if > 1Gflop per iteration + IndexFlatL2 index (d); + clus.train (n, x, index); + memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k); + return clus.obj.back(); +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/Clustering.h b/core/src/index/thirdparty/faiss/Clustering.h new file mode 100644 index 0000000000..fd51ef599b --- /dev/null +++ b/core/src/index/thirdparty/faiss/Clustering.h @@ -0,0 +1,101 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_CLUSTERING_H +#define FAISS_CLUSTERING_H +#include + +#include + +namespace faiss { + + +/** Class for the clustering parameters. Can be passed to the + * constructor of the Clustering object. + */ +struct ClusteringParameters { + int niter; ///< clustering iterations + int nredo; ///< redo clustering this many times and keep best + + bool verbose; + bool spherical; ///< do we want normalized centroids? + bool int_centroids; ///< round centroids coordinates to integer + bool update_index; ///< update index after each iteration? 
+ bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations + + int min_points_per_centroid; ///< otherwise you get a warning + int max_points_per_centroid; ///< to limit size of dataset + + int seed; ///< seed for the random number generator + + /// sets reasonable defaults + ClusteringParameters (); +}; + + +/** clustering based on assignment - centroid update iterations + * + * The clustering is based on an Index object that assigns training + * points to the centroids. Therefore, at each iteration the centroids + * are added to the index. + * + * On output, the centoids table is set to the latest version + * of the centroids and they are also added to the index. If the + * centroids table it is not empty on input, it is also used for + * initialization. + * + * To do several clusterings, just call train() several times on + * different training sets, clearing the centroid table in between. + */ +struct Clustering: ClusteringParameters { + typedef Index::idx_t idx_t; + size_t d; ///< dimension of the vectors + size_t k; ///< nb of centroids + + /// centroids (k * d) + std::vector centroids; + + /// objective values (sum of distances reported by index) over + /// iterations + std::vector obj; + + /// the only mandatory parameters are k and d + Clustering (int d, int k); + Clustering (int d, int k, const ClusteringParameters &cp); + + /// Index is used during the assignment stage + virtual void train (idx_t n, const float * x, faiss::Index & index); + + /// Post-process the centroids after each centroid update. + /// includes optional L2 normalization and nearest integer rounding + void post_process_centroids (); + + virtual ~Clustering() {} +}; + + +/** simplified interface + * + * @param d dimension of the data + * @param n nb of training vectors + * @param k nb of output centroids + * @param x training set (size n * d) + * @param centroids output centroids (size k * d) + * @return final quantization error + */ +float kmeans_clustering (size_t d, size_t n, size_t k, + const float *x, + float *centroids); + + + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/Dockerfile b/core/src/index/thirdparty/faiss/Dockerfile new file mode 100644 index 0000000000..418fb2929e --- /dev/null +++ b/core/src/index/thirdparty/faiss/Dockerfile @@ -0,0 +1,29 @@ +FROM nvidia/cuda:8.0-devel-centos7 + +# Install MKL +RUN yum-config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo +RUN rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB +RUN yum install -y intel-mkl-2019.3-062 +ENV LD_LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH +ENV LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LIBRARY_PATH +ENV LD_PRELOAD /usr/lib64/libgomp.so.1:/opt/intel/mkl/lib/intel64/libmkl_def.so:\ +/opt/intel/mkl/lib/intel64/libmkl_avx2.so:/opt/intel/mkl/lib/intel64/libmkl_core.so:\ +/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.so:/opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so + +# Install necessary build tools +RUN yum install -y gcc-c++ make swig3 + +# Install necesary headers/libs +RUN yum install -y python-devel numpy + +COPY . 
/opt/faiss + +WORKDIR /opt/faiss + +# --with-cuda=/usr/local/cuda-8.0 +RUN ./configure --prefix=/usr --libdir=/usr/lib64 --without-cuda +RUN make -j $(nproc) +RUN make -C python +RUN make test +RUN make install +RUN make -C demos demo_ivfpq_indexing && ./demos/demo_ivfpq_indexing diff --git a/core/src/index/thirdparty/faiss/INSTALL.md b/core/src/index/thirdparty/faiss/INSTALL.md new file mode 100644 index 0000000000..01f29e46e1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/INSTALL.md @@ -0,0 +1,353 @@ +[//]: # "**********************************************************" +[//]: # "** INSTALL file for Faiss (Fair AI Similarity Search **" +[//]: # "**********************************************************" + +INSTALL file for Faiss (Fair AI Similarity Search) +================================================== + +Install via Conda +----------------- + +The easiest way to install FAISS is from Anaconda. We regularly push stable releases to the pytorch conda channel. + +Currently we support faiss-cpu both on Linux and OSX. We also provide faiss-gpu compiled with CUDA8/CUDA9/CUDA10 on Linux systems. + +You can easily install it by + +``` +# CPU version only +conda install faiss-cpu -c pytorch + +# GPU version +conda install faiss-gpu cudatoolkit=8.0 -c pytorch # For CUDA8 +conda install faiss-gpu cudatoolkit=9.0 -c pytorch # For CUDA9 +conda install faiss-gpu cudatoolkit=10.0 -c pytorch # For CUDA10 +``` + +Compile from source +------------------- + +The Faiss compilation works in 2 steps: + +1. compile the C++ core and examples + +2. compile the Python interface + +Steps 2 depends on 1. + +It is also possible to build a pure C interface. This optional process is +described separately (please see the [C interface installation file](c_api/INSTALL.md)) + +General compilation instructions +================================ + +TL;DR: `./configure && make (&& make install)` for the C++ library, and then `cd python; make && make install` for the python interface. + +1. `./configure` + +This generates the system-dependent configuration for the `Makefile`, stored in +a file called `makefile.inc`. + +A few useful options: +- `./configure --without-cuda` in order to build the CPU part only. +- `./configure --with-cuda=/path/to/cuda-10.1` in order to hint to the path of +the cudatoolkit. +- `./configure --with-cuda-arch="-gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_72,code=sm_72"` for specifying which GPU architectures to build against. +- `./configure --with-python=/path/to/python3.7` in order to build a python +interface for a different python than the default one. +- `LDFLAGS=-L/path_to_mkl/lib/ ./configure` so that configure detects the MKL BLAS imeplementation. Note that this may require to set the LD_LIBRARY_PATH at runtime. + +2. `make` + +This builds the C++ library (the whole library if a suitable cuda toolkit was +found, or the CPU part only otherwise). + +3. `make install` (optional) + +This installs the headers and libraries. + +4. `make -C python` (or `make py`) + +This builds the python interface. + +5. `make -C python install` + +This installs the python library. + + +Faiss has been tested only on x86_64 machines on Linux and Mac OS. + +Faiss requires a C++ compiler that understands: +- the Intel intrinsics for SSE instructions, +- the GCC intrinsic for the popcount instruction, +- basic OpenMP. + +There are a few examples for makefile.inc in the example_makefiles/ +subdirectory. 
+There are a few examples for makefile.inc in the example_makefiles/
+subdirectory. There are also indications for specific configurations in the
+troubleshooting section of the wiki.
+
+https://github.com/facebookresearch/faiss/wiki/Troubleshooting
+
+Faiss comes as a .a archive that can be linked with executables or
+dynamic libraries (useful for the Python wrapper).
+
+
+BLAS/Lapack
+-----------
+
+The only variables that need to be configured for the C++ Faiss are
+the BLAS/Lapack flags (a linear algebra software package). It needs a
+flag telling whether BLAS/Lapack uses 32 or 64 bit integers and the
+linking flags. Faiss uses the Fortran 77 interface of BLAS/Lapack and
+thus does not need an include path.
+
+There are several BLAS implementations, depending on the OS and
+machine. To have reasonable performance, the BLAS library should be
+multithreaded. See the example makefile.inc's for hints and examples
+on how to set the flags, or simply run the configure script:
+
+  `./configure`
+
+To check that the link flags are correct, and verify whether the
+implementation uses 32 or 64 bit integers, you can
+
+  `make misc/test_blas`
+
+and run
+
+  `./misc/test_blas`
+
+
+Testing Faiss
+-------------
+
+A basic usage example is in
+
+  `demos/demo_ivfpq_indexing`
+
+which you can build by calling
+  `make -C demos demo_ivfpq_indexing`
+
+It makes a small index, stores it and performs some searches. A normal
+runtime is around 20s. With a fast machine and Intel MKL's BLAS it
+runs in 2.5s.
+
+To run the whole test suite:
+
+  `make test` (for the CPU part)
+
+  `make test_gpu` (for the GPU part)
+
+
+A real-life benchmark
+---------------------
+
+A slightly longer example runs and evaluates Faiss on the SIFT1M
+dataset. To run it, please download the ANN_SIFT1M dataset from
+
+http://corpus-texmex.irisa.fr/
+
+and unzip it to the subdirectory `sift1M` at the root of the source
+directory for this repository.
+
+Then compile and run the following (after ensuring you have installed faiss):
+
+```
+make demos
+./demos/demo_sift1M
+```
+
+This is a demonstration of the high-level auto-tuning API. You can try
+setting a different index_key to find the indexing structure that
+gives the best performance.
+
+
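For readers unfamiliar with index keys, the sketch below shows the kind of objects the auto-tuning API manipulates in C++: an index built from a factory string and a runtime parameter set through `ParameterSpace`. It is an illustrative assumption, not part of the demo; the `"IVF256,Flat"` key, the include path and the random data are placeholders.

```c++
// Hypothetical sketch: build an index from a factory key and tune a
// runtime parameter, as the auto-tuning demos do on a larger scale.
#include <faiss/AutoTune.h>   // index_factory and ParameterSpace (assumed path)

#include <cstdlib>
#include <memory>
#include <vector>

int main() {
    int d = 64;                       // vector dimension
    faiss::Index::idx_t nb = 100000;  // database size

    std::vector<float> xb(nb * d);    // random stand-in for real data
    for (size_t i = 0; i < xb.size(); i++) {
        xb[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // "IVF256,Flat" is one possible index_key; demo_sift1M tries several
    std::unique_ptr<faiss::Index> index(
        faiss::index_factory(d, "IVF256,Flat"));
    index->train(nb, xb.data());
    index->add(nb, xb.data());

    // runtime parameters such as nprobe are what the auto-tuner explores
    faiss::ParameterSpace ps;
    ps.set_index_parameter(index.get(), "nprobe", 16);
    return 0;
}
```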
+The Python interface
+======================================
+
+The Python interface is compiled with
+
+  `make -C python` (or `make py`)
+
+How it works
+------------
+
+The Python interface is provided via SWIG (Simplified Wrapper and
+Interface Generator) and an additional level of manual wrappers (in python/faiss.py).
+
+SWIG generates two wrapper files: a Python file (`python/swigfaiss.py`) and a
+C++ file that must be compiled to a dynamic library (`python/_swigfaiss.so`).
+
+Testing the Python wrapper
+--------------------------
+
+Often, a successful compile does not mean that the library works,
+because missing symbols are detected only at runtime. You should be
+able to load the Faiss dynamic library:
+
+  `python -c "import faiss"`
+
+In case of failure, it reports the first missing symbol. To see all
+missing symbols (on Linux), use
+
+  `ldd -r _swigfaiss.so`
+
+Sometimes, problems (e.g. with BLAS libraries) appear only when actually
+calling a BLAS function. A simple way to check this is
+
+```python
+python -c "import faiss, numpy
+faiss.Kmeans(10, 20).train(numpy.random.rand(1000, 10).astype('float32'))"
+```
+
+
+Real-life test
+--------------
+
+The following script extends the demo_sift1M test to several types of
+indexes. This must be run from the root of the source directory for this
+repository:
+
+```
+mkdir tmp # graphs of the output will be written here
+PYTHONPATH=. python demos/demo_auto_tune.py
+```
+
+It will cycle through a few types of indexes and find optimal
+operating points. You can play around with the types of indexes.
+
+
+Step 3: Compiling the GPU implementation
+========================================
+
+The GPU version is a superset of the CPU version. In addition it
+requires the cuda compiler and related libraries (cuBLAS).
+
+The nvcc-specific flags to pass to the compiler, based on your desired
+compute capability, can be customized by providing the `--with-cuda-arch` option to
+`./configure`. Only compute capability 3.5+ is supported. For example, we enable
+by default:
+
+```
+-gencode=arch=compute_35,code=compute_35
+-gencode=arch=compute_52,code=compute_52
+-gencode=arch=compute_60,code=compute_60
+-gencode=arch=compute_61,code=compute_61
+-gencode=arch=compute_70,code=compute_70
+-gencode=arch=compute_75,code=compute_75
+```
+
+However, look at https://developer.nvidia.com/cuda-gpus to determine
+what compute capability you need to use, and replace our gencode
+specifications with the one(s) you need.
+
+Most other flags are related to the C++11 compiler used by nvcc to
+compile the actual C++ code. They are normally just transmitted by
+nvcc, except for some that are not recognized and that should be
+escaped by prefixing them with -Xcompiler. Also, link flags that are
+prefixed with -Wl, should be passed with -Xlinker.
+
+You may want to add `-j 10` to use 10 threads during compilation.
+
+Testing the GPU implementation
+------------------------------
+
+Compile the example with
+
+  `make -C gpu/test demo_ivfpq_indexing_gpu`
+
+This produces the GPU code equivalent to the CPU
+demo_ivfpq_indexing. It also shows how to translate indexes from/to
+the GPU.
+
+
+Python example with GPU support
+-------------------------------
+
+The auto-tuning example above also runs on the GPU. Edit
+`demos/demo_auto_tune.py` at line 100 with the values
+
+```python
+keys_to_test = keys_gpu
+use_gpu = True
+```
+
+and you can run
+
+```
+export PYTHONPATH=.
+python demos/demo_auto_tune.py
+```
+
+to test the GPU code.
+
+
+Docker instructions
+===================
+
+For using GPU capabilities of Faiss, you'll need to run "nvidia-docker"
+rather than "docker". Make sure that docker
+(https://docs.docker.com/engine/installation/) and nvidia-docker
+(https://github.com/NVIDIA/nvidia-docker) are installed on your system.
+
+To build the "faiss" image, run
+
+  `nvidia-docker build -t faiss .`
+
+or if you don't want/need to clone the sources, just run
+
+  `nvidia-docker build -t faiss github.com/facebookresearch/faiss`
+
+If you want to run the tests during the docker build, uncomment the
+last 3 "RUN" steps in the Dockerfile. But you might want to run the
+tests by yourself, so just run
+
+  `nvidia-docker run -ti --name faiss faiss bash`
+
+and run what you want. If you need a dataset (like sift1M), download it
+inside the created container, or better, mount a directory from the host:
+
+  nvidia-docker run -ti --name faiss -v /my/host/data/folder/ann_dataset/sift/:/opt/faiss/sift1M faiss bash
+
+
+How to use Faiss in your own projects
+=====================================
+
+C++
+---
+
+The makefile generates a static and a dynamic library
+
+```
+libfaiss.a
+libfaiss.so (or libfaiss.dylib)
+```
+
+The executable should be linked to one of these. If you use
+the static version (.a), add the LDFLAGS used in the Makefile.
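As a concrete (but assumed, not prescribed) example of a program that links against `libfaiss`, the following builds an exact L2 index and queries it; the include path and the toy data are placeholders.

```c++
// Hypothetical sketch of an application linked against libfaiss:
// exact (brute-force) L2 search over a few synthetic vectors.
#include <faiss/IndexFlat.h>   // assumed install location of the header

#include <vector>

int main() {
    int d = 8;                          // vector dimension
    faiss::Index::idx_t nb = 1000;      // database size
    faiss::Index::idx_t nq = 5;         // number of queries
    faiss::Index::idx_t k = 4;          // neighbours per query

    std::vector<float> xb(nb * d, 0.0f);
    std::vector<float> xq(nq * d, 0.0f);
    for (faiss::Index::idx_t i = 0; i < nb; i++) xb[i * d] = float(i);
    for (faiss::Index::idx_t i = 0; i < nq; i++) xq[i * d] = float(i * 100);

    faiss::IndexFlatL2 index(d);        // no training required
    index.add(nb, xb.data());

    std::vector<float> distances(nq * k);
    std::vector<faiss::Index::idx_t> labels(nq * k);
    index.search(nq, xq.data(), k, distances.data(), labels.data());
    return 0;
}
```

Compiling such a file might look like `g++ -std=c++11 demo.cpp -lfaiss -fopenmp` plus the BLAS link flags, but the exact flags come from the generated makefile.inc and vary per machine.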
+ +For binary-only distributions, the headers should be under +a `faiss/` directory, so that they can be included as + +```c++ +#include +#include +``` + +Python +------ + +To import Faiss in your own Python project, you need the files + +``` +__init__.py +swigfaiss.py +_swigfaiss.so +``` +to be present in a `faiss/` directory visible in the PYTHONPATH or in the +current directory. +Then Faiss can be used in python with + +```python +import faiss +``` diff --git a/core/src/index/thirdparty/faiss/IVFlib.cpp b/core/src/index/thirdparty/faiss/IVFlib.cpp new file mode 100644 index 0000000000..9af93e38dc --- /dev/null +++ b/core/src/index/thirdparty/faiss/IVFlib.cpp @@ -0,0 +1,344 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include + +#include +#include +#include + + +namespace faiss { namespace ivflib { + + +void check_compatible_for_merge (const Index * index0, + const Index * index1) +{ + + const faiss::IndexPreTransform *pt0 = + dynamic_cast(index0); + + if (pt0) { + const faiss::IndexPreTransform *pt1 = + dynamic_cast(index1); + FAISS_THROW_IF_NOT_MSG (pt1, "both indexes should be pretransforms"); + + FAISS_THROW_IF_NOT (pt0->chain.size() == pt1->chain.size()); + for (int i = 0; i < pt0->chain.size(); i++) { + FAISS_THROW_IF_NOT (typeid(pt0->chain[i]) == typeid(pt1->chain[i])); + } + + index0 = pt0->index; + index1 = pt1->index; + } + FAISS_THROW_IF_NOT (typeid(index0) == typeid(index1)); + FAISS_THROW_IF_NOT (index0->d == index1->d && + index0->metric_type == index1->metric_type); + + const faiss::IndexIVF *ivf0 = dynamic_cast(index0); + if (ivf0) { + const faiss::IndexIVF *ivf1 = + dynamic_cast(index1); + FAISS_THROW_IF_NOT (ivf1); + + ivf0->check_compatible_for_merge (*ivf1); + } + + // TODO: check as thoroughfully for other index types + +} + +const IndexIVF * extract_index_ivf (const Index * index) +{ + if (auto *pt = + dynamic_cast(index)) { + index = pt->index; + } + + auto *ivf = dynamic_cast(index); + + FAISS_THROW_IF_NOT (ivf); + + return ivf; +} + +IndexIVF * extract_index_ivf (Index * index) { + return const_cast (extract_index_ivf ((const Index*)(index))); +} + +void merge_into(faiss::Index *index0, faiss::Index *index1, bool shift_ids) { + + check_compatible_for_merge (index0, index1); + IndexIVF * ivf0 = extract_index_ivf (index0); + IndexIVF * ivf1 = extract_index_ivf (index1); + + ivf0->merge_from (*ivf1, shift_ids ? 
ivf0->ntotal : 0); + + // useful for IndexPreTransform + index0->ntotal = ivf0->ntotal; + index1->ntotal = ivf1->ntotal; +} + + + +void search_centroid(faiss::Index *index, + const float* x, int n, + idx_t* centroid_ids) +{ + std::unique_ptr del; + if (auto index_pre = dynamic_cast(index)) { + x = index_pre->apply_chain(n, x); + del.reset((float*)x); + index = index_pre->index; + } + faiss::IndexIVF* index_ivf = dynamic_cast(index); + assert(index_ivf); + index_ivf->quantizer->assign(n, x, centroid_ids); +} + + + +void search_and_return_centroids(faiss::Index *index, + size_t n, + const float* xin, + long k, + float *distances, + idx_t* labels, + idx_t* query_centroid_ids, + idx_t* result_centroid_ids) +{ + const float *x = xin; + std::unique_ptr del; + if (auto index_pre = dynamic_cast(index)) { + x = index_pre->apply_chain(n, x); + del.reset((float*)x); + index = index_pre->index; + } + faiss::IndexIVF* index_ivf = dynamic_cast(index); + assert(index_ivf); + + size_t nprobe = index_ivf->nprobe; + std::vector cent_nos (n * nprobe); + std::vector cent_dis (n * nprobe); + index_ivf->quantizer->search( + n, x, nprobe, cent_dis.data(), cent_nos.data()); + + if (query_centroid_ids) { + for (size_t i = 0; i < n; i++) + query_centroid_ids[i] = cent_nos[i * nprobe]; + } + + index_ivf->search_preassigned (n, x, k, + cent_nos.data(), cent_dis.data(), + distances, labels, true); + + for (size_t i = 0; i < n * k; i++) { + idx_t label = labels[i]; + if (label < 0) { + if (result_centroid_ids) + result_centroid_ids[i] = -1; + } else { + long list_no = label >> 32; + long list_index = label & 0xffffffff; + if (result_centroid_ids) + result_centroid_ids[i] = list_no; + labels[i] = index_ivf->invlists->get_single_id(list_no, list_index); + } + } +} + + +SlidingIndexWindow::SlidingIndexWindow (Index *index): index (index) { + n_slice = 0; + IndexIVF* index_ivf = const_cast(extract_index_ivf (index)); + ils = dynamic_cast (index_ivf->invlists); + nlist = ils->nlist; + FAISS_THROW_IF_NOT_MSG (ils, + "only supports indexes with ArrayInvertedLists"); + sizes.resize(nlist); +} + +template +static void shift_and_add (std::vector & dst, + size_t remove, + const std::vector & src) +{ + if (remove > 0) + memmove (dst.data(), dst.data() + remove, + (dst.size() - remove) * sizeof (T)); + size_t insert_point = dst.size() - remove; + dst.resize (insert_point + src.size()); + memcpy (dst.data() + insert_point, src.data (), src.size() * sizeof(T)); +} + +template +static void remove_from_begin (std::vector & v, + size_t remove) +{ + if (remove > 0) + v.erase (v.begin(), v.begin() + remove); +} + +void SlidingIndexWindow::step(const Index *sub_index, bool remove_oldest) { + + FAISS_THROW_IF_NOT_MSG (!remove_oldest || n_slice > 0, + "cannot remove slice: there is none"); + + const ArrayInvertedLists *ils2 = nullptr; + if(sub_index) { + check_compatible_for_merge (index, sub_index); + ils2 = dynamic_cast( + extract_index_ivf (sub_index)->invlists); + FAISS_THROW_IF_NOT_MSG (ils2, "supports only ArrayInvertedLists"); + } + IndexIVF *index_ivf = extract_index_ivf (index); + + if (remove_oldest && ils2) { + for (int i = 0; i < nlist; i++) { + std::vector & sizesi = sizes[i]; + size_t amount_to_remove = sizesi[0]; + index_ivf->ntotal += ils2->ids[i].size() - amount_to_remove; + + shift_and_add (ils->ids[i], amount_to_remove, ils2->ids[i]); + shift_and_add (ils->codes[i], amount_to_remove * ils->code_size, + ils2->codes[i]); + for (int j = 0; j + 1 < n_slice; j++) { + sizesi[j] = sizesi[j + 1] - amount_to_remove; + } + 
sizesi[n_slice - 1] = ils->ids[i].size(); + } + } else if (ils2) { + for (int i = 0; i < nlist; i++) { + index_ivf->ntotal += ils2->ids[i].size(); + shift_and_add (ils->ids[i], 0, ils2->ids[i]); + shift_and_add (ils->codes[i], 0, ils2->codes[i]); + sizes[i].push_back(ils->ids[i].size()); + } + n_slice++; + } else if (remove_oldest) { + for (int i = 0; i < nlist; i++) { + size_t amount_to_remove = sizes[i][0]; + index_ivf->ntotal -= amount_to_remove; + remove_from_begin (ils->ids[i], amount_to_remove); + remove_from_begin (ils->codes[i], + amount_to_remove * ils->code_size); + for (int j = 0; j + 1 < n_slice; j++) { + sizes[i][j] = sizes[i][j + 1] - amount_to_remove; + } + sizes[i].pop_back (); + } + n_slice--; + } else { + FAISS_THROW_MSG ("nothing to do???"); + } + index->ntotal = index_ivf->ntotal; +} + + + +// Get a subset of inverted lists [i0, i1). Works on IndexIVF's and +// IndexIVF's embedded in a IndexPreTransform + +ArrayInvertedLists * +get_invlist_range (const Index *index, long i0, long i1) +{ + const IndexIVF *ivf = extract_index_ivf (index); + + FAISS_THROW_IF_NOT (0 <= i0 && i0 <= i1 && i1 <= ivf->nlist); + + const InvertedLists *src = ivf->invlists; + + ArrayInvertedLists * il = new ArrayInvertedLists(i1 - i0, src->code_size); + + for (long i = i0; i < i1; i++) { + il->add_entries(i - i0, src->list_size(i), + InvertedLists::ScopedIds (src, i).get(), + InvertedLists::ScopedCodes (src, i).get()); + } + return il; +} + + + +void set_invlist_range (Index *index, long i0, long i1, + ArrayInvertedLists * src) +{ + IndexIVF *ivf = extract_index_ivf (index); + + FAISS_THROW_IF_NOT (0 <= i0 && i0 <= i1 && i1 <= ivf->nlist); + + ArrayInvertedLists *dst = dynamic_cast(ivf->invlists); + FAISS_THROW_IF_NOT_MSG (dst, "only ArrayInvertedLists supported"); + FAISS_THROW_IF_NOT (src->nlist == i1 - i0 && + dst->code_size == src->code_size); + + size_t ntotal = index->ntotal; + for (long i = i0 ; i < i1; i++) { + ntotal -= dst->list_size (i); + ntotal += src->list_size (i - i0); + std::swap (src->codes[i - i0], dst->codes[i]); + std::swap (src->ids[i - i0], dst->ids[i]); + } + ivf->ntotal = index->ntotal = ntotal; +} + + +void search_with_parameters (const Index *index, + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + IVFSearchParameters *params, + size_t *nb_dis_ptr) +{ + FAISS_THROW_IF_NOT (params); + const float *prev_x = x; + ScopeDeleter del; + + if (auto ip = dynamic_cast (index)) { + x = ip->apply_chain (n, x); + if (x != prev_x) { + del.set(x); + } + index = ip->index; + } + + std::vector Iq(params->nprobe * n); + std::vector Dq(params->nprobe * n); + + const IndexIVF *index_ivf = dynamic_cast(index); + FAISS_THROW_IF_NOT (index_ivf); + + double t0 = getmillisecs(); + index_ivf->quantizer->search(n, x, params->nprobe, + Dq.data(), Iq.data()); + double t1 = getmillisecs(); + indexIVF_stats.quantization_time += t1 - t0; + + if (nb_dis_ptr) { + size_t nb_dis = 0; + const InvertedLists *il = index_ivf->invlists; + for (idx_t i = 0; i < n * params->nprobe; i++) { + if (Iq[i] >= 0) { + nb_dis += il->list_size(Iq[i]); + } + } + *nb_dis_ptr = nb_dis; + } + + index_ivf->search_preassigned(n, x, k, Iq.data(), Dq.data(), + distances, labels, + false, params); + double t2 = getmillisecs(); + indexIVF_stats.search_time += t2 - t1; +} + + + +} } // namespace faiss::ivflib diff --git a/core/src/index/thirdparty/faiss/IVFlib.h b/core/src/index/thirdparty/faiss/IVFlib.h new file mode 100644 index 0000000000..7b6f3157ea --- /dev/null +++ 
b/core/src/index/thirdparty/faiss/IVFlib.h @@ -0,0 +1,132 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_IVFLIB_H +#define FAISS_IVFLIB_H + +/** Since IVF (inverted file) indexes are of so much use for + * large-scale use cases, we group a few functions related to them in + * this small library. Most functions work both on IndexIVFs and + * IndexIVFs embedded within an IndexPreTransform. + */ + +#include +#include + +namespace faiss { namespace ivflib { + + +/** check if two indexes have the same parameters and are trained in + * the same way, otherwise throw. */ +void check_compatible_for_merge (const Index * index1, + const Index * index2); + +/** get an IndexIVF from an index. The index may be an IndexIVF or + * some wrapper class that encloses an IndexIVF + * + * throws an exception if this is not the case. + */ +const IndexIVF * extract_index_ivf (const Index * index); +IndexIVF * extract_index_ivf (Index * index); + +/** Merge index1 into index0. Works on IndexIVF's and IndexIVF's + * embedded in a IndexPreTransform. On output, the index1 is empty. + * + * @param shift_ids: translate the ids from index1 to index0->prev_ntotal + */ +void merge_into(Index *index0, Index *index1, bool shift_ids); + +typedef Index::idx_t idx_t; + +/* Returns the cluster the embeddings belong to. + * + * @param index Index, which should be an IVF index + * (otherwise there are no clusters) + * @param embeddings object descriptors for which the centroids should be found, + * size num_objects * d + * @param centroid_ids + * cluster id each object belongs to, size num_objects + */ +void search_centroid(Index *index, + const float* x, int n, + idx_t* centroid_ids); + +/* Returns the cluster the embeddings belong to. + * + * @param index Index, which should be an IVF index + * (otherwise there are no clusters) + * @param query_centroid_ids + * centroid ids corresponding to the query vectors (size n) + * @param result_centroid_ids + * centroid ids corresponding to the results (size n * k) + * other arguments are the same as the standard search function + */ +void search_and_return_centroids(Index *index, + size_t n, + const float* xin, + long k, + float *distances, + idx_t* labels, + idx_t* query_centroid_ids, + idx_t* result_centroid_ids); + + +/** A set of IndexIVFs concatenated together in a FIFO fashion. + * at each "step", the oldest index slice is removed and a new index is added. + */ +struct SlidingIndexWindow { + /// common index that contains the sliding window + Index * index; + + /// InvertedLists of index + ArrayInvertedLists *ils; + + /// number of slices currently in index + int n_slice; + + /// same as index->nlist + size_t nlist; + + /// cumulative list sizes at each slice + std::vector > sizes; + + /// index should be initially empty and trained + SlidingIndexWindow (Index *index); + + /** Add one index to the current index and remove the oldest one. 
+ * + * @param sub_index slice to swap in (can be NULL) + * @param remove_oldest if true, remove the oldest slices */ + void step(const Index *sub_index, bool remove_oldest); + +}; + + +/// Get a subset of inverted lists [i0, i1) +ArrayInvertedLists * get_invlist_range (const Index *index, + long i0, long i1); + +/// Set a subset of inverted lists +void set_invlist_range (Index *index, long i0, long i1, + ArrayInvertedLists * src); + +// search an IndexIVF, possibly embedded in an IndexPreTransform with +// given parameters. Optionally returns the number of distances +// computed +void search_with_parameters (const Index *index, + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + IVFSearchParameters *params, + size_t *nb_dis = nullptr); + + + +} } // namespace faiss::ivflib + +#endif diff --git a/core/src/index/thirdparty/faiss/Index.cpp b/core/src/index/thirdparty/faiss/Index.cpp new file mode 100644 index 0000000000..a85f9ab594 --- /dev/null +++ b/core/src/index/thirdparty/faiss/Index.cpp @@ -0,0 +1,171 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include + +#include + + +namespace faiss { + +Index::~Index () +{ +} + + +void Index::train(idx_t /*n*/, const float* /*x*/) { + // does nothing by default +} + + +void Index::range_search (idx_t , const float *, float, + RangeSearchResult *) const +{ + FAISS_THROW_MSG ("range search not implemented"); +} + +void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k) +{ + float * distances = new float[n * k]; + ScopeDeleter del(distances); + search (n, x, k, distances, labels); +} + +void Index::add_with_ids( + idx_t /*n*/, + const float* /*x*/, + const idx_t* /*xids*/) { + FAISS_THROW_MSG ("add_with_ids not implemented for this type of index"); +} + +size_t Index::remove_ids(const IDSelector& /*sel*/) { + FAISS_THROW_MSG ("remove_ids not implemented for this type of index"); + return -1; +} + + +void Index::reconstruct (idx_t, float * ) const { + FAISS_THROW_MSG ("reconstruct not implemented for this type of index"); +} + + +void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const { + for (idx_t i = 0; i < ni; i++) { + reconstruct (i0 + i, recons + i * d); + } +} + + +void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + float *recons) const { + search (n, x, k, distances, labels); + for (idx_t i = 0; i < n; ++i) { + for (idx_t j = 0; j < k; ++j) { + idx_t ij = i * k + j; + idx_t key = labels[ij]; + float* reconstructed = recons + ij * d; + if (key < 0) { + // Fill with NaNs + memset(reconstructed, -1, sizeof(*reconstructed) * d); + } else { + reconstruct (key, reconstructed); + } + } + } +} + +void Index::compute_residual (const float * x, + float * residual, idx_t key) const { + reconstruct (key, residual); + for (size_t i = 0; i < d; i++) { + residual[i] = x[i] - residual[i]; + } +} + +void Index::compute_residual_n (idx_t n, const float* xs, + float* residuals, + const idx_t* keys) const { +#pragma omp parallel for + for (idx_t i = 0; i < n; ++i) { + compute_residual(&xs[i * d], &residuals[i * d], keys[i]); + } +} + + + +size_t Index::sa_code_size () const +{ + FAISS_THROW_MSG ("standalone codec not implemented for this type of index"); +} + +void Index::sa_encode (idx_t, const float *, + uint8_t *) const +{ + FAISS_THROW_MSG ("standalone 
codec not implemented for this type of index"); +} + +void Index::sa_decode (idx_t, const uint8_t *, + float *) const +{ + FAISS_THROW_MSG ("standalone codec not implemented for this type of index"); +} + + +namespace { + + +// storage that explicitly reconstructs vectors before computing distances +struct GenericDistanceComputer : DistanceComputer { + size_t d; + const Index& storage; + std::vector buf; + const float *q; + + explicit GenericDistanceComputer(const Index& storage) + : storage(storage) { + d = storage.d; + buf.resize(d * 2); + } + + float operator () (idx_t i) override { + storage.reconstruct(i, buf.data()); + return fvec_L2sqr(q, buf.data(), d); + } + + float symmetric_dis(idx_t i, idx_t j) override { + storage.reconstruct(i, buf.data()); + storage.reconstruct(j, buf.data() + d); + return fvec_L2sqr(buf.data() + d, buf.data(), d); + } + + void set_query(const float *x) override { + q = x; + } + +}; + + +} // namespace + + +DistanceComputer * Index::get_distance_computer() const { + if (metric_type == METRIC_L2) { + return new GenericDistanceComputer(*this); + } else { + FAISS_THROW_MSG ("get_distance_computer() not implemented"); + } +} + + +} diff --git a/core/src/index/thirdparty/faiss/Index.h b/core/src/index/thirdparty/faiss/Index.h new file mode 100644 index 0000000000..7c41f87dd0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/Index.h @@ -0,0 +1,261 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_H +#define FAISS_INDEX_H + + +#include +#include +#include +#include + +#define FAISS_VERSION_MAJOR 1 +#define FAISS_VERSION_MINOR 6 +#define FAISS_VERSION_PATCH 0 + +/** + * @namespace faiss + * + * Throughout the library, vectors are provided as float * pointers. + * Most algorithms can be optimized when several vectors are processed + * (added/searched) together in a batch. In this case, they are passed + * in as a matrix. When n vectors of size d are provided as float * x, + * component j of vector i is + * + * x[ i * d + j ] + * + * where 0 <= i < n and 0 <= j < d. In other words, matrices are + * always compact. When specifying the size of the matrix, we call it + * an n*d matrix, which implies a row-major storage. + */ + + +namespace faiss { + + +/// Some algorithms support both an inner product version and a L2 search version. +enum MetricType { + METRIC_INNER_PRODUCT = 0, ///< maximum inner product search + METRIC_L2 = 1, ///< squared L2 search + METRIC_L1, ///< L1 (aka cityblock) + METRIC_Linf, ///< infinity distance + METRIC_Lp, ///< L_p distance, p is given by metric_arg + + /// some additional metrics defined in scipy.spatial.distance + METRIC_Canberra = 20, + METRIC_BrayCurtis, + METRIC_JensenShannon, + +}; + + +/// Forward declarations see AuxIndexStructures.h +struct IDSelector; +struct RangeSearchResult; +struct DistanceComputer; + +/** Abstract structure for an index + * + * Supports adding vertices and searching them. + * + * Currently only asymmetric queries are supported: + * database-to-database queries are not implemented. 
+ */ +struct Index { + using idx_t = int64_t; ///< all indices are this type + using component_t = float; + using distance_t = float; + + int d; ///< vector dimension + idx_t ntotal; ///< total nb of indexed vectors + bool verbose; ///< verbosity level + + /// set if the Index does not require training, or if training is + /// done already + bool is_trained; + + /// type of metric this index uses for search + MetricType metric_type; + float metric_arg; ///< argument of the metric type + + explicit Index (idx_t d = 0, MetricType metric = METRIC_L2): + d(d), + ntotal(0), + verbose(false), + is_trained(true), + metric_type (metric), + metric_arg(0) {} + + virtual ~Index (); + + + /** Perform training on a representative set of vectors + * + * @param n nb of training vectors + * @param x training vecors, size n * d + */ + virtual void train(idx_t n, const float* x); + + /** Add n vectors of dimension d to the index. + * + * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1 + * This function slices the input vectors in chuncks smaller than + * blocksize_add and calls add_core. + * @param x input matrix, size n * d + */ + virtual void add (idx_t n, const float *x) = 0; + + /** Same as add, but stores xids instead of sequential ids. + * + * The default implementation fails with an assertion, as it is + * not supported by all indexes. + * + * @param xids if non-null, ids to store for the vectors (size n) + */ + virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids); + + /** query n vectors of dimension d to the index. + * + * return at most k vectors. If there are not enough results for a + * query, the result array is padded with -1s. + * + * @param x input vectors to search, size n * d + * @param labels output labels of the NNs, size n*k + * @param distances output pairwise distances, size n*k + */ + virtual void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const = 0; + + /** query n vectors of dimension d to the index. + * + * return all vectors with distance < radius. Note that many + * indexes do not implement the range_search (only the k-NN search + * is mandatory). + * + * @param x input vectors to search, size n * d + * @param radius search radius + * @param result result table + */ + virtual void range_search (idx_t n, const float *x, float radius, + RangeSearchResult *result) const; + + /** return the indexes of the k vectors closest to the query x. + * + * This function is identical as search but only return labels of neighbors. + * @param x input vectors to search, size n * d + * @param labels output labels of the NNs, size n*k + */ + void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1); + + /// removes all elements from the database. + virtual void reset() = 0; + + /** removes IDs from the index. Not supported by all + * indexes. Returns the number of elements removed. 
+ */ + virtual size_t remove_ids (const IDSelector & sel); + + /** Reconstruct a stored vector (or an approximation if lossy coding) + * + * this function may not be defined for some indexes + * @param key id of the vector to reconstruct + * @param recons reconstucted vector (size d) + */ + virtual void reconstruct (idx_t key, float * recons) const; + + /** Reconstruct vectors i0 to i0 + ni - 1 + * + * this function may not be defined for some indexes + * @param recons reconstucted vector (size ni * d) + */ + virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const; + + /** Similar to search, but also reconstructs the stored vectors (or an + * approximation in the case of lossy coding) for the search results. + * + * If there are not enough results for a query, the resulting arrays + * is padded with -1s. + * + * @param recons reconstructed vectors size (n, k, d) + **/ + virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + float *recons) const; + + /** Computes a residual vector after indexing encoding. + * + * The residual vector is the difference between a vector and the + * reconstruction that can be decoded from its representation in + * the index. The residual can be used for multiple-stage indexing + * methods, like IndexIVF's methods. + * + * @param x input vector, size d + * @param residual output residual vector, size d + * @param key encoded index, as returned by search and assign + */ + virtual void compute_residual (const float * x, + float * residual, idx_t key) const; + + /** Computes a residual vector after indexing encoding (batch form). + * Equivalent to calling compute_residual for each vector. + * + * The residual vector is the difference between a vector and the + * reconstruction that can be decoded from its representation in + * the index. The residual can be used for multiple-stage indexing + * methods, like IndexIVF's methods. + * + * @param n number of vectors + * @param xs input vectors, size (n x d) + * @param residuals output residual vectors, size (n x d) + * @param keys encoded index, as returned by search and assign + */ + virtual void compute_residual_n (idx_t n, const float* xs, + float* residuals, + const idx_t* keys) const; + + /** Get a DistanceComputer (defined in AuxIndexStructures) object + * for this kind of index. + * + * DistanceComputer is implemented for indexes that support random + * access of their vectors. + */ + virtual DistanceComputer * get_distance_computer() const; + + + /* The standalone codec interface */ + + /** size of the produced codes in bytes */ + virtual size_t sa_code_size () const; + + /** encode a set of vectors + * + * @param n number of vectors + * @param x input vectors, size n * d + * @param bytes output encoded vectors, size n * sa_code_size() + */ + virtual void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const; + + /** encode a set of vectors + * + * @param n number of vectors + * @param bytes input encoded vectors, size n * sa_code_size() + * @param x output vectors, size n * d + */ + virtual void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const; + + +}; + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/Index2Layer.cpp b/core/src/index/thirdparty/faiss/Index2Layer.cpp new file mode 100644 index 0000000000..45ff042a62 --- /dev/null +++ b/core/src/index/thirdparty/faiss/Index2Layer.cpp @@ -0,0 +1,437 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#ifdef __SSE__ +#include +#endif + +#include + +#include + +#include +#include +#include +#include +#include + + +/* +#include + +#include + +#include + + +*/ + + +namespace faiss { + +using idx_t = Index::idx_t; + +/************************************* + * Index2Layer implementation + *************************************/ + + +Index2Layer::Index2Layer (Index * quantizer, size_t nlist, + int M, int nbit, + MetricType metric): + Index (quantizer->d, metric), + q1 (quantizer, nlist), + pq (quantizer->d, M, nbit) +{ + is_trained = false; + for (int nbyte = 0; nbyte < 7; nbyte++) { + if ((1L << (8 * nbyte)) >= nlist) { + code_size_1 = nbyte; + break; + } + } + code_size_2 = pq.code_size; + code_size = code_size_1 + code_size_2; +} + +Index2Layer::Index2Layer () +{ + code_size = code_size_1 = code_size_2 = 0; +} + +Index2Layer::~Index2Layer () +{} + +void Index2Layer::train(idx_t n, const float* x) +{ + if (verbose) { + printf ("training level-1 quantizer %ld vectors in %dD\n", + n, d); + } + + q1.train_q1 (n, x, verbose, metric_type); + + if (verbose) { + printf("computing residuals\n"); + } + + const float * x_in = x; + + x = fvecs_maybe_subsample ( + d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub, + x, verbose, pq.cp.seed); + + ScopeDeleter del_x (x_in == x ? nullptr : x); + + std::vector assign(n); // assignement to coarse centroids + q1.quantizer->assign (n, x, assign.data()); + std::vector residuals(n * d); + for (idx_t i = 0; i < n; i++) { + q1.quantizer->compute_residual ( + x + i * d, residuals.data() + i * d, assign[i]); + } + + if (verbose) + printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n", + pq.M, pq.ksub, n, d); + pq.verbose = verbose; + pq.train (n, residuals.data()); + + is_trained = true; +} + +void Index2Layer::add(idx_t n, const float* x) +{ + idx_t bs = 32768; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(i0 + bs, n); + if (verbose) { + printf("Index2Layer::add: adding %ld:%ld / %ld\n", + i0, i1, n); + } + add (i1 - i0, x + i0 * d); + } + return; + } + + std::vector codes1 (n); + q1.quantizer->assign (n, x, codes1.data()); + std::vector residuals(n * d); + for (idx_t i = 0; i < n; i++) { + q1.quantizer->compute_residual ( + x + i * d, residuals.data() + i * d, codes1[i]); + } + std::vector codes2 (n * code_size_2); + + pq.compute_codes (residuals.data(), codes2.data(), n); + + codes.resize ((ntotal + n) * code_size); + uint8_t *wp = &codes[ntotal * code_size]; + + { + int i = 0x11223344; + const char *ip = (char*)&i; + FAISS_THROW_IF_NOT_MSG (ip[0] == 0x44, + "works only on a little-endian CPU"); + } + + // copy to output table + for (idx_t i = 0; i < n; i++) { + memcpy (wp, &codes1[i], code_size_1); + wp += code_size_1; + memcpy (wp, &codes2[i * code_size_2], code_size_2); + wp += code_size_2; + } + + ntotal += n; + +} + +void Index2Layer::search( + idx_t /*n*/, + const float* /*x*/, + idx_t /*k*/, + float* /*distances*/, + idx_t* /*labels*/) const { + FAISS_THROW_MSG("not implemented"); +} + + +void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const +{ + float recons1[d]; + FAISS_THROW_IF_NOT (i0 >= 0 && i0 + ni <= ntotal); + const uint8_t *rp = &codes[i0 * code_size]; + + for (idx_t i = 0; i < ni; i++) { + idx_t key = 0; + memcpy (&key, rp, code_size_1); + q1.quantizer->reconstruct 
(key, recons1); + rp += code_size_1; + pq.decode (rp, recons); + for (idx_t j = 0; j < d; j++) { + recons[j] += recons1[j]; + } + rp += code_size_2; + recons += d; + } +} + +void Index2Layer::transfer_to_IVFPQ (IndexIVFPQ & other) const +{ + FAISS_THROW_IF_NOT (other.nlist == q1.nlist); + FAISS_THROW_IF_NOT (other.code_size == code_size_2); + FAISS_THROW_IF_NOT (other.ntotal == 0); + + const uint8_t *rp = codes.data(); + + for (idx_t i = 0; i < ntotal; i++) { + idx_t key = 0; + memcpy (&key, rp, code_size_1); + rp += code_size_1; + other.invlists->add_entry (key, i, rp); + rp += code_size_2; + } + + other.ntotal = ntotal; + +} + + + +void Index2Layer::reconstruct(idx_t key, float* recons) const +{ + reconstruct_n (key, 1, recons); +} + +void Index2Layer::reset() +{ + ntotal = 0; + codes.clear (); +} + + +namespace { + + +struct Distance2Level : DistanceComputer { + size_t d; + const Index2Layer& storage; + std::vector buf; + const float *q; + + const float *pq_l1_tab, *pq_l2_tab; + + explicit Distance2Level(const Index2Layer& storage) + : storage(storage) { + d = storage.d; + FAISS_ASSERT(storage.pq.dsub == 4); + pq_l2_tab = storage.pq.centroids.data(); + buf.resize(2 * d); + } + + float symmetric_dis(idx_t i, idx_t j) override { + storage.reconstruct(i, buf.data()); + storage.reconstruct(j, buf.data() + d); + return fvec_L2sqr(buf.data() + d, buf.data(), d); + } + + void set_query(const float *x) override { + q = x; + } +}; + +// well optimized for xNN+PQNN +struct DistanceXPQ4 : Distance2Level { + + int M, k; + + explicit DistanceXPQ4(const Index2Layer& storage) + : Distance2Level (storage) { + const IndexFlat *quantizer = + dynamic_cast (storage.q1.quantizer); + + FAISS_ASSERT(quantizer); + M = storage.pq.M; + pq_l1_tab = quantizer->xb.data(); + } + + float operator () (idx_t i) override { +#ifdef __SSE__ + const uint8_t *code = storage.codes.data() + i * storage.code_size; + long key = 0; + memcpy (&key, code, storage.code_size_1); + code += storage.code_size_1; + + // walking pointers + const float *qa = q; + const __m128 *l1_t = (const __m128 *)(pq_l1_tab + d * key); + const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab; + __m128 accu = _mm_setzero_ps(); + + for (int m = 0; m < M; m++) { + __m128 qi = _mm_loadu_ps(qa); + __m128 recons = l1_t[m] + pq_l2_t[*code++]; + __m128 diff = qi - recons; + accu += diff * diff; + pq_l2_t += 256; + qa += 4; + } + + accu = _mm_hadd_ps (accu, accu); + accu = _mm_hadd_ps (accu, accu); + return _mm_cvtss_f32 (accu); +#else + FAISS_THROW_MSG("not implemented for non-x64 platforms"); +#endif + } + +}; + +// well optimized for 2xNN+PQNN +struct Distance2xXPQ4 : Distance2Level { + + int M_2, mi_nbits; + + explicit Distance2xXPQ4(const Index2Layer& storage) + : Distance2Level(storage) { + const MultiIndexQuantizer *mi = + dynamic_cast (storage.q1.quantizer); + + FAISS_ASSERT(mi); + FAISS_ASSERT(storage.pq.M % 2 == 0); + M_2 = storage.pq.M / 2; + mi_nbits = mi->pq.nbits; + pq_l1_tab = mi->pq.centroids.data(); + } + + float operator () (idx_t i) override { + const uint8_t *code = storage.codes.data() + i * storage.code_size; + long key01 = 0; + memcpy (&key01, code, storage.code_size_1); + code += storage.code_size_1; +#ifdef __SSE__ + + // walking pointers + const float *qa = q; + const __m128 *pq_l1_t = (const __m128 *)pq_l1_tab; + const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab; + __m128 accu = _mm_setzero_ps(); + + for (int mi_m = 0; mi_m < 2; mi_m++) { + long l1_idx = key01 & ((1L << mi_nbits) - 1); + const __m128 * pq_l1 = pq_l1_t + M_2 * l1_idx; + + 
for (int m = 0; m < M_2; m++) { + __m128 qi = _mm_loadu_ps(qa); + __m128 recons = pq_l1[m] + pq_l2_t[*code++]; + __m128 diff = qi - recons; + accu += diff * diff; + pq_l2_t += 256; + qa += 4; + } + pq_l1_t += M_2 << mi_nbits; + key01 >>= mi_nbits; + } + accu = _mm_hadd_ps (accu, accu); + accu = _mm_hadd_ps (accu, accu); + return _mm_cvtss_f32 (accu); +#else + FAISS_THROW_MSG("not implemented for non-x64 platforms"); +#endif + } + +}; + + +} // namespace + + +DistanceComputer * Index2Layer::get_distance_computer() const { +#ifdef __SSE__ + const MultiIndexQuantizer *mi = + dynamic_cast (q1.quantizer); + + if (mi && pq.M % 2 == 0 && pq.dsub == 4) { + return new Distance2xXPQ4(*this); + } + + const IndexFlat *fl = + dynamic_cast (q1.quantizer); + + if (fl && pq.dsub == 4) { + return new DistanceXPQ4(*this); + } +#endif + + return Index::get_distance_computer(); +} + + +/* The standalone codec interface */ +size_t Index2Layer::sa_code_size () const +{ + return code_size; +} + +void Index2Layer::sa_encode (idx_t n, const float *x, uint8_t *bytes) const +{ + FAISS_THROW_IF_NOT (is_trained); + std::unique_ptr list_nos (new int64_t [n]); + q1.quantizer->assign (n, x, list_nos.get()); + std::vector residuals(n * d); + for (idx_t i = 0; i < n; i++) { + q1.quantizer->compute_residual ( + x + i * d, residuals.data() + i * d, list_nos[i]); + } + pq.compute_codes (residuals.data(), bytes, n); + + for (idx_t i = n - 1; i >= 0; i--) { + uint8_t * code = bytes + i * code_size; + memmove (code + code_size_1, + bytes + i * code_size_2, code_size_2); + q1.encode_listno (list_nos[i], code); + } + +} + +void Index2Layer::sa_decode (idx_t n, const uint8_t *bytes, float *x) const +{ + +#pragma omp parallel + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + const uint8_t *code = bytes + i * code_size; + int64_t list_no = q1.decode_listno (code); + float *xi = x + i * d; + pq.decode (code + code_size_1, xi); + q1.quantizer->reconstruct (list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/Index2Layer.h b/core/src/index/thirdparty/faiss/Index2Layer.h new file mode 100644 index 0000000000..89f6ec776d --- /dev/null +++ b/core/src/index/thirdparty/faiss/Index2Layer.h @@ -0,0 +1,85 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include + +#include +#include + +namespace faiss { + +struct IndexIVFPQ; + + +/** Same as an IndexIVFPQ without the inverted lists: codes are stored sequentially + * + * The class is mainly inteded to store encoded vectors that can be + * accessed randomly, the search function is not implemented. + */ +struct Index2Layer: Index { + /// first level quantizer + Level1Quantizer q1; + + /// second level quantizer is always a PQ + ProductQuantizer pq; + + /// Codes. Size ntotal * code_size. 
+ std::vector codes; + + /// size of the code for the first level (ceil(log8(q1.nlist))) + size_t code_size_1; + + /// size of the code for the second level + size_t code_size_2; + + /// code_size_1 + code_size_2 + size_t code_size; + + Index2Layer (Index * quantizer, size_t nlist, + int M, int nbit = 8, + MetricType metric = METRIC_L2); + + Index2Layer (); + ~Index2Layer (); + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + /// not implemented + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; + + void reconstruct(idx_t key, float* recons) const override; + + void reset() override; + + DistanceComputer * get_distance_computer() const override; + + /// transfer the flat codes to an IVFPQ index + void transfer_to_IVFPQ(IndexIVFPQ & other) const; + + + /* The standalone codec interface */ + size_t sa_code_size () const override; + void sa_encode (idx_t n, const float *x, uint8_t *bytes) const override; + void sa_decode (idx_t n, const uint8_t *bytes, float *x) const override; + +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinary.cpp b/core/src/index/thirdparty/faiss/IndexBinary.cpp new file mode 100644 index 0000000000..5330004f84 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinary.cpp @@ -0,0 +1,77 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include +#include + +#include + +namespace faiss { + +IndexBinary::~IndexBinary() {} + +void IndexBinary::train(idx_t, const uint8_t *) { + // Does nothing by default. 
+} + +void IndexBinary::range_search(idx_t, const uint8_t *, int, + RangeSearchResult *) const { + FAISS_THROW_MSG("range search not implemented"); +} + +void IndexBinary::assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k) { + int *distances = new int[n * k]; + ScopeDeleter del(distances); + search(n, x, k, distances, labels); +} + +void IndexBinary::add_with_ids(idx_t, const uint8_t *, const idx_t *) { + FAISS_THROW_MSG("add_with_ids not implemented for this type of index"); +} + +size_t IndexBinary::remove_ids(const IDSelector&) { + FAISS_THROW_MSG("remove_ids not implemented for this type of index"); + return 0; +} + +void IndexBinary::reconstruct(idx_t, uint8_t *) const { + FAISS_THROW_MSG("reconstruct not implemented for this type of index"); +} + +void IndexBinary::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const { + for (idx_t i = 0; i < ni; i++) { + reconstruct(i0 + i, recons + i * d); + } +} + +void IndexBinary::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + uint8_t *recons) const { + search(n, x, k, distances, labels); + for (idx_t i = 0; i < n; ++i) { + for (idx_t j = 0; j < k; ++j) { + idx_t ij = i * k + j; + idx_t key = labels[ij]; + uint8_t *reconstructed = recons + ij * d; + if (key < 0) { + // Fill with NaNs + memset(reconstructed, -1, sizeof(*reconstructed) * d); + } else { + reconstruct(key, reconstructed); + } + } + } +} + +void IndexBinary::display() const { + printf("Index: %s -> %ld elements\n", typeid (*this).name(), ntotal); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinary.h b/core/src/index/thirdparty/faiss/IndexBinary.h new file mode 100644 index 0000000000..88042002e0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinary.h @@ -0,0 +1,163 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_BINARY_H +#define FAISS_INDEX_BINARY_H + +#include +#include +#include +#include + +#include +#include + + +namespace faiss { + + +/// Forward declarations see AuxIndexStructures.h +struct IDSelector; +struct RangeSearchResult; + +/** Abstract structure for a binary index. + * + * Supports adding vertices and searching them. + * + * All queries are symmetric because there is no distinction between codes and + * vectors. + */ +struct IndexBinary { + using idx_t = Index::idx_t; ///< all indices are this type + using component_t = uint8_t; + using distance_t = int32_t; + + int d; ///< vector dimension + int code_size; ///< number of bytes per vector ( = d / 8 ) + idx_t ntotal; ///< total nb of indexed vectors + bool verbose; ///< verbosity level + + /// set if the Index does not require training, or if training is done already + bool is_trained; + + /// type of metric this index uses for search + MetricType metric_type; + + explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_L2) + : d(d), + code_size(d / 8), + ntotal(0), + verbose(false), + is_trained(true), + metric_type(metric) { + FAISS_THROW_IF_NOT(d % 8 == 0); + } + + virtual ~IndexBinary(); + + + /** Perform training on a representative set of vectors. + * + * @param n nb of training vectors + * @param x training vecors, size n * d / 8 + */ + virtual void train(idx_t n, const uint8_t *x); + + /** Add n vectors of dimension d to the index. + * + * Vectors are implicitly assigned labels ntotal .. 
ntotal + n - 1 + * @param x input matrix, size n * d / 8 + */ + virtual void add(idx_t n, const uint8_t *x) = 0; + + /** Same as add, but stores xids instead of sequential ids. + * + * The default implementation fails with an assertion, as it is + * not supported by all indexes. + * + * @param xids if non-null, ids to store for the vectors (size n) + */ + virtual void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids); + + /** Query n vectors of dimension d to the index. + * + * return at most k vectors. If there are not enough results for a + * query, the result array is padded with -1s. + * + * @param x input vectors to search, size n * d / 8 + * @param labels output labels of the NNs, size n*k + * @param distances output pairwise distances, size n*k + */ + virtual void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const = 0; + + /** Query n vectors of dimension d to the index. + * + * return all vectors with distance < radius. Note that many + * indexes do not implement the range_search (only the k-NN search + * is mandatory). + * + * @param x input vectors to search, size n * d / 8 + * @param radius search radius + * @param result result table + */ + virtual void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const; + + /** Return the indexes of the k vectors closest to the query x. + * + * This function is identical to search but only returns labels of neighbors. + * @param x input vectors to search, size n * d / 8 + * @param labels output labels of the NNs, size n*k + */ + void assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k = 1); + + /// Removes all elements from the database. + virtual void reset() = 0; + + /** Removes IDs from the index. Not supported by all indexes. + */ + virtual size_t remove_ids(const IDSelector& sel); + + /** Reconstruct a stored vector. + * + * This function may not be defined for some indexes. + * @param key id of the vector to reconstruct + * @param recons reconstucted vector (size d / 8) + */ + virtual void reconstruct(idx_t key, uint8_t *recons) const; + + + /** Reconstruct vectors i0 to i0 + ni - 1. + * + * This function may not be defined for some indexes. + * @param recons reconstucted vectors (size ni * d / 8) + */ + virtual void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const; + + /** Similar to search, but also reconstructs the stored vectors (or an + * approximation in the case of lossy coding) for the search results. + * + * If there are not enough results for a query, the resulting array + * is padded with -1s. + * + * @param recons reconstructed vectors size (n, k, d) + **/ + virtual void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + uint8_t *recons) const; + + /** Display the actual class name and some more info. */ + void display() const; +}; + + +} // namespace faiss + +#endif // FAISS_INDEX_BINARY_H diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp b/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp new file mode 100644 index 0000000000..a3de92d449 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp @@ -0,0 +1,83 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +IndexBinaryFlat::IndexBinaryFlat(idx_t d) + : IndexBinary(d) {} + +void IndexBinaryFlat::add(idx_t n, const uint8_t *x) { + xb.insert(xb.end(), x, x + n * code_size); + ntotal += n; +} + +void IndexBinaryFlat::reset() { + xb.clear(); + ntotal = 0; +} + +void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const { + const idx_t block_size = query_batch_size; + for (idx_t s = 0; s < n; s += block_size) { + idx_t nn = block_size; + if (s + block_size > n) { + nn = n - s; + } + + if (use_heap) { + // We see the distances and labels as heaps. + int_maxheap_array_t res = { + size_t(nn), size_t(k), labels + s * k, distances + s * k + }; + + hammings_knn_hc(&res, x + s * code_size, xb.data(), ntotal, code_size, + /* ordered = */ true); + } else { + hammings_knn_mc(x + s * code_size, xb.data(), nn, ntotal, k, code_size, + distances + s * k, labels + s * k); + } + } +} + +size_t IndexBinaryFlat::remove_ids(const IDSelector& sel) { + idx_t j = 0; + for (idx_t i = 0; i < ntotal; i++) { + if (sel.is_member(i)) { + // should be removed + } else { + if (i > j) { + memmove(&xb[code_size * j], &xb[code_size * i], sizeof(xb[0]) * code_size); + } + j++; + } + } + long nremove = ntotal - j; + if (nremove > 0) { + ntotal = j; + xb.resize(ntotal * code_size); + } + return nremove; +} + +void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const { + memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFlat.h b/core/src/index/thirdparty/faiss/IndexBinaryFlat.h new file mode 100644 index 0000000000..6f24aac5b6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.h @@ -0,0 +1,54 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef INDEX_BINARY_FLAT_H +#define INDEX_BINARY_FLAT_H + +#include + +#include + +namespace faiss { + + +/** Index that stores the full vectors and performs exhaustive search. */ +struct IndexBinaryFlat : IndexBinary { + /// database vectors, size ntotal * d / 8 + std::vector xb; + + /** Select between using a heap or counting to select the k smallest values + * when scanning inverted lists. + */ + bool use_heap = true; + + size_t query_batch_size = 32; + + explicit IndexBinaryFlat(idx_t d); + + void add(idx_t n, const uint8_t *x) override; + + void reset() override; + + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void reconstruct(idx_t key, uint8_t *recons) const override; + + /** Remove some ids. Note that because of the indexing structure, + * the semantics of this operation are different from the usual ones: + * the new ids are shifted. */ + size_t remove_ids(const IDSelector& sel) override; + + IndexBinaryFlat() {} +}; + + +} // namespace faiss + +#endif // INDEX_BINARY_FLAT_H diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp new file mode 100644 index 0000000000..bc7200a80f --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp @@ -0,0 +1,78 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +namespace faiss { + + +IndexBinaryFromFloat::IndexBinaryFromFloat() {} + +IndexBinaryFromFloat::IndexBinaryFromFloat(Index *index) + : IndexBinary(index->d), + index(index), + own_fields(false) { + is_trained = index->is_trained; + ntotal = index->ntotal; +} + +IndexBinaryFromFloat::~IndexBinaryFromFloat() { + if (own_fields) { + delete index; + } +} + +void IndexBinaryFromFloat::add(idx_t n, const uint8_t *x) { + constexpr idx_t bs = 32768; + std::unique_ptr xf(new float[bs * d]); + + for (idx_t b = 0; b < n; b += bs) { + idx_t bn = std::min(bs, n - b); + binary_to_real(bn * d, x + b * code_size, xf.get()); + + index->add(bn, xf.get()); + } + ntotal = index->ntotal; +} + +void IndexBinaryFromFloat::reset() { + index->reset(); + ntotal = index->ntotal; +} + +void IndexBinaryFromFloat::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const { + constexpr idx_t bs = 32768; + std::unique_ptr xf(new float[bs * d]); + std::unique_ptr df(new float[bs * k]); + + for (idx_t b = 0; b < n; b += bs) { + idx_t bn = std::min(bs, n - b); + binary_to_real(bn * d, x + b * code_size, xf.get()); + + index->search(bn, xf.get(), k, df.get(), labels + b * k); + for (int i = 0; i < bn * k; ++i) { + distances[b * k + i] = int32_t(std::round(df[i] / 4.0)); + } + } +} + +void IndexBinaryFromFloat::train(idx_t n, const uint8_t *x) { + std::unique_ptr xf(new float[n * d]); + binary_to_real(n * d, x, xf.get()); + + index->train(n, xf.get()); + is_trained = true; + ntotal = index->ntotal; +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h new file mode 100644 index 0000000000..215af73ce6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_BINARY_FROM_FLOAT_H +#define FAISS_INDEX_BINARY_FROM_FLOAT_H + +#include + + +namespace faiss { + + +struct Index; + +/** IndexBinary backed by a float Index. + * + * Supports adding vertices and searching them. + * + * All queries are symmetric because there is no distinction between codes and + * vectors. + */ +struct IndexBinaryFromFloat : IndexBinary { + Index *index = nullptr; + + bool own_fields = false; ///< Whether object owns the index pointer. + + IndexBinaryFromFloat(); + + explicit IndexBinaryFromFloat(Index *index); + + ~IndexBinaryFromFloat(); + + void add(idx_t n, const uint8_t *x) override; + + void reset() override; + + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void train(idx_t n, const uint8_t *x) override; +}; + + +} // namespace faiss + +#endif // FAISS_INDEX_BINARY_FROM_FLOAT_H diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp new file mode 100644 index 0000000000..8e886f7253 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp @@ -0,0 +1,325 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + + +/************************************************************** + * add / search blocks of descriptors + **************************************************************/ + +namespace { + + +void hnsw_add_vertices(IndexBinaryHNSW& index_hnsw, + size_t n0, + size_t n, const uint8_t *x, + bool verbose, + bool preset_levels = false) { + HNSW& hnsw = index_hnsw.hnsw; + size_t ntotal = n0 + n; + double t0 = getmillisecs(); + if (verbose) { + printf("hnsw_add_vertices: adding %ld elements on top of %ld " + "(preset_levels=%d)\n", + n, n0, int(preset_levels)); + } + + int max_level = hnsw.prepare_level_tab(n, preset_levels); + + if (verbose) { + printf(" max_level = %d\n", max_level); + } + + std::vector locks(ntotal); + for(int i = 0; i < ntotal; i++) { + omp_init_lock(&locks[i]); + } + + // add vectors from highest to lowest level + std::vector hist; + std::vector order(n); + + { // make buckets with vectors of the same level + + // build histogram + for (int i = 0; i < n; i++) { + HNSW::storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + while (pt_level >= hist.size()) { + hist.push_back(0); + } + hist[pt_level] ++; + } + + // accumulate + std::vector offsets(hist.size() + 1, 0); + for (int i = 0; i < hist.size() - 1; i++) { + offsets[i + 1] = offsets[i] + hist[i]; + } + + // bucket sort + for (int i = 0; i < n; i++) { + HNSW::storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + order[offsets[pt_level]++] = pt_id; + } + } + + { // perform add + RandomGenerator rng2(789); + + int i1 = n; + + for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + int i0 = i1 - hist[pt_level]; + + if (verbose) { + printf("Adding %d elements at level %d\n", + i1 - i0, pt_level); + } + + // random permutation to get rid of dataset order bias + for (int j = i0; j < i1; j++) { + std::swap(order[j], order[j + rng2.rand_int(i1 - j)]); + } + +#pragma omp parallel + { + VisitedTable vt (ntotal); + + std::unique_ptr dis( + index_hnsw.get_distance_computer() + ); + int prev_display = verbose && omp_get_thread_num() == 0 ? 
0 : -1; + +#pragma omp for schedule(dynamic) + for (int i = i0; i < i1; i++) { + HNSW::storage_idx_t pt_id = order[i]; + dis->set_query((float *)(x + (pt_id - n0) * index_hnsw.code_size)); + + hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt); + + if (prev_display >= 0 && i - i0 > prev_display + 10000) { + prev_display = i - i0; + printf(" %d / %d\r", i - i0, i1 - i0); + fflush(stdout); + } + } + } + i1 = i0; + } + FAISS_ASSERT(i1 == 0); + } + if (verbose) { + printf("Done in %.3f ms\n", getmillisecs() - t0); + } + + for(int i = 0; i < ntotal; i++) + omp_destroy_lock(&locks[i]); +} + + +} // anonymous namespace + + +/************************************************************** + * IndexBinaryHNSW implementation + **************************************************************/ + +IndexBinaryHNSW::IndexBinaryHNSW() +{ + is_trained = true; +} + +IndexBinaryHNSW::IndexBinaryHNSW(int d, int M) + : IndexBinary(d), + hnsw(M), + own_fields(true), + storage(new IndexBinaryFlat(d)) +{ + is_trained = true; +} + +IndexBinaryHNSW::IndexBinaryHNSW(IndexBinary *storage, int M) + : IndexBinary(storage->d), + hnsw(M), + own_fields(false), + storage(storage) +{ + is_trained = true; +} + +IndexBinaryHNSW::~IndexBinaryHNSW() { + if (own_fields) { + delete storage; + } +} + +void IndexBinaryHNSW::train(idx_t n, const uint8_t *x) +{ + // hnsw structure does not require training + storage->train(n, x); + is_trained = true; +} + +void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const +{ +#pragma omp parallel + { + VisitedTable vt(ntotal); + std::unique_ptr dis(get_distance_computer()); + +#pragma omp for + for(idx_t i = 0; i < n; i++) { + idx_t *idxi = labels + i * k; + float *simi = (float *)(distances + i * k); + + dis->set_query((float *)(x + i * code_size)); + + maxheap_heapify(k, simi, idxi); + hnsw.search(*dis, k, idxi, simi, vt); + maxheap_reorder(k, simi, idxi); + } + } + +#pragma omp parallel for + for (int i = 0; i < n * k; ++i) { + distances[i] = std::round(((float *)distances)[i]); + } +} + + +void IndexBinaryHNSW::add(idx_t n, const uint8_t *x) +{ + FAISS_THROW_IF_NOT(is_trained); + int n0 = ntotal; + storage->add(n, x); + ntotal = storage->ntotal; + + hnsw_add_vertices(*this, n0, n, x, verbose, + hnsw.levels.size() == ntotal); +} + +void IndexBinaryHNSW::reset() +{ + hnsw.reset(); + storage->reset(); + ntotal = 0; +} + +void IndexBinaryHNSW::reconstruct(idx_t key, uint8_t *recons) const +{ + storage->reconstruct(key, recons); +} + + +namespace { + + +template +struct FlatHammingDis : DistanceComputer { + const int code_size; + const uint8_t *b; + size_t ndis; + HammingComputer hc; + + float operator () (idx_t i) override { + ndis++; + return hc.hamming(b + i * code_size); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return HammingComputerDefault(b + j * code_size, code_size) + .hamming(b + i * code_size); + } + + + explicit FlatHammingDis(const IndexBinaryFlat& storage) + : code_size(storage.code_size), + b(storage.xb.data()), + ndis(0), + hc() {} + + // NOTE: Pointers are cast from float in order to reuse the floating-point + // DistanceComputer. 
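+  // The "query" passed in below is really a binary code of code_size bytes;
+  // the float* signature only exists so this struct can plug into the generic
+  // DistanceComputer interface.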
+ void set_query(const float *x) override { + hc.set((uint8_t *)x, code_size); + } + + ~FlatHammingDis() override { +#pragma omp critical + { + hnsw_stats.ndis += ndis; + } + } +}; + + +} // namespace + + +DistanceComputer *IndexBinaryHNSW::get_distance_computer() const { + IndexBinaryFlat *flat_storage = dynamic_cast(storage); + + FAISS_ASSERT(flat_storage != nullptr); + + switch(code_size) { + case 4: + return new FlatHammingDis(*flat_storage); + case 8: + return new FlatHammingDis(*flat_storage); + case 16: + return new FlatHammingDis(*flat_storage); + case 20: + return new FlatHammingDis(*flat_storage); + case 32: + return new FlatHammingDis(*flat_storage); + case 64: + return new FlatHammingDis(*flat_storage); + default: + if (code_size % 8 == 0) { + return new FlatHammingDis(*flat_storage); + } else if (code_size % 4 == 0) { + return new FlatHammingDis(*flat_storage); + } + } + + return new FlatHammingDis(*flat_storage); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h new file mode 100644 index 0000000000..a6def6655c --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h @@ -0,0 +1,56 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include +#include +#include + + +namespace faiss { + + +/** The HNSW index is a normal random-access index with a HNSW + * link structure built on top */ + +struct IndexBinaryHNSW : IndexBinary { + typedef HNSW::storage_idx_t storage_idx_t; + + // the link strcuture + HNSW hnsw; + + // the sequential storage + bool own_fields; + IndexBinary *storage; + + explicit IndexBinaryHNSW(); + explicit IndexBinaryHNSW(int d, int M = 32); + explicit IndexBinaryHNSW(IndexBinary *storage, int M = 32); + + ~IndexBinaryHNSW() override; + + DistanceComputer *get_distance_computer() const; + + void add(idx_t n, const uint8_t *x) override; + + /// Trains the storage if needed + void train(idx_t n, const uint8_t* x) override; + + /// entry point for search + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void reconstruct(idx_t key, uint8_t* recons) const override; + + void reset() override; +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp b/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp new file mode 100644 index 0000000000..c9c1c84070 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp @@ -0,0 +1,671 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. 
All Rights Reserved +// -*- c++ -*- + +#include + +#include +#include + +#include +#include + +#include +#include +#include + + +namespace faiss { + +IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist) + : IndexBinary(d), + invlists(new ArrayInvertedLists(nlist, code_size)), + own_invlists(true), + nprobe(1), + max_codes(0), + maintain_direct_map(false), + quantizer(quantizer), + nlist(nlist), + own_fields(false), + clustering_index(nullptr) +{ + FAISS_THROW_IF_NOT (d == quantizer->d); + is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); + + cp.niter = 10; +} + +IndexBinaryIVF::IndexBinaryIVF() + : invlists(nullptr), + own_invlists(false), + nprobe(1), + max_codes(0), + maintain_direct_map(false), + quantizer(nullptr), + nlist(0), + own_fields(false), + clustering_index(nullptr) +{} + +void IndexBinaryIVF::add(idx_t n, const uint8_t *x) { + add_with_ids(n, x, nullptr); +} + +void IndexBinaryIVF::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) { + add_core(n, x, xids, nullptr); +} + +void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids, + const idx_t *precomputed_idx) { + FAISS_THROW_IF_NOT(is_trained); + assert(invlists); + FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids), + "cannot have direct map and add with ids"); + + const idx_t * idx; + + std::unique_ptr scoped_idx; + + if (precomputed_idx) { + idx = precomputed_idx; + } else { + scoped_idx.reset(new idx_t[n]); + quantizer->assign(n, x, scoped_idx.get()); + idx = scoped_idx.get(); + } + + long n_add = 0; + for (size_t i = 0; i < n; i++) { + idx_t id = xids ? xids[i] : ntotal + i; + idx_t list_no = idx[i]; + + if (list_no < 0) + continue; + const uint8_t *xi = x + i * code_size; + size_t offset = invlists->add_entry(list_no, id, xi); + + if (maintain_direct_map) + direct_map.push_back(list_no << 32 | offset); + n_add++; + } + if (verbose) { + printf("IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n", + n_add, n); + } + ntotal += n_add; +} + +void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) { + // nothing to do + if (new_maintain_direct_map == maintain_direct_map) + return; + + if (new_maintain_direct_map) { + direct_map.resize(ntotal, -1); + for (size_t key = 0; key < nlist; key++) { + size_t list_size = invlists->list_size(key); + const idx_t *idlist = invlists->get_ids(key); + + for (size_t ofs = 0; ofs < list_size; ofs++) { + FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal, + "direct map supported only for seuquential ids"); + direct_map[idlist[ofs]] = key << 32 | ofs; + } + } + } else { + direct_map.clear(); + } + maintain_direct_map = new_maintain_direct_map; +} + +void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const { + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new int32_t[n * nprobe]); + + double t0 = getmillisecs(); + quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists(idx.get(), n * nprobe); + + search_preassigned(n, x, k, idx.get(), coarse_dis.get(), + distances, labels, false); + indexIVF_stats.search_time += getmillisecs() - t0; +} + +void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const { + FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal, + "direct map is not initialized"); + idx_t list_no = direct_map[key] >> 32; + idx_t offset = direct_map[key] & 0xffffffff; + 
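+    // list_no / offset were unpacked from the 64-bit direct_map entry:
+    // upper 32 bits = inverted-list number, lower 32 bits = offset within
+    // that list (the same packing written by make_direct_map and store_pairs).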
reconstruct_from_offset(list_no, offset, recons); +} + +void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const { + FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal)); + + for (idx_t list_no = 0; list_no < nlist; list_no++) { + size_t list_size = invlists->list_size(list_no); + const Index::idx_t *idlist = invlists->get_ids(list_no); + + for (idx_t offset = 0; offset < list_size; offset++) { + idx_t id = idlist[offset]; + if (!(id >= i0 && id < i0 + ni)) { + continue; + } + + uint8_t *reconstructed = recons + (id - i0) * d; + reconstruct_from_offset(list_no, offset, reconstructed); + } + } +} + +void IndexBinaryIVF::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + uint8_t *recons) const { + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new int32_t[n * nprobe]); + + quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + + invlists->prefetch_lists(idx.get(), n * nprobe); + + // search_preassigned() with `store_pairs` enabled to obtain the list_no + // and offset into `codes` for reconstruction + search_preassigned(n, x, k, idx.get(), coarse_dis.get(), + distances, labels, /* store_pairs */true); + for (idx_t i = 0; i < n; ++i) { + for (idx_t j = 0; j < k; ++j) { + idx_t ij = i * k + j; + idx_t key = labels[ij]; + uint8_t *reconstructed = recons + ij * d; + if (key < 0) { + // Fill with NaNs + memset(reconstructed, -1, sizeof(*reconstructed) * d); + } else { + int list_no = key >> 32; + int offset = key & 0xffffffff; + + // Update label to the actual id + labels[ij] = invlists->get_single_id(list_no, offset); + + reconstruct_from_offset(list_no, offset, reconstructed); + } + } + } +} + +void IndexBinaryIVF::reconstruct_from_offset(idx_t list_no, idx_t offset, + uint8_t *recons) const { + memcpy(recons, invlists->get_single_code(list_no, offset), code_size); +} + +void IndexBinaryIVF::reset() { + direct_map.clear(); + invlists->reset(); + ntotal = 0; +} + +size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) { + FAISS_THROW_IF_NOT_MSG(!maintain_direct_map, + "direct map remove not implemented"); + + std::vector toremove(nlist); + +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + idx_t l0 = invlists->list_size (i), l = l0, j = 0; + const idx_t *idsi = invlists->get_ids(i); + while (j < l) { + if (sel.is_member(idsi[j])) { + l--; + invlists->update_entry( + i, j, + invlists->get_single_id(i, l), + invlists->get_single_code(i, l)); + } else { + j++; + } + } + toremove[i] = l0 - l; + } + // this will not run well in parallel on ondisk because of possible shrinks + size_t nremove = 0; + for (idx_t i = 0; i < nlist; i++) { + if (toremove[i] > 0) { + nremove += toremove[i]; + invlists->resize( + i, invlists->list_size(i) - toremove[i]); + } + } + ntotal -= nremove; + return nremove; +} + +void IndexBinaryIVF::train(idx_t n, const uint8_t *x) { + if (verbose) { + printf("Training quantizer\n"); + } + + if (quantizer->is_trained && (quantizer->ntotal == nlist)) { + if (verbose) { + printf("IVF quantizer does not need training.\n"); + } + } else { + if (verbose) { + printf("Training quantizer on %ld vectors in %dD\n", n, d); + } + + Clustering clus(d, nlist, cp); + quantizer->reset(); + + std::unique_ptr x_f(new float[n * d]); + binary_to_real(n * d, x, x_f.get()); + + IndexFlatL2 index_tmp(d); + + if (clustering_index && verbose) { + printf("using clustering_index of dimension %d to do the clustering\n", + clustering_index->d); + } + + clus.train(n, x_f.get(), 
clustering_index ? *clustering_index : index_tmp); + + std::unique_ptr x_b(new uint8_t[clus.k * code_size]); + real_to_binary(d * clus.k, clus.centroids.data(), x_b.get()); + + quantizer->add(clus.k, x_b.get()); + quantizer->is_trained = true; + } + + is_trained = true; +} + +void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) { + // minimal sanity checks + FAISS_THROW_IF_NOT(other.d == d); + FAISS_THROW_IF_NOT(other.nlist == nlist); + FAISS_THROW_IF_NOT(other.code_size == code_size); + FAISS_THROW_IF_NOT_MSG((!maintain_direct_map && + !other.maintain_direct_map), + "direct map copy not implemented"); + FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other), + "can only merge indexes of the same type"); + + invlists->merge_from (other.invlists, add_id); + + ntotal += other.ntotal; + other.ntotal = 0; +} + +void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) { + FAISS_THROW_IF_NOT(il->nlist == nlist && + il->code_size == code_size); + if (own_invlists) { + delete invlists; + } + invlists = il; + own_invlists = own; +} + + +namespace { + +using idx_t = Index::idx_t; + + +template +struct IVFBinaryScannerL2: BinaryInvertedListScanner { + + HammingComputer hc; + size_t code_size; + + IVFBinaryScannerL2 (size_t code_size): code_size (code_size) + {} + + void set_query (const uint8_t *query_vector) override { + hc.set (query_vector, code_size); + } + + idx_t list_no; + void set_list (idx_t list_no, uint8_t /* coarse_dis */) override { + this->list_no = list_no; + } + + uint32_t distance_to_code (const uint8_t *code) const override { + return hc.hamming (code); + } + + size_t scan_codes (size_t n, + const uint8_t *codes, + const idx_t *ids, + int32_t *simi, idx_t *idxi, + size_t k) const override + { + using C = CMax; + + size_t nup = 0; + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming (codes); + if (dis < simi[0]) { + heap_pop (k, simi, idxi); + idx_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + heap_push (k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + +}; + + +template +BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) { + + switch (code_size) { +#define HANDLE_CS(cs) \ + case cs: \ + return new IVFBinaryScannerL2 (cs); + HANDLE_CS(4); + HANDLE_CS(8); + HANDLE_CS(16); + HANDLE_CS(20); + HANDLE_CS(32); + HANDLE_CS(64); +#undef HANDLE_CS + default: + if (code_size % 8 == 0) { + return new IVFBinaryScannerL2 (code_size); + } else if (code_size % 4 == 0) { + return new IVFBinaryScannerL2 (code_size); + } else { + return new IVFBinaryScannerL2 (code_size); + } + } +} + + +void search_knn_hamming_heap(const IndexBinaryIVF& ivf, + size_t n, + const uint8_t *x, + idx_t k, + const idx_t *keys, + const int32_t * coarse_dis, + int32_t *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params) +{ + long nprobe = params ? params->nprobe : ivf.nprobe; + long max_codes = params ? 
params->max_codes : ivf.max_codes; + MetricType metric_type = ivf.metric_type; + + // almost verbatim copy from IndexIVF::search_preassigned + + size_t nlistv = 0, ndis = 0, nheap = 0; + using HeapForIP = CMin; + using HeapForL2 = CMax; + +#pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap) + { + std::unique_ptr scanner + (ivf.get_InvertedListScanner (store_pairs)); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + const uint8_t *xi = x + i * ivf.code_size; + scanner->set_query(xi); + + const idx_t * keysi = keys + i * nprobe; + int32_t * simi = distances + k * i; + idx_t * idxi = labels + k * i; + + if (metric_type == METRIC_INNER_PRODUCT) { + heap_heapify (k, simi, idxi); + } else { + heap_heapify (k, simi, idxi); + } + + size_t nscan = 0; + + for (size_t ik = 0; ik < nprobe; ik++) { + idx_t key = keysi[ik]; /* select the list */ + if (key < 0) { + // not enough centroids for multiprobe + continue; + } + FAISS_THROW_IF_NOT_FMT + (key < (idx_t) ivf.nlist, + "Invalid key=%ld at ik=%ld nlist=%ld\n", + key, ik, ivf.nlist); + + scanner->set_list (key, coarse_dis[i * nprobe + ik]); + + nlistv++; + + size_t list_size = ivf.invlists->list_size(key); + InvertedLists::ScopedCodes scodes (ivf.invlists, key); + std::unique_ptr sids; + const Index::idx_t * ids = nullptr; + + if (!store_pairs) { + sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key)); + ids = sids->get(); + } + + nheap += scanner->scan_codes (list_size, scodes.get(), + ids, simi, idxi, k); + + nscan += list_size; + if (max_codes && nscan >= max_codes) + break; + } + + ndis += nscan; + if (metric_type == METRIC_INNER_PRODUCT) { + heap_reorder (k, simi, idxi); + } else { + heap_reorder (k, simi, idxi); + } + + } // parallel for + } // parallel + + indexIVF_stats.nq += n; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nheap_updates += nheap; + +} + +template +void search_knn_hamming_count(const IndexBinaryIVF& ivf, + size_t nx, + const uint8_t *x, + const idx_t *keys, + int k, + int32_t *distances, + idx_t *labels, + const IVFSearchParameters *params) { + const int nBuckets = ivf.d + 1; + std::vector all_counters(nx * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); + + long nprobe = params ? params->nprobe : ivf.nprobe; + long max_codes = params ? params->max_codes : ivf.max_codes; + + std::vector> cs; + for (size_t i = 0; i < nx; ++i) { + cs.push_back(HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + x + i * ivf.code_size, + ivf.d, + k + )); + } + + size_t nlistv = 0, ndis = 0; + +#pragma omp parallel for reduction(+: nlistv, ndis) + for (size_t i = 0; i < nx; i++) { + const idx_t * keysi = keys + i * nprobe; + HCounterState& csi = cs[i]; + + size_t nscan = 0; + + for (size_t ik = 0; ik < nprobe; ik++) { + idx_t key = keysi[ik]; /* select the list */ + if (key < 0) { + // not enough centroids for multiprobe + continue; + } + FAISS_THROW_IF_NOT_FMT ( + key < (idx_t) ivf.nlist, + "Invalid key=%ld at ik=%ld nlist=%ld\n", + key, ik, ivf.nlist); + + nlistv++; + size_t list_size = ivf.invlists->list_size(key); + InvertedLists::ScopedCodes scodes (ivf.invlists, key); + const uint8_t *list_vecs = scodes.get(); + const Index::idx_t *ids = store_pairs + ? nullptr + : ivf.invlists->get_ids(key); + + for (size_t j = 0; j < list_size; j++) { + const uint8_t * yj = list_vecs + ivf.code_size * j; + + idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; + csi.update_counter(yj, id); + } + if (ids) + ivf.invlists->release_ids (key, ids); + + nscan += list_size; + if (max_codes && nscan >= max_codes) + break; + } + ndis += nscan; + + int nres = 0; + for (int b = 0; b < nBuckets && nres < k; b++) { + for (int l = 0; l < csi.counters[b] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b * k + l]; + distances[i * k + nres] = b; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } + + indexIVF_stats.nq += nx; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; +} + + + +template +void search_knn_hamming_count_1 ( + const IndexBinaryIVF& ivf, + size_t nx, + const uint8_t *x, + const idx_t *keys, + int k, + int32_t *distances, + idx_t *labels, + const IVFSearchParameters *params) { + switch (ivf.code_size) { +#define HANDLE_CS(cs) \ + case cs: \ + search_knn_hamming_count( \ + ivf, nx, x, keys, k, distances, labels, params); \ + break; + HANDLE_CS(4); + HANDLE_CS(8); + HANDLE_CS(16); + HANDLE_CS(20); + HANDLE_CS(32); + HANDLE_CS(64); +#undef HANDLE_CS + default: + if (ivf.code_size % 8 == 0) { + search_knn_hamming_count + (ivf, nx, x, keys, k, distances, labels, params); + } else if (ivf.code_size % 4 == 0) { + search_knn_hamming_count + (ivf, nx, x, keys, k, distances, labels, params); + } else { + search_knn_hamming_count + (ivf, nx, x, keys, k, distances, labels, params); + } + break; + } + +} + +} // namespace + +BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner + (bool store_pairs) const +{ + if (store_pairs) { + return select_IVFBinaryScannerL2 (code_size); + } else { + return select_IVFBinaryScannerL2 (code_size); + } +} + +void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, + const idx_t *idx, + const int32_t * coarse_dis, + int32_t *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params + ) const { + + if (use_heap) { + search_knn_hamming_heap (*this, n, x, k, idx, coarse_dis, + distances, labels, store_pairs, + params); + } else { + if (store_pairs) { + search_knn_hamming_count_1 + (*this, n, x, idx, k, distances, labels, params); + } else { + search_knn_hamming_count_1 + (*this, n, x, idx, k, distances, labels, params); + } + } +} + +IndexBinaryIVF::~IndexBinaryIVF() { + if (own_invlists) { + delete invlists; + } + + if (own_fields) { + delete quantizer; + } +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryIVF.h b/core/src/index/thirdparty/faiss/IndexBinaryIVF.h new file mode 100644 index 0000000000..bf16a5b1a2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.h @@ -0,0 +1,211 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_BINARY_IVF_H +#define FAISS_INDEX_BINARY_IVF_H + + +#include + +#include +#include +#include +#include + + +namespace faiss { + +struct BinaryInvertedListScanner; + +/** Index based on a inverted file (IVF) + * + * In the inverted file, the quantizer (an IndexBinary instance) provides a + * quantization index for each vector to be added. The quantization + * index maps to a list (aka inverted list or posting list), where the + * id of the vector is stored. 
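+ * For example, with nlist = 1024 the quantizer assigns every added vector to
+ * one of 1024 inverted lists, and a query scans only the nprobe lists whose
+ * centroids are closest to it.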
+ * + * Otherwise the object is similar to the IndexIVF + */ +struct IndexBinaryIVF : IndexBinary { + /// Acess to the actual data + InvertedLists *invlists; + bool own_invlists; + + size_t nprobe; ///< number of probes at query time + size_t max_codes; ///< max nb of codes to visit to do a query + + /** Select between using a heap or counting to select the k smallest values + * when scanning inverted lists. + */ + bool use_heap = true; + + /// map for direct access to the elements. Enables reconstruct(). + bool maintain_direct_map; + std::vector direct_map; + + IndexBinary *quantizer; ///< quantizer that maps vectors to inverted lists + size_t nlist; ///< number of possible key values + + bool own_fields; ///< whether object owns the quantizer + + ClusteringParameters cp; ///< to override default clustering params + Index *clustering_index; ///< to override index used during clustering + + /** The Inverted file takes a quantizer (an IndexBinary) on input, + * which implements the function mapping a vector to a list + * identifier. The pointer is borrowed: the quantizer should not + * be deleted while the IndexBinaryIVF is in use. + */ + IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist); + + IndexBinaryIVF(); + + ~IndexBinaryIVF() override; + + void reset() override; + + /// Trains the quantizer + void train(idx_t n, const uint8_t *x) override; + + void add(idx_t n, const uint8_t *x) override; + + void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override; + + /// same as add_with_ids, with precomputed coarse quantizer + void add_core (idx_t n, const uint8_t * x, const idx_t *xids, + const idx_t *precomputed_idx); + + /** Search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the corresponding heaps with the query + * results. search() calls this. + * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param distance + * output distances, size n * k + * @param labels output labels, size n * k + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + * @param params used to override the object's search parameters + */ + void search_preassigned(idx_t n, const uint8_t *x, idx_t k, + const idx_t *assign, + const int32_t *centroid_dis, + int32_t *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params=nullptr + ) const; + + virtual BinaryInvertedListScanner *get_InvertedListScanner ( + bool store_pairs=false) const; + + /** assign the vectors, then call search_preassign */ + virtual void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void reconstruct(idx_t key, uint8_t *recons) const override; + + /** Reconstruct a subset of the indexed vectors. + * + * Overrides default implementation to bypass reconstruct() which requires + * direct_map to be maintained. + * + * @param i0 first vector to reconstruct + * @param ni nb of vectors to reconstruct + * @param recons output array of reconstructed vectors, size ni * d / 8 + */ + void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const override; + + /** Similar to search, but also reconstructs the stored vectors (or an + * approximation in the case of lossy coding) for the search results. 
+ * + * Overrides default implementation to avoid having to maintain direct_map + * and instead fetch the code offsets through the `store_pairs` flag in + * search_preassigned(). + * + * @param recons reconstructed vectors size (n, k, d / 8) + */ + void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + uint8_t *recons) const override; + + /** Reconstruct a vector given the location in terms of (inv list index + + * inv list offset) instead of the id. + * + * Useful for reconstructing when the direct_map is not maintained and + * the inv list offset is computed by search_preassigned() with + * `store_pairs` set. + */ + virtual void reconstruct_from_offset(idx_t list_no, idx_t offset, + uint8_t* recons) const; + + + /// Dataset manipulation functions + size_t remove_ids(const IDSelector& sel) override; + + /** moves the entries from another dataset to self. On output, + * other is empty. add_id is added to all moved ids (for + * sequential ids, this would be this->ntotal */ + virtual void merge_from(IndexBinaryIVF& other, idx_t add_id); + + size_t get_list_size(size_t list_no) const + { return invlists->list_size(list_no); } + + /** intialize a direct map + * + * @param new_maintain_direct_map if true, create a direct map, + * else clear it + */ + void make_direct_map(bool new_maintain_direct_map=true); + + void replace_invlists(InvertedLists *il, bool own=false); +}; + + +struct BinaryInvertedListScanner { + + using idx_t = Index::idx_t; + + /// from now on we handle this query. + virtual void set_query (const uint8_t *query_vector) = 0; + + /// following codes come from this inverted list + virtual void set_list (idx_t list_no, uint8_t coarse_dis) = 0; + + /// compute a single query-to-code distance + virtual uint32_t distance_to_code (const uint8_t *code) const = 0; + + /** compute the distances to codes. (distances, labels) should be + * organized as a min- or max-heap + * + * @param n number of codes to scan + * @param codes codes to scan (n * code_size) + * @param ids corresponding ids (ignored if store_pairs) + * @param distances heap distances (size k) + * @param labels heap labels (size k) + * @param k heap size + */ + virtual size_t scan_codes (size_t n, + const uint8_t *codes, + const idx_t *ids, + int32_t *distances, idx_t *labels, + size_t k) const = 0; + + virtual ~BinaryInvertedListScanner () {} + +}; + + +} // namespace faiss + +#endif // FAISS_INDEX_BINARY_IVF_H diff --git a/core/src/index/thirdparty/faiss/IndexFlat.cpp b/core/src/index/thirdparty/faiss/IndexFlat.cpp new file mode 100644 index 0000000000..5b94416628 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexFlat.cpp @@ -0,0 +1,508 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
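To make the calling sequence of the IndexBinaryIVF declared above concrete, a minimal sketch (illustrative only, not part of the diff; nlist, nprobe, and the data are arbitrary assumptions):

#include <cstdint>
#include <vector>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>

int main() {
    const int d = 256;                              // bits per vector
    const size_t nlist = 64;                        // number of inverted lists

    faiss::IndexBinaryFlat quantizer(d);            // coarse quantizer (pointer is borrowed)
    faiss::IndexBinaryIVF index(&quantizer, d, nlist);

    std::vector<uint8_t> xb(10000 * d / 8);
    // ... fill xb with binary codes ...
    index.train(10000, xb.data());                  // clusters the real-valued relaxation of the codes
    index.add(10000, xb.data());

    index.nprobe = 8;                               // scan 8 lists per query
    const int k = 10;
    std::vector<int32_t> dis(k);
    std::vector<faiss::IndexBinary::idx_t> ids(k);
    index.search(1, xb.data(), k, dis.data(), ids.data());
    return 0;
}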
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include +#include +#include +#include + + +namespace faiss { + +IndexFlat::IndexFlat (idx_t d, MetricType metric): + Index(d, metric) +{ +} + + + +void IndexFlat::add (idx_t n, const float *x) { + xb.insert(xb.end(), x, x + n * d); + ntotal += n; +} + + +void IndexFlat::reset() { + xb.clear(); + ntotal = 0; +} + + +void IndexFlat::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + // we see the distances and labels as heaps + + if (metric_type == METRIC_INNER_PRODUCT) { + float_minheap_array_t res = { + size_t(n), size_t(k), labels, distances}; + knn_inner_product (x, xb.data(), d, n, ntotal, &res); + } else if (metric_type == METRIC_L2) { + float_maxheap_array_t res = { + size_t(n), size_t(k), labels, distances}; + knn_L2sqr (x, xb.data(), d, n, ntotal, &res); + } else { + float_maxheap_array_t res = { + size_t(n), size_t(k), labels, distances}; + knn_extra_metrics (x, xb.data(), d, n, ntotal, + metric_type, metric_arg, + &res); + } +} + +void IndexFlat::range_search (idx_t n, const float *x, float radius, + RangeSearchResult *result) const +{ + switch (metric_type) { + case METRIC_INNER_PRODUCT: + range_search_inner_product (x, xb.data(), d, n, ntotal, + radius, result); + break; + case METRIC_L2: + range_search_L2sqr (x, xb.data(), d, n, ntotal, radius, result); + break; + default: + FAISS_THROW_MSG("metric type not supported"); + } +} + + +void IndexFlat::compute_distance_subset ( + idx_t n, + const float *x, + idx_t k, + float *distances, + const idx_t *labels) const +{ + switch (metric_type) { + case METRIC_INNER_PRODUCT: + fvec_inner_products_by_idx ( + distances, + x, xb.data(), labels, d, n, k); + break; + case METRIC_L2: + fvec_L2sqr_by_idx ( + distances, + x, xb.data(), labels, d, n, k); + break; + default: + FAISS_THROW_MSG("metric type not supported"); + } + +} + +size_t IndexFlat::remove_ids (const IDSelector & sel) +{ + idx_t j = 0; + for (idx_t i = 0; i < ntotal; i++) { + if (sel.is_member (i)) { + // should be removed + } else { + if (i > j) { + memmove (&xb[d * j], &xb[d * i], sizeof(xb[0]) * d); + } + j++; + } + } + size_t nremove = ntotal - j; + if (nremove > 0) { + ntotal = j; + xb.resize (ntotal * d); + } + return nremove; +} + + +namespace { + + +struct FlatL2Dis : DistanceComputer { + size_t d; + Index::idx_t nb; + const float *q; + const float *b; + size_t ndis; + + float operator () (idx_t i) override { + ndis++; + return fvec_L2sqr(q, b + i * d, d); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return fvec_L2sqr(b + j * d, b + i * d, d); + } + + explicit FlatL2Dis(const IndexFlat& storage, const float *q = nullptr) + : d(storage.d), + nb(storage.ntotal), + q(q), + b(storage.xb.data()), + ndis(0) {} + + void set_query(const float *x) override { + q = x; + } +}; + +struct FlatIPDis : DistanceComputer { + size_t d; + Index::idx_t nb; + const float *q; + const float *b; + size_t ndis; + + float operator () (idx_t i) override { + ndis++; + return fvec_inner_product (q, b + i * d, d); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return fvec_inner_product (b + j * d, b + i * d, d); + } + + explicit FlatIPDis(const IndexFlat& storage, const float *q = nullptr) + : d(storage.d), + nb(storage.ntotal), + q(q), + b(storage.xb.data()), + ndis(0) {} + + void set_query(const float *x) override { + q = x; + } +}; + + + + +} // namespace + + +DistanceComputer * IndexFlat::get_distance_computer() const { + if (metric_type == METRIC_L2) { + return new 
FlatL2Dis(*this); + } else if (metric_type == METRIC_INNER_PRODUCT) { + return new FlatIPDis(*this); + } else { + return get_extra_distance_computer (d, metric_type, metric_arg, + ntotal, xb.data()); + } +} + + +void IndexFlat::reconstruct (idx_t key, float * recons) const +{ + memcpy (recons, &(xb[key * d]), sizeof(*recons) * d); +} + + +/* The standalone codec interface */ +size_t IndexFlat::sa_code_size () const +{ + return sizeof(float) * d; +} + +void IndexFlat::sa_encode (idx_t n, const float *x, uint8_t *bytes) const +{ + memcpy (bytes, x, sizeof(float) * d * n); +} + +void IndexFlat::sa_decode (idx_t n, const uint8_t *bytes, float *x) const +{ + memcpy (x, bytes, sizeof(float) * d * n); +} + + + + +/*************************************************** + * IndexFlatL2BaseShift + ***************************************************/ + +IndexFlatL2BaseShift::IndexFlatL2BaseShift (idx_t d, size_t nshift, const float *shift): + IndexFlatL2 (d), shift (nshift) +{ + memcpy (this->shift.data(), shift, sizeof(float) * nshift); +} + +void IndexFlatL2BaseShift::search ( + idx_t n, + const float *x, + idx_t k, + float *distances, + idx_t *labels) const +{ + FAISS_THROW_IF_NOT (shift.size() == ntotal); + + float_maxheap_array_t res = { + size_t(n), size_t(k), labels, distances}; + knn_L2sqr_base_shift (x, xb.data(), d, n, ntotal, &res, shift.data()); +} + + + +/*************************************************** + * IndexRefineFlat + ***************************************************/ + +IndexRefineFlat::IndexRefineFlat (Index *base_index): + Index (base_index->d, base_index->metric_type), + refine_index (base_index->d, base_index->metric_type), + base_index (base_index), own_fields (false), + k_factor (1) +{ + is_trained = base_index->is_trained; + FAISS_THROW_IF_NOT_MSG (base_index->ntotal == 0, + "base_index should be empty in the beginning"); +} + +IndexRefineFlat::IndexRefineFlat () { + base_index = nullptr; + own_fields = false; + k_factor = 1; +} + + +void IndexRefineFlat::train (idx_t n, const float *x) +{ + base_index->train (n, x); + is_trained = true; +} + +void IndexRefineFlat::add (idx_t n, const float *x) { + FAISS_THROW_IF_NOT (is_trained); + base_index->add (n, x); + refine_index.add (n, x); + ntotal = refine_index.ntotal; +} + +void IndexRefineFlat::reset () +{ + base_index->reset (); + refine_index.reset (); + ntotal = 0; +} + +namespace { +typedef faiss::Index::idx_t idx_t; + +template +static void reorder_2_heaps ( + idx_t n, + idx_t k, idx_t *labels, float *distances, + idx_t k_base, const idx_t *base_labels, const float *base_distances) +{ +#pragma omp parallel for + for (idx_t i = 0; i < n; i++) { + idx_t *idxo = labels + i * k; + float *diso = distances + i * k; + const idx_t *idxi = base_labels + i * k_base; + const float *disi = base_distances + i * k_base; + + heap_heapify (k, diso, idxo, disi, idxi, k); + if (k_base != k) { // add remaining elements + heap_addn (k, diso, idxo, disi + k, idxi + k, k_base - k); + } + heap_reorder (k, diso, idxo); + } +} + + +} + + +void IndexRefineFlat::search ( + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + FAISS_THROW_IF_NOT (is_trained); + idx_t k_base = idx_t (k * k_factor); + idx_t * base_labels = labels; + float * base_distances = distances; + ScopeDeleter del1; + ScopeDeleter del2; + + + if (k != k_base) { + base_labels = new idx_t [n * k_base]; + del1.set (base_labels); + base_distances = new float [n * k_base]; + del2.set (base_distances); + } + + base_index->search (n, x, k_base, 
base_distances, base_labels); + + for (int i = 0; i < n * k_base; i++) + assert (base_labels[i] >= -1 && + base_labels[i] < ntotal); + + // compute refined distances + refine_index.compute_distance_subset ( + n, x, k_base, base_distances, base_labels); + + // sort and store result + if (metric_type == METRIC_L2) { + typedef CMax C; + reorder_2_heaps ( + n, k, labels, distances, + k_base, base_labels, base_distances); + + } else if (metric_type == METRIC_INNER_PRODUCT) { + typedef CMin C; + reorder_2_heaps ( + n, k, labels, distances, + k_base, base_labels, base_distances); + } else { + FAISS_THROW_MSG("Metric type not supported"); + } + +} + + + +IndexRefineFlat::~IndexRefineFlat () +{ + if (own_fields) delete base_index; +} + +/*************************************************** + * IndexFlat1D + ***************************************************/ + + +IndexFlat1D::IndexFlat1D (bool continuous_update): + IndexFlatL2 (1), + continuous_update (continuous_update) +{ +} + +/// if not continuous_update, call this between the last add and +/// the first search +void IndexFlat1D::update_permutation () +{ + perm.resize (ntotal); + if (ntotal < 1000000) { + fvec_argsort (ntotal, xb.data(), (size_t*)perm.data()); + } else { + fvec_argsort_parallel (ntotal, xb.data(), (size_t*)perm.data()); + } +} + +void IndexFlat1D::add (idx_t n, const float *x) +{ + IndexFlatL2::add (n, x); + if (continuous_update) + update_permutation(); +} + +void IndexFlat1D::reset() +{ + IndexFlatL2::reset(); + perm.clear(); +} + +void IndexFlat1D::search ( + idx_t n, + const float *x, + idx_t k, + float *distances, + idx_t *labels) const +{ + FAISS_THROW_IF_NOT_MSG (perm.size() == ntotal, + "Call update_permutation before search"); + +#pragma omp parallel for + for (idx_t i = 0; i < n; i++) { + + float q = x[i]; // query + float *D = distances + i * k; + idx_t *I = labels + i * k; + + // binary search + idx_t i0 = 0, i1 = ntotal; + idx_t wp = 0; + + if (xb[perm[i0]] > q) { + i1 = 0; + goto finish_right; + } + + if (xb[perm[i1 - 1]] <= q) { + i0 = i1 - 1; + goto finish_left; + } + + while (i0 + 1 < i1) { + idx_t imed = (i0 + i1) / 2; + if (xb[perm[imed]] <= q) i0 = imed; + else i1 = imed; + } + + // query is between xb[perm[i0]] and xb[perm[i1]] + // expand to nearest neighs + + while (wp < k) { + float xleft = xb[perm[i0]]; + float xright = xb[perm[i1]]; + + if (q - xleft < xright - q) { + D[wp] = q - xleft; + I[wp] = perm[i0]; + i0--; wp++; + if (i0 < 0) { goto finish_right; } + } else { + D[wp] = xright - q; + I[wp] = perm[i1]; + i1++; wp++; + if (i1 >= ntotal) { goto finish_left; } + } + } + goto done; + + finish_right: + // grow to the right from i1 + while (wp < k) { + if (i1 < ntotal) { + D[wp] = xb[perm[i1]] - q; + I[wp] = perm[i1]; + i1++; + } else { + D[wp] = std::numeric_limits::infinity(); + I[wp] = -1; + } + wp++; + } + goto done; + + finish_left: + // grow to the left from i0 + while (wp < k) { + if (i0 >= 0) { + D[wp] = q - xb[perm[i0]]; + I[wp] = perm[i0]; + i0--; + } else { + D[wp] = std::numeric_limits::infinity(); + I[wp] = -1; + } + wp++; + } + done: ; + } + +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexFlat.h b/core/src/index/thirdparty/faiss/IndexFlat.h new file mode 100644 index 0000000000..7b13451211 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexFlat.h @@ -0,0 +1,175 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
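A small sketch of the refinement pattern implemented above (illustrative only, not part of the diff; the base index and k_factor are assumptions, and in practice the base index would be an approximate one rather than another flat index). The base index returns k * k_factor candidates, which compute_distance_subset() then re-ranks exactly against the stored full vectors:

#include <vector>
#include <faiss/IndexFlat.h>

int main() {
    const int d = 64;
    faiss::IndexFlatL2 base(d);                     // stand-in for a faster, approximate base index
    faiss::IndexRefineFlat index(&base);            // keeps full vectors for exact re-ranking
    index.k_factor = 4;                             // fetch 4 * k candidates from the base index

    std::vector<float> xb(1000 * d, 0.0f);          // 1000 database vectors (zeros, for illustration)
    index.train(1000, xb.data());
    index.add(1000, xb.data());                     // added to both the base and the refine storage

    const int k = 5;
    std::vector<float> dis(k);
    std::vector<faiss::Index::idx_t> ids(k);
    index.search(1, xb.data(), k, dis.data(), ids.data());
    return 0;
}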
+ */ + +// -*- c++ -*- + +#ifndef INDEX_FLAT_H +#define INDEX_FLAT_H + +#include + +#include + + +namespace faiss { + +/** Index that stores the full vectors and performs exhaustive search */ +struct IndexFlat: Index { + /// database vectors, size ntotal * d + std::vector xb; + + explicit IndexFlat (idx_t d, MetricType metric = METRIC_L2); + + void add(idx_t n, const float* x) override; + + void reset() override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result) const override; + + void reconstruct(idx_t key, float* recons) const override; + + /** compute distance with a subset of vectors + * + * @param x query vectors, size n * d + * @param labels indices of the vectors that should be compared + * for each query vector, size n * k + * @param distances + * corresponding output distances, size n * k + */ + void compute_distance_subset ( + idx_t n, + const float *x, + idx_t k, + float *distances, + const idx_t *labels) const; + + /** remove some ids. NB that Because of the structure of the + * indexing structure, the semantics of this operation are + * different from the usual ones: the new ids are shifted */ + size_t remove_ids(const IDSelector& sel) override; + + IndexFlat () {} + + DistanceComputer * get_distance_computer() const override; + + /* The stanadlone codec interface (just memcopies in this case) */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + +}; + + + +struct IndexFlatIP:IndexFlat { + explicit IndexFlatIP (idx_t d): IndexFlat (d, METRIC_INNER_PRODUCT) {} + IndexFlatIP () {} +}; + + +struct IndexFlatL2:IndexFlat { + explicit IndexFlatL2 (idx_t d): IndexFlat (d, METRIC_L2) {} + IndexFlatL2 () {} +}; + + +// same as an IndexFlatL2 but a value is subtracted from each distance +struct IndexFlatL2BaseShift: IndexFlatL2 { + std::vector shift; + + IndexFlatL2BaseShift (idx_t d, size_t nshift, const float *shift); + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; +}; + + +/** Index that queries in a base_index (a fast one) and refines the + * results with an exact search, hopefully improving the results. + */ +struct IndexRefineFlat: Index { + + /// storage for full vectors + IndexFlat refine_index; + + /// faster index to pre-select the vectors that should be filtered + Index *base_index; + bool own_fields; ///< should the base index be deallocated? + + /// factor between k requested in search and the k requested from + /// the base_index (should be >= 1) + float k_factor; + + explicit IndexRefineFlat (Index *base_index); + + IndexRefineFlat (); + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void reset() override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + ~IndexRefineFlat() override; +}; + + +/// optimized version for 1D "vectors" +struct IndexFlat1D:IndexFlatL2 { + bool continuous_update; ///< is the permutation updated continuously? 
+ + std::vector perm; ///< sorted database indices + + explicit IndexFlat1D (bool continuous_update=true); + + /// if not continuous_update, call this between the last add and + /// the first search + void update_permutation (); + + void add(idx_t n, const float* x) override; + + void reset() override; + + /// Warn: the distances returned are L1 not L2 + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; +}; + + +} + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexHNSW.cpp b/core/src/index/thirdparty/faiss/IndexHNSW.cpp new file mode 100644 index 0000000000..b315477c5e --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexHNSW.cpp @@ -0,0 +1,1090 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifdef __SSE__ +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, FINTEGER * + ldb, float *beta, float *c, FINTEGER *ldc); + +} + +namespace faiss { + +using idx_t = Index::idx_t; +using MinimaxHeap = HNSW::MinimaxHeap; +using storage_idx_t = HNSW::storage_idx_t; +using NodeDistCloser = HNSW::NodeDistCloser; +using NodeDistFarther = HNSW::NodeDistFarther; + +HNSWStats hnsw_stats; + +/************************************************************** + * add / search blocks of descriptors + **************************************************************/ + +namespace { + + +void hnsw_add_vertices(IndexHNSW &index_hnsw, + size_t n0, + size_t n, const float *x, + bool verbose, + bool preset_levels = false) { + size_t d = index_hnsw.d; + HNSW & hnsw = index_hnsw.hnsw; + size_t ntotal = n0 + n; + double t0 = getmillisecs(); + if (verbose) { + printf("hnsw_add_vertices: adding %ld elements on top of %ld " + "(preset_levels=%d)\n", + n, n0, int(preset_levels)); + } + + if (n == 0) { + return; + } + + int max_level = hnsw.prepare_level_tab(n, preset_levels); + + if (verbose) { + printf(" max_level = %d\n", max_level); + } + + std::vector locks(ntotal); + for(int i = 0; i < ntotal; i++) + omp_init_lock(&locks[i]); + + // add vectors from highest to lowest level + std::vector hist; + std::vector order(n); + + { // make buckets with vectors of the same level + + // build histogram + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + while (pt_level >= hist.size()) + hist.push_back(0); + hist[pt_level] ++; + } + + // accumulate + std::vector offsets(hist.size() + 1, 0); + for (int i = 0; i < hist.size() - 1; i++) { + offsets[i + 1] = offsets[i] + hist[i]; + } + + // bucket sort + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + order[offsets[pt_level]++] = pt_id; + } + } + + idx_t check_period = InterruptCallback::get_period_hint + (max_level * index_hnsw.d * hnsw.efConstruction); + + { // perform add + RandomGenerator rng2(789); + + int i1 = n; + + for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + int i0 = i1 - 
hist[pt_level]; + + if (verbose) { + printf("Adding %d elements at level %d\n", + i1 - i0, pt_level); + } + + // random permutation to get rid of dataset order bias + for (int j = i0; j < i1; j++) + std::swap(order[j], order[j + rng2.rand_int(i1 - j)]); + + bool interrupt = false; + +#pragma omp parallel if(i1 > i0 + 100) + { + VisitedTable vt (ntotal); + + DistanceComputer *dis = + index_hnsw.storage->get_distance_computer(); + ScopeDeleter1 del(dis); + int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1; + size_t counter = 0; + +#pragma omp for schedule(dynamic) + for (int i = i0; i < i1; i++) { + storage_idx_t pt_id = order[i]; + dis->set_query (x + (pt_id - n0) * d); + + // cannot break + if (interrupt) { + continue; + } + + hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt); + + if (prev_display >= 0 && i - i0 > prev_display + 10000) { + prev_display = i - i0; + printf(" %d / %d\r", i - i0, i1 - i0); + fflush(stdout); + } + + if (counter % check_period == 0) { + if (InterruptCallback::is_interrupted ()) { + interrupt = true; + } + } + counter++; + } + + } + if (interrupt) { + FAISS_THROW_MSG ("computation interrupted"); + } + i1 = i0; + } + FAISS_ASSERT(i1 == 0); + } + if (verbose) { + printf("Done in %.3f ms\n", getmillisecs() - t0); + } + + for(int i = 0; i < ntotal; i++) { + omp_destroy_lock(&locks[i]); + } +} + + +} // namespace + + + + +/************************************************************** + * IndexHNSW implementation + **************************************************************/ + +IndexHNSW::IndexHNSW(int d, int M): + Index(d, METRIC_L2), + hnsw(M), + own_fields(false), + storage(nullptr), + reconstruct_from_neighbors(nullptr) +{} + +IndexHNSW::IndexHNSW(Index *storage, int M): + Index(storage->d, storage->metric_type), + hnsw(M), + own_fields(false), + storage(storage), + reconstruct_from_neighbors(nullptr) +{} + +IndexHNSW::~IndexHNSW() { + if (own_fields) { + delete storage; + } +} + +void IndexHNSW::train(idx_t n, const float* x) +{ + FAISS_THROW_IF_NOT_MSG(storage, + "Please use IndexHSNWFlat (or variants) instead of IndexHNSW directly"); + // hnsw structure does not require training + storage->train (n, x); + is_trained = true; +} + +void IndexHNSW::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const + +{ + FAISS_THROW_IF_NOT_MSG(storage, + "Please use IndexHSNWFlat (or variants) instead of IndexHNSW directly"); + size_t nreorder = 0; + + idx_t check_period = InterruptCallback::get_period_hint ( + hnsw.max_level * d * hnsw.efSearch); + + for (idx_t i0 = 0; i0 < n; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, n); + +#pragma omp parallel reduction(+ : nreorder) + { + VisitedTable vt (ntotal); + DistanceComputer *dis = storage->get_distance_computer(); + ScopeDeleter1 del(dis); + +#pragma omp for + for(idx_t i = i0; i < i1; i++) { + idx_t * idxi = labels + i * k; + float * simi = distances + i * k; + dis->set_query(x + i * d); + + maxheap_heapify (k, simi, idxi); + hnsw.search(*dis, k, idxi, simi, vt); + + maxheap_reorder (k, simi, idxi); + + if (reconstruct_from_neighbors && + reconstruct_from_neighbors->k_reorder != 0) { + int k_reorder = reconstruct_from_neighbors->k_reorder; + if (k_reorder == -1 || k_reorder > k) k_reorder = k; + + nreorder += reconstruct_from_neighbors->compute_distances( + k_reorder, idxi, x + i * d, simi); + + // sort top k_reorder + maxheap_heapify (k_reorder, simi, idxi, simi, idxi, k_reorder); + maxheap_reorder (k_reorder, simi, idxi); + } + + } + + } + 
InterruptCallback::check (); + } + hnsw_stats.nreorder += nreorder; +} + + +void IndexHNSW::add(idx_t n, const float *x) +{ + FAISS_THROW_IF_NOT_MSG(storage, + "Please use IndexHSNWFlat (or variants) instead of IndexHNSW directly"); + FAISS_THROW_IF_NOT(is_trained); + int n0 = ntotal; + storage->add(n, x); + ntotal = storage->ntotal; + + hnsw_add_vertices (*this, n0, n, x, verbose, + hnsw.levels.size() == ntotal); +} + +void IndexHNSW::reset() +{ + hnsw.reset(); + storage->reset(); + ntotal = 0; +} + +void IndexHNSW::reconstruct (idx_t key, float* recons) const +{ + storage->reconstruct(key, recons); +} + +void IndexHNSW::shrink_level_0_neighbors(int new_size) +{ +#pragma omp parallel + { + DistanceComputer *dis = storage->get_distance_computer(); + ScopeDeleter1 del(dis); + +#pragma omp for + for (idx_t i = 0; i < ntotal; i++) { + + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + std::priority_queue initial_list; + + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) break; + initial_list.emplace(dis->symmetric_dis(i, v1), v1); + + // initial_list.emplace(qdis(v1), v1); + } + + std::vector shrunk_list; + HNSW::shrink_neighbor_list(*dis, initial_list, + shrunk_list, new_size); + + for (size_t j = begin; j < end; j++) { + if (j - begin < shrunk_list.size()) + hnsw.neighbors[j] = shrunk_list[j - begin].id; + else + hnsw.neighbors[j] = -1; + } + } + } + +} + +void IndexHNSW::search_level_0( + idx_t n, const float *x, idx_t k, + const storage_idx_t *nearest, const float *nearest_d, + float *distances, idx_t *labels, int nprobe, + int search_type) const +{ + + storage_idx_t ntotal = hnsw.levels.size(); +#pragma omp parallel + { + DistanceComputer *qdis = storage->get_distance_computer(); + ScopeDeleter1 del(qdis); + + VisitedTable vt (ntotal); + +#pragma omp for + for(idx_t i = 0; i < n; i++) { + idx_t * idxi = labels + i * k; + float * simi = distances + i * k; + + qdis->set_query(x + i * d); + maxheap_heapify (k, simi, idxi); + + if (search_type == 1) { + + int nres = 0; + + for(int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest[i * nprobe + j]; + + if (cj < 0) break; + + if (vt.get(cj)) continue; + + int candidates_size = std::max(hnsw.efSearch, int(k)); + MinimaxHeap candidates(candidates_size); + + candidates.push(cj, nearest_d[i * nprobe + j]); + + nres = hnsw.search_from_candidates( + *qdis, k, idxi, simi, + candidates, vt, 0, nres + ); + } + } else if (search_type == 2) { + + int candidates_size = std::max(hnsw.efSearch, int(k)); + candidates_size = std::max(candidates_size, nprobe); + + MinimaxHeap candidates(candidates_size); + for(int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest[i * nprobe + j]; + + if (cj < 0) break; + candidates.push(cj, nearest_d[i * nprobe + j]); + } + hnsw.search_from_candidates( + *qdis, k, idxi, simi, + candidates, vt, 0 + ); + + } + vt.advance(); + + maxheap_reorder (k, simi, idxi); + + } + } + + +} + +void IndexHNSW::init_level_0_from_knngraph( + int k, const float *D, const idx_t *I) +{ + int dest_size = hnsw.nb_neighbors (0); + +#pragma omp parallel for + for (idx_t i = 0; i < ntotal; i++) { + DistanceComputer *qdis = storage->get_distance_computer(); + float vec[d]; + storage->reconstruct(i, vec); + qdis->set_query(vec); + + std::priority_queue initial_list; + + for (size_t j = 0; j < k; j++) { + int v1 = I[i * k + j]; + if (v1 == i) continue; + if (v1 < 0) break; + initial_list.emplace(D[i * k + j], v1); + } + + std::vector shrunk_list; + HNSW::shrink_neighbor_list(*qdis, initial_list, 
shrunk_list, dest_size); + + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + for (size_t j = begin; j < end; j++) { + if (j - begin < shrunk_list.size()) + hnsw.neighbors[j] = shrunk_list[j - begin].id; + else + hnsw.neighbors[j] = -1; + } + } +} + + + +void IndexHNSW::init_level_0_from_entry_points( + int n, const storage_idx_t *points, + const storage_idx_t *nearests) +{ + + std::vector locks(ntotal); + for(int i = 0; i < ntotal; i++) + omp_init_lock(&locks[i]); + +#pragma omp parallel + { + VisitedTable vt (ntotal); + + DistanceComputer *dis = storage->get_distance_computer(); + ScopeDeleter1 del(dis); + float vec[storage->d]; + +#pragma omp for schedule(dynamic) + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = points[i]; + storage_idx_t nearest = nearests[i]; + storage->reconstruct (pt_id, vec); + dis->set_query (vec); + + hnsw.add_links_starting_from(*dis, pt_id, + nearest, (*dis)(nearest), + 0, locks.data(), vt); + + if (verbose && i % 10000 == 0) { + printf(" %d / %d\r", i, n); + fflush(stdout); + } + } + } + if (verbose) { + printf("\n"); + } + + for(int i = 0; i < ntotal; i++) + omp_destroy_lock(&locks[i]); +} + +void IndexHNSW::reorder_links() +{ + int M = hnsw.nb_neighbors(0); + +#pragma omp parallel + { + std::vector distances (M); + std::vector order (M); + std::vector tmp (M); + DistanceComputer *dis = storage->get_distance_computer(); + ScopeDeleter1 del(dis); + +#pragma omp for + for(storage_idx_t i = 0; i < ntotal; i++) { + + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + for (size_t j = begin; j < end; j++) { + storage_idx_t nj = hnsw.neighbors[j]; + if (nj < 0) { + end = j; + break; + } + distances[j - begin] = dis->symmetric_dis(i, nj); + tmp [j - begin] = nj; + } + + fvec_argsort (end - begin, distances.data(), order.data()); + for (size_t j = begin; j < end; j++) { + hnsw.neighbors[j] = tmp[order[j - begin]]; + } + } + + } +} + + +void IndexHNSW::link_singletons() +{ + printf("search for singletons\n"); + + std::vector seen(ntotal); + + for (size_t i = 0; i < ntotal; i++) { + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + storage_idx_t ni = hnsw.neighbors[j]; + if (ni >= 0) seen[ni] = true; + } + } + + int n_sing = 0, n_sing_l1 = 0; + std::vector singletons; + for (storage_idx_t i = 0; i < ntotal; i++) { + if (!seen[i]) { + singletons.push_back(i); + n_sing++; + if (hnsw.levels[i] > 1) + n_sing_l1++; + } + } + + printf(" Found %d / %ld singletons (%d appear in a level above)\n", + n_sing, ntotal, n_sing_l1); + + std::vectorrecons(singletons.size() * d); + for (int i = 0; i < singletons.size(); i++) { + + FAISS_ASSERT(!"not implemented"); + + } + + +} + + +/************************************************************** + * ReconstructFromNeighbors implementation + **************************************************************/ + + +ReconstructFromNeighbors::ReconstructFromNeighbors( + const IndexHNSW & index, size_t k, size_t nsq): + index(index), k(k), nsq(nsq) { + M = index.hnsw.nb_neighbors(0); + FAISS_ASSERT(k <= 256); + code_size = k == 1 ? 
0 : nsq; + ntotal = 0; + d = index.d; + FAISS_ASSERT(d % nsq == 0); + dsub = d / nsq; + k_reorder = -1; +} + +void ReconstructFromNeighbors::reconstruct(storage_idx_t i, float *x, float *tmp) const +{ + + + const HNSW & hnsw = index.hnsw; + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + if (k == 1 || nsq == 1) { + const float * beta; + if (k == 1) { + beta = codebook.data(); + } else { + int idx = codes[i]; + beta = codebook.data() + idx * (M + 1); + } + + float w0 = beta[0]; // weight of image itself + index.storage->reconstruct(i, tmp); + + for (int l = 0; l < d; l++) + x[l] = w0 * tmp[l]; + + for (size_t j = begin; j < end; j++) { + + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) ji = i; + float w = beta[j - begin + 1]; + index.storage->reconstruct(ji, tmp); + for (int l = 0; l < d; l++) + x[l] += w * tmp[l]; + } + } else if (nsq == 2) { + int idx0 = codes[2 * i]; + int idx1 = codes[2 * i + 1]; + + const float *beta0 = codebook.data() + idx0 * (M + 1); + const float *beta1 = codebook.data() + (idx1 + k) * (M + 1); + + index.storage->reconstruct(i, tmp); + + float w0; + + w0 = beta0[0]; + for (int l = 0; l < dsub; l++) + x[l] = w0 * tmp[l]; + + w0 = beta1[0]; + for (int l = dsub; l < d; l++) + x[l] = w0 * tmp[l]; + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) ji = i; + index.storage->reconstruct(ji, tmp); + float w; + w = beta0[j - begin + 1]; + for (int l = 0; l < dsub; l++) + x[l] += w * tmp[l]; + + w = beta1[j - begin + 1]; + for (int l = dsub; l < d; l++) + x[l] += w * tmp[l]; + } + } else { + const float *betas[nsq]; + { + const float *b = codebook.data(); + const uint8_t *c = &codes[i * code_size]; + for (int sq = 0; sq < nsq; sq++) { + betas[sq] = b + (*c++) * (M + 1); + b += (M + 1) * k; + } + } + + index.storage->reconstruct(i, tmp); + { + int d0 = 0; + for (int sq = 0; sq < nsq; sq++) { + float w = *(betas[sq]++); + int d1 = d0 + dsub; + for (int l = d0; l < d1; l++) { + x[l] = w * tmp[l]; + } + d0 = d1; + } + } + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) ji = i; + + index.storage->reconstruct(ji, tmp); + int d0 = 0; + for (int sq = 0; sq < nsq; sq++) { + float w = *(betas[sq]++); + int d1 = d0 + dsub; + for (int l = d0; l < d1; l++) { + x[l] += w * tmp[l]; + } + d0 = d1; + } + } + } +} + +void ReconstructFromNeighbors::reconstruct_n(storage_idx_t n0, + storage_idx_t ni, + float *x) const +{ +#pragma omp parallel + { + std::vector tmp(index.d); +#pragma omp for + for (storage_idx_t i = 0; i < ni; i++) { + reconstruct(n0 + i, x + i * index.d, tmp.data()); + } + } +} + +size_t ReconstructFromNeighbors::compute_distances( + size_t n, const idx_t *shortlist, + const float *query, float *distances) const +{ + std::vector tmp(2 * index.d); + size_t ncomp = 0; + for (int i = 0; i < n; i++) { + if (shortlist[i] < 0) break; + reconstruct(shortlist[i], tmp.data(), tmp.data() + index.d); + distances[i] = fvec_L2sqr(query, tmp.data(), index.d); + ncomp++; + } + return ncomp; +} + +void ReconstructFromNeighbors::get_neighbor_table(storage_idx_t i, float *tmp1) const +{ + const HNSW & hnsw = index.hnsw; + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + size_t d = index.d; + + index.storage->reconstruct(i, tmp1); + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) ji = i; + index.storage->reconstruct(ji, tmp1 + (j - begin + 1) * d); + } + +} + + +/// called by add_codes +void 
ReconstructFromNeighbors::estimate_code( + const float *x, storage_idx_t i, uint8_t *code) const +{ + + // fill in tmp table with the neighbor values + float *tmp1 = new float[d * (M + 1) + (d * k)]; + float *tmp2 = tmp1 + d * (M + 1); + ScopeDeleter del(tmp1); + + // collect coordinates of base + get_neighbor_table (i, tmp1); + + for (size_t sq = 0; sq < nsq; sq++) { + int d0 = sq * dsub; + + { + FINTEGER ki = k, di = d, m1 = M + 1; + FINTEGER dsubi = dsub; + float zero = 0, one = 1; + + sgemm_ ("N", "N", &dsubi, &ki, &m1, &one, + tmp1 + d0, &di, + codebook.data() + sq * (m1 * k), &m1, + &zero, tmp2, &dsubi); + } + + float min = HUGE_VAL; + int argmin = -1; + for (size_t j = 0; j < k; j++) { + float dis = fvec_L2sqr(x + d0, tmp2 + j * dsub, dsub); + if (dis < min) { + min = dis; + argmin = j; + } + } + code[sq] = argmin; + } + +} + +void ReconstructFromNeighbors::add_codes(size_t n, const float *x) +{ + if (k == 1) { // nothing to encode + ntotal += n; + return; + } + codes.resize(codes.size() + code_size * n); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + estimate_code(x + i * index.d, ntotal + i, + codes.data() + (ntotal + i) * code_size); + } + ntotal += n; + FAISS_ASSERT (codes.size() == ntotal * code_size); +} + + +/************************************************************** + * IndexHNSWFlat implementation + **************************************************************/ + + +IndexHNSWFlat::IndexHNSWFlat() +{ + is_trained = true; +} + +IndexHNSWFlat::IndexHNSWFlat(int d, int M): + IndexHNSW(new IndexFlatL2(d), M) +{ + own_fields = true; + is_trained = true; +} + + +/************************************************************** + * IndexHNSWPQ implementation + **************************************************************/ + + +IndexHNSWPQ::IndexHNSWPQ() {} + +IndexHNSWPQ::IndexHNSWPQ(int d, int pq_m, int M): + IndexHNSW(new IndexPQ(d, pq_m, 8), M) +{ + own_fields = true; + is_trained = false; +} + +void IndexHNSWPQ::train(idx_t n, const float* x) +{ + IndexHNSW::train (n, x); + (dynamic_cast (storage))->pq.compute_sdc_table(); +} + + +/************************************************************** + * IndexHNSWSQ implementation + **************************************************************/ + + +IndexHNSWSQ::IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M): + IndexHNSW (new IndexScalarQuantizer (d, qtype), M) +{ + is_trained = false; + own_fields = true; +} + +IndexHNSWSQ::IndexHNSWSQ() {} + + +/************************************************************** + * IndexHNSW2Level implementation + **************************************************************/ + + +IndexHNSW2Level::IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M): + IndexHNSW (new Index2Layer (quantizer, nlist, m_pq), M) +{ + own_fields = true; + is_trained = false; +} + +IndexHNSW2Level::IndexHNSW2Level() {} + + +namespace { + + +// same as search_from_candidates but uses v +// visno -> is in result list +// visno + 1 -> in result list + in candidates +int search_from_candidates_2(const HNSW & hnsw, + DistanceComputer & qdis, int k, + idx_t *I, float * D, + MinimaxHeap &candidates, + VisitedTable &vt, + int level, int nres_in = 0) +{ + int nres = nres_in; + int ndis = 0; + for (int i = 0; i < candidates.size(); i++) { + idx_t v1 = candidates.ids[i]; + FAISS_ASSERT(v1 >= 0); + vt.visited[v1] = vt.visno + 1; + } + + int nstep = 0; + + while (candidates.size() > 0) { + float d0 = 0; + int v0 = candidates.pop_min(&d0); + + size_t begin, end; + hnsw.neighbor_range(v0, 
level, &begin, &end); + + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) break; + if (vt.visited[v1] == vt.visno + 1) { + // nothing to do + } else { + ndis++; + float d = qdis(v1); + candidates.push(v1, d); + + // never seen before --> add to heap + if (vt.visited[v1] < vt.visno) { + if (nres < k) { + faiss::maxheap_push (++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_pop (nres--, D, I); + faiss::maxheap_push (++nres, D, I, d, v1); + } + } + vt.visited[v1] = vt.visno + 1; + } + } + + nstep++; + if (nstep > hnsw.efSearch) { + break; + } + } + + if (level == 0) { +#pragma omp critical + { + hnsw_stats.n1 ++; + if (candidates.size() == 0) + hnsw_stats.n2 ++; + } + } + + + return nres; +} + + +} // namespace + +void IndexHNSW2Level::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + if (dynamic_cast(storage)) { + IndexHNSW::search (n, x, k, distances, labels); + + } else { // "mixed" search + + const IndexIVFPQ *index_ivfpq = + dynamic_cast(storage); + + int nprobe = index_ivfpq->nprobe; + + std::unique_ptr coarse_assign(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + index_ivfpq->quantizer->search (n, x, nprobe, coarse_dis.get(), + coarse_assign.get()); + + index_ivfpq->search_preassigned (n, x, k, coarse_assign.get(), + coarse_dis.get(), distances, labels, + false); + +#pragma omp parallel + { + VisitedTable vt (ntotal); + DistanceComputer *dis = storage->get_distance_computer(); + ScopeDeleter1 del(dis); + + int candidates_size = hnsw.upper_beam; + MinimaxHeap candidates(candidates_size); + +#pragma omp for + for(idx_t i = 0; i < n; i++) { + idx_t * idxi = labels + i * k; + float * simi = distances + i * k; + dis->set_query(x + i * d); + + // mark all inverted list elements as visited + + for (int j = 0; j < nprobe; j++) { + idx_t key = coarse_assign[j + i * nprobe]; + if (key < 0) break; + size_t list_length = index_ivfpq->get_list_size (key); + const idx_t * ids = index_ivfpq->invlists->get_ids (key); + + for (int jj = 0; jj < list_length; jj++) { + vt.set (ids[jj]); + } + } + + candidates.clear(); + // copy the upper_beam elements to candidates list + + int search_policy = 2; + + if (search_policy == 1) { + + for (int j = 0 ; j < hnsw.upper_beam && j < k; j++) { + if (idxi[j] < 0) break; + candidates.push (idxi[j], simi[j]); + // search_from_candidates adds them back + idxi[j] = -1; + simi[j] = HUGE_VAL; + } + + // reorder from sorted to heap + maxheap_heapify (k, simi, idxi, simi, idxi, k); + + hnsw.search_from_candidates( + *dis, k, idxi, simi, + candidates, vt, 0, k + ); + + vt.advance(); + + } else if (search_policy == 2) { + + for (int j = 0 ; j < hnsw.upper_beam && j < k; j++) { + if (idxi[j] < 0) break; + candidates.push (idxi[j], simi[j]); + } + + // reorder from sorted to heap + maxheap_heapify (k, simi, idxi, simi, idxi, k); + + search_from_candidates_2 ( + hnsw, *dis, k, idxi, simi, + candidates, vt, 0, k); + vt.advance (); + vt.advance (); + + } + + maxheap_reorder (k, simi, idxi); + } + } + } + + +} + + +void IndexHNSW2Level::flip_to_ivf () +{ + Index2Layer *storage2l = + dynamic_cast(storage); + + FAISS_THROW_IF_NOT (storage2l); + + IndexIVFPQ * index_ivfpq = + new IndexIVFPQ (storage2l->q1.quantizer, + d, storage2l->q1.nlist, + storage2l->pq.M, 8); + index_ivfpq->pq = storage2l->pq; + index_ivfpq->is_trained = storage2l->is_trained; + index_ivfpq->precompute_table(); + index_ivfpq->own_fields = storage2l->q1.own_fields; + 
storage2l->transfer_to_IVFPQ(*index_ivfpq); + index_ivfpq->make_direct_map (true); + + storage = index_ivfpq; + delete storage2l; + +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexHNSW.h b/core/src/index/thirdparty/faiss/IndexHNSW.h new file mode 100644 index 0000000000..118e37f5d2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexHNSW.h @@ -0,0 +1,170 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include + +#include +#include +#include +#include +#include + + +namespace faiss { + +struct IndexHNSW; + +struct ReconstructFromNeighbors { + typedef Index::idx_t idx_t; + typedef HNSW::storage_idx_t storage_idx_t; + + const IndexHNSW & index; + size_t M; // number of neighbors + size_t k; // number of codebook entries + size_t nsq; // number of subvectors + size_t code_size; + int k_reorder; // nb to reorder. -1 = all + + std::vector codebook; // size nsq * k * (M + 1) + + std::vector codes; // size ntotal * code_size + size_t ntotal; + size_t d, dsub; // derived values + + explicit ReconstructFromNeighbors(const IndexHNSW& index, + size_t k=256, size_t nsq=1); + + /// codes must be added in the correct order and the IndexHNSW + /// must be populated and sorted + void add_codes(size_t n, const float *x); + + size_t compute_distances(size_t n, const idx_t *shortlist, + const float *query, float *distances) const; + + /// called by add_codes + void estimate_code(const float *x, storage_idx_t i, uint8_t *code) const; + + /// called by compute_distances + void reconstruct(storage_idx_t i, float *x, float *tmp) const; + + void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float *x) const; + + /// get the M+1 -by-d table for neighbor coordinates for vector i + void get_neighbor_table(storage_idx_t i, float *out) const; + +}; + + +/** The HNSW index is a normal random-access index with a HNSW + * link structure built on top */ + +struct IndexHNSW : Index { + + typedef HNSW::storage_idx_t storage_idx_t; + + // the link strcuture + HNSW hnsw; + + // the sequential storage + bool own_fields; + Index *storage; + + ReconstructFromNeighbors *reconstruct_from_neighbors; + + explicit IndexHNSW (int d = 0, int M = 32); + explicit IndexHNSW (Index *storage, int M = 32); + + ~IndexHNSW() override; + + void add(idx_t n, const float *x) override; + + /// Trains the storage if needed + void train(idx_t n, const float* x) override; + + /// entry point for search + void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const override; + + void reconstruct(idx_t key, float* recons) const override; + + void reset () override; + + void shrink_level_0_neighbors(int size); + + /** Perform search only on level 0, given the starting points for + * each vertex. 
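(Editorial aside, not part of the upstream header: train() declared above is effectively a no-op for flat storage, but it is required before add() for the PQ- and SQ-backed variants declared further down, whose constructors leave is_trained = false. A minimal sketch with placeholder counts and data pointers:)

faiss::IndexHNSWSQ index(d, faiss::ScalarQuantizer::QT_8bit, /*M=*/32);
index.train(nt, training_vectors);   // trains the scalar-quantizer storage
index.add(nb, database_vectors);     // graph construction starts only once trained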
+ * + * @param search_type 1:perform one search per nprobe, 2: enqueue + * all entry points + */ + void search_level_0(idx_t n, const float *x, idx_t k, + const storage_idx_t *nearest, const float *nearest_d, + float *distances, idx_t *labels, int nprobe = 1, + int search_type = 1) const; + + /// alternative graph building + void init_level_0_from_knngraph( + int k, const float *D, const idx_t *I); + + /// alternative graph building + void init_level_0_from_entry_points( + int npt, const storage_idx_t *points, + const storage_idx_t *nearests); + + // reorder links from nearest to farthest + void reorder_links(); + + void link_singletons(); +}; + + +/** Flat index topped with with a HNSW structure to access elements + * more efficiently. + */ + +struct IndexHNSWFlat : IndexHNSW { + IndexHNSWFlat(); + IndexHNSWFlat(int d, int M); +}; + +/** PQ index topped with with a HNSW structure to access elements + * more efficiently. + */ +struct IndexHNSWPQ : IndexHNSW { + IndexHNSWPQ(); + IndexHNSWPQ(int d, int pq_m, int M); + void train(idx_t n, const float* x) override; +}; + +/** SQ index topped with with a HNSW structure to access elements + * more efficiently. + */ +struct IndexHNSWSQ : IndexHNSW { + IndexHNSWSQ(); + IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M); +}; + +/** 2-level code structure with fast random access + */ +struct IndexHNSW2Level : IndexHNSW { + IndexHNSW2Level(); + IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M); + + void flip_to_ivf(); + + /// entry point for search + void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const override; + +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVF.cpp b/core/src/index/thirdparty/faiss/IndexIVF.cpp new file mode 100644 index 0000000000..85e8932cd9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVF.cpp @@ -0,0 +1,966 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
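(Editorial aside before the IndexIVF implementation below: a minimal, self-contained usage sketch of the IndexHNSWFlat variant declared in IndexHNSW.h above. The data, dimensionality and parameter values are illustrative only, and the include path may differ in this vendored tree:)

#include <faiss/IndexHNSW.h>
#include <random>
#include <vector>

int main() {
    int d = 64;                    // vector dimensionality
    size_t nb = 10000;             // database size
    std::vector<float> xb(nb * d);
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    for (auto &v : xb) v = u(rng);

    faiss::IndexHNSWFlat index(d, /*M=*/32);   // flat L2 storage plus HNSW links
    index.hnsw.efConstruction = 40;            // build-time beam width
    index.add(nb, xb.data());                  // hnsw_add_vertices() builds the graph

    int k = 5;
    index.hnsw.efSearch = 64;                  // search-time beam width
    std::vector<faiss::Index::idx_t> labels(k);
    std::vector<float> distances(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());
    return 0;
}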
+ */ + +// -*- c++ -*- + +#include + + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace faiss { + +using ScopedIds = InvertedLists::ScopedIds; +using ScopedCodes = InvertedLists::ScopedCodes; + +/***************************************** + * Level1Quantizer implementation + ******************************************/ + + +Level1Quantizer::Level1Quantizer (Index * quantizer, size_t nlist): + quantizer (quantizer), + nlist (nlist), + quantizer_trains_alone (0), + own_fields (false), + clustering_index (nullptr) +{ + // here we set a low # iterations because this is typically used + // for large clusterings (nb this is not used for the MultiIndex, + // for which quantizer_trains_alone = true) + cp.niter = 10; +} + +Level1Quantizer::Level1Quantizer (): + quantizer (nullptr), + nlist (0), + quantizer_trains_alone (0), own_fields (false), + clustering_index (nullptr) +{} + +Level1Quantizer::~Level1Quantizer () +{ + if (own_fields) { + if(quantizer == quantizer_backup) { + if(quantizer != nullptr) { + delete quantizer; + } + } else { + if(quantizer != nullptr) { + delete quantizer; + } + + if(quantizer_backup != nullptr) { + delete quantizer_backup; + } + } + quantizer = nullptr; + quantizer_backup = nullptr; + } +} + +void Level1Quantizer::train_q1 (size_t n, const float *x, bool verbose, MetricType metric_type) +{ + size_t d = quantizer->d; + if (quantizer->is_trained && (quantizer->ntotal == nlist)) { + if (verbose) + printf ("IVF quantizer does not need training.\n"); + } else if (quantizer_trains_alone == 1) { + if (verbose) + printf ("IVF quantizer trains alone...\n"); + quantizer->train (n, x); + quantizer->verbose = verbose; + FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist, + "nlist not consistent with quantizer size"); + } else if (quantizer_trains_alone == 0) { + if (verbose) + printf ("Training level-1 quantizer on %ld vectors in %ldD\n", + n, d); + + Clustering clus (d, nlist, cp); + quantizer->reset(); + if (clustering_index) { + clus.train (n, x, *clustering_index); + quantizer->add (nlist, clus.centroids.data()); + } else { + clus.train (n, x, *quantizer); + } + quantizer->is_trained = true; + } else if (quantizer_trains_alone == 2) { + if (verbose) + printf ( + "Training L2 quantizer on %ld vectors in %ldD%s\n", + n, d, + clustering_index ? 
"(user provided index)" : ""); + FAISS_THROW_IF_NOT (metric_type == METRIC_L2); + Clustering clus (d, nlist, cp); + if (!clustering_index) { + IndexFlatL2 assigner (d); + clus.train(n, x, assigner); + } else { + clus.train(n, x, *clustering_index); + } + if (verbose) + printf ("Adding centroids to quantizer\n"); + quantizer->add (nlist, clus.centroids.data()); + } +} + +size_t Level1Quantizer::coarse_code_size () const +{ + size_t nl = nlist - 1; + size_t nbyte = 0; + while (nl > 0) { + nbyte ++; + nl >>= 8; + } + return nbyte; +} + +void Level1Quantizer::encode_listno (Index::idx_t list_no, uint8_t *code) const +{ + // little endian + size_t nl = nlist - 1; + while (nl > 0) { + *code++ = list_no & 0xff; + list_no >>= 8; + nl >>= 8; + } +} + +Index::idx_t Level1Quantizer::decode_listno (const uint8_t *code) const +{ + size_t nl = nlist - 1; + int64_t list_no = 0; + int nbit = 0; + while (nl > 0) { + list_no |= int64_t(*code++) << nbit; + nbit += 8; + nl >>= 8; + } + FAISS_THROW_IF_NOT (list_no >= 0 && list_no < nlist); + return list_no; +} + + + +/***************************************** + * IndexIVF implementation + ******************************************/ + + +IndexIVF::IndexIVF (Index * quantizer, size_t d, + size_t nlist, size_t code_size, + MetricType metric): + Index (d, metric), + Level1Quantizer (quantizer, nlist), + invlists (new ArrayInvertedLists (nlist, code_size)), + own_invlists (true), + code_size (code_size), + nprobe (1), + max_codes (0), + parallel_mode (0), + maintain_direct_map (false) +{ + FAISS_THROW_IF_NOT (d == quantizer->d); + is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); + // Spherical by default if the metric is inner_product + if (metric_type == METRIC_INNER_PRODUCT) { + cp.spherical = true; + } + +} + +IndexIVF::IndexIVF (): + invlists (nullptr), own_invlists (false), + code_size (0), + nprobe (1), max_codes (0), parallel_mode (0), + maintain_direct_map (false) +{} + +void IndexIVF::add (idx_t n, const float * x) +{ + add_with_ids (n, x, nullptr); +} + + +void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids) +{ + // do some blocking to avoid excessive allocs + idx_t bs = 65536; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min (n, i0 + bs); + if (verbose) { + printf(" IndexIVF::add_with_ids %ld:%ld\n", i0, i1); + } + add_with_ids (i1 - i0, x + i0 * d, + xids ? xids + i0 : nullptr); + } + return; + } + + FAISS_THROW_IF_NOT (is_trained); + std::unique_ptr idx(new idx_t[n]); + quantizer->assign (n, x, idx.get()); + size_t nadd = 0, nminus1 = 0; + + for (size_t i = 0; i < n; i++) { + if (idx[i] < 0) nminus1++; + } + + std::unique_ptr flat_codes(new uint8_t [n * code_size]); + encode_vectors (n, x, idx.get(), flat_codes.get()); + +#pragma omp parallel reduction(+: nadd) + { + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + + // each thread takes care of a subset of lists + for (size_t i = 0; i < n; i++) { + idx_t list_no = idx [i]; + if (list_no >= 0 && list_no % nt == rank) { + idx_t id = xids ? 
xids[i] : ntotal + i; + invlists->add_entry (list_no, id, + flat_codes.get() + i * code_size); + nadd++; + } + } + } + + if (verbose) { + printf(" added %ld / %ld vectors (%ld -1s)\n", nadd, n, nminus1); + } + + ntotal += n; +} + +void IndexIVF::to_readonly() { + if (is_readonly()) return; + auto readonly_lists = this->invlists->to_readonly(); + if (!readonly_lists) return; + this->replace_invlists(readonly_lists, true); +} + +bool IndexIVF::is_readonly() const { + return this->invlists->is_readonly(); +} + +void IndexIVF::backup_quantizer() { + this->quantizer_backup = quantizer; +} + +void IndexIVF::restore_quantizer() { + if(this->quantizer_backup != nullptr) { + quantizer = this->quantizer_backup; + } +} + +void IndexIVF::make_direct_map (bool new_maintain_direct_map) +{ + // nothing to do + if (new_maintain_direct_map == maintain_direct_map) + return; + + if (new_maintain_direct_map) { + direct_map.resize (ntotal, -1); + for (size_t key = 0; key < nlist; key++) { + size_t list_size = invlists->list_size (key); + ScopedIds idlist (invlists, key); + + for (long ofs = 0; ofs < list_size; ofs++) { + FAISS_THROW_IF_NOT_MSG ( + 0 <= idlist [ofs] && idlist[ofs] < ntotal, + "direct map supported only for seuquential ids"); + direct_map [idlist [ofs]] = key << 32 | ofs; + } + } + } else { + direct_map.clear (); + } + maintain_direct_map = new_maintain_direct_map; +} + + +void IndexIVF::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + double t0 = getmillisecs(); + quantizer->search (n, x, nprobe, coarse_dis.get(), idx.get()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists (idx.get(), n * nprobe); + + search_preassigned (n, x, k, idx.get(), coarse_dis.get(), + distances, labels, false); + indexIVF_stats.search_time += getmillisecs() - t0; +} + + + +void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, + const idx_t *keys, + const float *coarse_dis , + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params) const +{ + long nprobe = params ? params->nprobe : this->nprobe; + long max_codes = params ? params->max_codes : this->max_codes; + + size_t nlistv = 0, ndis = 0, nheap = 0; + + using HeapForIP = CMin; + using HeapForL2 = CMax; + + bool interrupt = false; + + // don't start parallel section if single query + bool do_parallel = + parallel_mode == 0 ? n > 1 : + parallel_mode == 1 ? nprobe > 1 : + nprobe * n > 1; + +#pragma omp parallel if(do_parallel) reduction(+: nlistv, ndis, nheap) + { + InvertedListScanner *scanner = get_InvertedListScanner(store_pairs); + ScopeDeleter1 del(scanner); + + /***************************************************** + * Depending on parallel_mode, there are two possible ways + * to organize the search. 
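(Editorial aside: search() above is simply the two-step pattern "coarse-quantize, then scan the selected lists". The same steps can be issued by hand, for example to reuse one coarse assignment across several refinement passes. Sketch only; n, k, x, distances and labels are assumed to be the caller's buffers:)

std::vector<faiss::Index::idx_t> coarse_ids(n * index.nprobe);
std::vector<float> coarse_dis(n * index.nprobe);
index.quantizer->search(n, x, index.nprobe, coarse_dis.data(), coarse_ids.data());
index.invlists->prefetch_lists(coarse_ids.data(), n * index.nprobe);
index.search_preassigned(n, x, k, coarse_ids.data(), coarse_dis.data(),
                         distances, labels, /*store_pairs=*/false);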
Here we define local functions + * that are in common between the two + ******************************************************/ + + // intialize + reorder a result heap + + auto init_result = [&](float *simi, idx_t *idxi) { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_heapify (k, simi, idxi); + } else { + heap_heapify (k, simi, idxi); + } + }; + + auto reorder_result = [&] (float *simi, idx_t *idxi) { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_reorder (k, simi, idxi); + } else { + heap_reorder (k, simi, idxi); + } + }; + + // single list scan using the current scanner (with query + // set porperly) and storing results in simi and idxi + auto scan_one_list = [&] (idx_t key, float coarse_dis_i, + float *simi, idx_t *idxi) { + + if (key < 0) { + // not enough centroids for multiprobe + return (size_t)0; + } + FAISS_THROW_IF_NOT_FMT (key < (idx_t) nlist, + "Invalid key=%ld nlist=%ld\n", + key, nlist); + + size_t list_size = invlists->list_size(key); + + // don't waste time on empty lists + if (list_size == 0) { + return (size_t)0; + } + + scanner->set_list (key, coarse_dis_i); + + nlistv++; + + InvertedLists::ScopedCodes scodes (invlists, key); + + std::unique_ptr sids; + const Index::idx_t * ids = nullptr; + + if (!store_pairs) { + sids.reset (new InvertedLists::ScopedIds (invlists, key)); + ids = sids->get(); + } + + nheap += scanner->scan_codes (list_size, scodes.get(), + ids, simi, idxi, k); + + return list_size; + }; + + /**************************************************** + * Actual loops, depending on parallel_mode + ****************************************************/ + + if (parallel_mode == 0) { + +#pragma omp for + for (size_t i = 0; i < n; i++) { + + if (interrupt) { + continue; + } + + // loop over queries + scanner->set_query (x + i * d); + float * simi = distances + i * k; + idx_t * idxi = labels + i * k; + + init_result (simi, idxi); + + long nscan = 0; + + // loop over probes + for (size_t ik = 0; ik < nprobe; ik++) { + + nscan += scan_one_list ( + keys [i * nprobe + ik], + coarse_dis[i * nprobe + ik], + simi, idxi + ); + + if (max_codes && nscan >= max_codes) { + break; + } + } + + ndis += nscan; + reorder_result (simi, idxi); + + if (InterruptCallback::is_interrupted ()) { + interrupt = true; + } + + } // parallel for + } else if (parallel_mode == 1) { + std::vector local_idx (k); + std::vector local_dis (k); + + for (size_t i = 0; i < n; i++) { + scanner->set_query (x + i * d); + init_result (local_dis.data(), local_idx.data()); + +#pragma omp for schedule(dynamic) + for (size_t ik = 0; ik < nprobe; ik++) { + ndis += scan_one_list + (keys [i * nprobe + ik], + coarse_dis[i * nprobe + ik], + local_dis.data(), local_idx.data()); + + // can't do the test on max_codes + } + // merge thread-local results + + float * simi = distances + i * k; + idx_t * idxi = labels + i * k; +#pragma omp single + init_result (simi, idxi); + +#pragma omp barrier +#pragma omp critical + { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_addn + (k, simi, idxi, + local_dis.data(), local_idx.data(), k); + } else { + heap_addn + (k, simi, idxi, + local_dis.data(), local_idx.data(), k); + } + } +#pragma omp barrier +#pragma omp single + reorder_result (simi, idxi); + } + } else { + FAISS_THROW_FMT ("parallel_mode %d not supported\n", + parallel_mode); + } + } // parallel section + + if (interrupt) { + FAISS_THROW_MSG ("computation interrupted"); + } + + indexIVF_stats.nq += n; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nheap_updates += nheap; + 
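(Editorial aside on the two loops above: parallel_mode 0 parallelizes over queries, parallel_mode 1 over the probed lists of a single query. A hedged illustration of how a caller might pick between them:)

index.parallel_mode = 0;   // batch workloads: many queries per call (default)
// ...or, for one query at a time with a large nprobe:
index.parallel_mode = 1;
index.nprobe = 64;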
+} + + + + +void IndexIVF::range_search (idx_t nx, const float *x, float radius, + RangeSearchResult *result) const +{ + std::unique_ptr keys (new idx_t[nx * nprobe]); + std::unique_ptr coarse_dis (new float[nx * nprobe]); + + double t0 = getmillisecs(); + quantizer->search (nx, x, nprobe, coarse_dis.get (), keys.get ()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists (keys.get(), nx * nprobe); + + range_search_preassigned (nx, x, radius, keys.get (), coarse_dis.get (), + result); + + indexIVF_stats.search_time += getmillisecs() - t0; +} + +void IndexIVF::range_search_preassigned ( + idx_t nx, const float *x, float radius, + const idx_t *keys, const float *coarse_dis, + RangeSearchResult *result) const +{ + + size_t nlistv = 0, ndis = 0; + bool store_pairs = false; + + std::vector all_pres (omp_get_max_threads()); + +#pragma omp parallel reduction(+: nlistv, ndis) + { + RangeSearchPartialResult pres(result); + std::unique_ptr scanner + (get_InvertedListScanner(store_pairs)); + FAISS_THROW_IF_NOT (scanner.get ()); + all_pres[omp_get_thread_num()] = &pres; + + // prepare the list scanning function + + auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres) { + + idx_t key = keys[i * nprobe + ik]; /* select the list */ + if (key < 0) return; + FAISS_THROW_IF_NOT_FMT ( + key < (idx_t) nlist, + "Invalid key=%ld at ik=%ld nlist=%ld\n", + key, ik, nlist); + const size_t list_size = invlists->list_size(key); + + if (list_size == 0) return; + + InvertedLists::ScopedCodes scodes (invlists, key); + InvertedLists::ScopedIds ids (invlists, key); + + scanner->set_list (key, coarse_dis[i * nprobe + ik]); + nlistv++; + ndis += list_size; + scanner->scan_codes_range (list_size, scodes.get(), + ids.get(), radius, qres); + }; + + if (parallel_mode == 0) { + +#pragma omp for + for (size_t i = 0; i < nx; i++) { + scanner->set_query (x + i * d); + + RangeQueryResult & qres = pres.new_result (i); + + for (size_t ik = 0; ik < nprobe; ik++) { + scan_list_func (i, ik, qres); + } + + } + + } else if (parallel_mode == 1) { + + for (size_t i = 0; i < nx; i++) { + scanner->set_query (x + i * d); + + RangeQueryResult & qres = pres.new_result (i); + +#pragma omp for schedule(dynamic) + for (size_t ik = 0; ik < nprobe; ik++) { + scan_list_func (i, ik, qres); + } + } + } else if (parallel_mode == 2) { + std::vector all_qres (nx); + RangeQueryResult *qres = nullptr; + +#pragma omp for schedule(dynamic) + for (size_t iik = 0; iik < nx * nprobe; iik++) { + size_t i = iik / nprobe; + size_t ik = iik % nprobe; + if (qres == nullptr || qres->qno != i) { + FAISS_ASSERT (!qres || i > qres->qno); + qres = &pres.new_result (i); + scanner->set_query (x + i * d); + } + scan_list_func (i, ik, *qres); + } + } else { + FAISS_THROW_FMT ("parallel_mode %d not supported\n", parallel_mode); + } + if (parallel_mode == 0) { + pres.finalize (); + } else { +#pragma omp barrier +#pragma omp single + RangeSearchPartialResult::merge (all_pres, false); +#pragma omp barrier + + } + } + indexIVF_stats.nq += nx; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; +} + + +InvertedListScanner *IndexIVF::get_InvertedListScanner ( + bool /*store_pairs*/) const +{ + return nullptr; +} + +void IndexIVF::reconstruct (idx_t key, float* recons) const +{ + FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal, + "direct map is not initialized"); + FAISS_THROW_IF_NOT_MSG (key >= 0 && key < direct_map.size(), + "invalid key"); + idx_t list_no = direct_map[key] >> 32; + idx_t 
offset = direct_map[key] & 0xffffffff; + reconstruct_from_offset (list_no, offset, recons); +} + + +void IndexIVF::reconstruct_n (idx_t i0, idx_t ni, float* recons) const +{ + FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal)); + + for (idx_t list_no = 0; list_no < nlist; list_no++) { + size_t list_size = invlists->list_size (list_no); + ScopedIds idlist (invlists, list_no); + + for (idx_t offset = 0; offset < list_size; offset++) { + idx_t id = idlist[offset]; + if (!(id >= i0 && id < i0 + ni)) { + continue; + } + + float* reconstructed = recons + (id - i0) * d; + reconstruct_from_offset (list_no, offset, reconstructed); + } + } +} + + +/* standalone codec interface */ +size_t IndexIVF::sa_code_size () const +{ + size_t coarse_size = coarse_code_size(); + return code_size + coarse_size; +} + +void IndexIVF::sa_encode (idx_t n, const float *x, + uint8_t *bytes) const +{ + FAISS_THROW_IF_NOT (is_trained); + std::unique_ptr idx (new int64_t [n]); + quantizer->assign (n, x, idx.get()); + encode_vectors (n, x, idx.get(), bytes, true); +} + + +void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + float *recons) const +{ + idx_t * idx = new idx_t [n * nprobe]; + ScopeDeleter del (idx); + float * coarse_dis = new float [n * nprobe]; + ScopeDeleter del2 (coarse_dis); + + quantizer->search (n, x, nprobe, coarse_dis, idx); + + invlists->prefetch_lists (idx, n * nprobe); + + // search_preassigned() with `store_pairs` enabled to obtain the list_no + // and offset into `codes` for reconstruction + search_preassigned (n, x, k, idx, coarse_dis, + distances, labels, true /* store_pairs */); + for (idx_t i = 0; i < n; ++i) { + for (idx_t j = 0; j < k; ++j) { + idx_t ij = i * k + j; + idx_t key = labels[ij]; + float* reconstructed = recons + ij * d; + if (key < 0) { + // Fill with NaNs + memset(reconstructed, -1, sizeof(*reconstructed) * d); + } else { + int list_no = key >> 32; + int offset = key & 0xffffffff; + + // Update label to the actual id + labels[ij] = invlists->get_single_id (list_no, offset); + + reconstruct_from_offset (list_no, offset, reconstructed); + } + } + } +} + +void IndexIVF::reconstruct_from_offset( + int64_t /*list_no*/, + int64_t /*offset*/, + float* /*recons*/) const { + FAISS_THROW_MSG ("reconstruct_from_offset not implemented"); +} + +void IndexIVF::reset () +{ + direct_map.clear (); + invlists->reset (); + ntotal = 0; +} + + +size_t IndexIVF::remove_ids (const IDSelector & sel) +{ + FAISS_THROW_IF_NOT_MSG (!maintain_direct_map, + "direct map remove not implemented"); + + std::vector toremove(nlist); + +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + idx_t l0 = invlists->list_size (i), l = l0, j = 0; + ScopedIds idsi (invlists, i); + while (j < l) { + if (sel.is_member (idsi[j])) { + l--; + invlists->update_entry ( + i, j, + invlists->get_single_id (i, l), + ScopedCodes (invlists, i, l).get()); + } else { + j++; + } + } + toremove[i] = l0 - l; + } + // this will not run well in parallel on ondisk because of possible shrinks + size_t nremove = 0; + for (idx_t i = 0; i < nlist; i++) { + if (toremove[i] > 0) { + nremove += toremove[i]; + invlists->resize( + i, invlists->list_size(i) - toremove[i]); + } + } + ntotal -= nremove; + return nremove; +} + + + + +void IndexIVF::train (idx_t n, const float *x) +{ + if (verbose) + printf ("Training level-1 quantizer\n"); + + train_q1 (n, x, verbose, metric_type); + + if (verbose) + printf ("Training IVF residual\n"); + + train_residual (n, x); + is_trained = 
true; + +} + +void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) { + if (verbose) + printf("IndexIVF: no residual training\n"); + // does nothing by default +} + + +void IndexIVF::check_compatible_for_merge (const IndexIVF &other) const +{ + // minimal sanity checks + FAISS_THROW_IF_NOT (other.d == d); + FAISS_THROW_IF_NOT (other.nlist == nlist); + FAISS_THROW_IF_NOT (other.code_size == code_size); + FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other), + "can only merge indexes of the same type"); +} + + +void IndexIVF::merge_from (IndexIVF &other, idx_t add_id) +{ + check_compatible_for_merge (other); + FAISS_THROW_IF_NOT_MSG ((!maintain_direct_map && + !other.maintain_direct_map), + "direct map copy not implemented"); + + invlists->merge_from (other.invlists, add_id); + + ntotal += other.ntotal; + other.ntotal = 0; +} + + +void IndexIVF::replace_invlists (InvertedLists *il, bool own) +{ + if (own_invlists) { + delete invlists; + } + // FAISS_THROW_IF_NOT (ntotal == 0); + if (il) { + FAISS_THROW_IF_NOT (il->nlist == nlist && + il->code_size == code_size); + } + invlists = il; + own_invlists = own; +} + + +void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, + idx_t a1, idx_t a2) const +{ + + FAISS_THROW_IF_NOT (nlist == other.nlist); + FAISS_THROW_IF_NOT (code_size == other.code_size); + FAISS_THROW_IF_NOT (!other.maintain_direct_map); + FAISS_THROW_IF_NOT_FMT ( + subset_type == 0 || subset_type == 1 || subset_type == 2, + "subset type %d not implemented", subset_type); + + size_t accu_n = 0; + size_t accu_a1 = 0; + size_t accu_a2 = 0; + + InvertedLists *oivf = other.invlists; + + for (idx_t list_no = 0; list_no < nlist; list_no++) { + size_t n = invlists->list_size (list_no); + ScopedIds ids_in (invlists, list_no); + + if (subset_type == 0) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (a1 <= id && id < a2) { + oivf->add_entry (list_no, + invlists->get_single_id (list_no, i), + ScopedCodes (invlists, list_no, i).get()); + other.ntotal++; + } + } + } else if (subset_type == 1) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (id % a1 == a2) { + oivf->add_entry (list_no, + invlists->get_single_id (list_no, i), + ScopedCodes (invlists, list_no, i).get()); + other.ntotal++; + } + } + } else if (subset_type == 2) { + // see what is allocated to a1 and to a2 + size_t next_accu_n = accu_n + n; + size_t next_accu_a1 = next_accu_n * a1 / ntotal; + size_t i1 = next_accu_a1 - accu_a1; + size_t next_accu_a2 = next_accu_n * a2 / ntotal; + size_t i2 = next_accu_a2 - accu_a2; + + for (idx_t i = i1; i < i2; i++) { + oivf->add_entry (list_no, + invlists->get_single_id (list_no, i), + ScopedCodes (invlists, list_no, i).get()); + } + + other.ntotal += i2 - i1; + accu_a1 = next_accu_a1; + accu_a2 = next_accu_a2; + } + accu_n += n; + } + FAISS_ASSERT(accu_n == ntotal); + +} + +void +IndexIVF::dump() { + for (auto i = 0; i < invlists->nlist; ++ i) { + auto numVecs = invlists->list_size(i); + auto ids = invlists->get_ids(i); + auto codes = invlists->get_codes(i); + int code_size = invlists->code_size; + + std::cout << "Bucket ID: " << i << ", with code size: " << code_size << ", vectors number: " << numVecs << std::endl; + if(code_size == 8) { + // int8 types + for (auto j=0; j < numVecs; ++j) { + std::cout << *(ids+j) << ": " << std::endl; + for(int k = 0; k < this->d; ++ k) { + printf("%u ", (uint8_t)(codes[j * d + k])); + } + std::cout << std::endl; + } + } + std::cout << "Bucket End." 
<< std::endl; + } +} + +IndexIVF::~IndexIVF() +{ + if (own_invlists) { + delete invlists; + } +} + + +void IndexIVFStats::reset() +{ + memset ((void*)this, 0, sizeof (*this)); +} + + +IndexIVFStats indexIVF_stats; + +void InvertedListScanner::scan_codes_range (size_t , + const uint8_t *, + const idx_t *, + float , + RangeQueryResult &) const +{ + FAISS_THROW_MSG ("scan_codes_range not implemented"); +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVF.h b/core/src/index/thirdparty/faiss/IndexIVF.h new file mode 100644 index 0000000000..d85e105c65 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVF.h @@ -0,0 +1,363 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_IVF_H +#define FAISS_INDEX_IVF_H + + +#include +#include + +#include +#include +#include +#include + + +namespace faiss { + + +/** Encapsulates a quantizer object for the IndexIVF + * + * The class isolates the fields that are independent of the storage + * of the lists (especially training) + */ +struct Level1Quantizer { + Index * quantizer = nullptr; ///< quantizer that maps vectors to inverted lists + Index * quantizer_backup = nullptr; ///< quantizer for backup + size_t nlist; ///< number of possible key values + + + /** + * = 0: use the quantizer as index in a kmeans training + * = 1: just pass on the training set to the train() of the quantizer + * = 2: kmeans training on a flat index + add the centroids to the quantizer + */ + char quantizer_trains_alone; + bool own_fields; ///< whether object owns the quantizer + + ClusteringParameters cp; ///< to override default clustering params + Index *clustering_index; ///< to override index used during clustering + + /// Trains the quantizer and calls train_residual to train sub-quantizers + void train_q1 (size_t n, const float *x, bool verbose, + MetricType metric_type); + + + /// compute the number of bytes required to store list ids + size_t coarse_code_size () const; + void encode_listno (Index::idx_t list_no, uint8_t *code) const; + Index::idx_t decode_listno (const uint8_t *code) const; + + Level1Quantizer (Index * quantizer, size_t nlist); + + Level1Quantizer (); + + ~Level1Quantizer (); + +}; + + + +struct IVFSearchParameters { + size_t nprobe; ///< number of probes at query time + size_t max_codes; ///< max nb of codes to visit to do a query + virtual ~IVFSearchParameters () {} +}; + + + +struct InvertedListScanner; + +/** Index based on a inverted file (IVF) + * + * In the inverted file, the quantizer (an Index instance) provides a + * quantization index for each vector to be added. The quantization + * index maps to a list (aka inverted list or posting list), where the + * id of the vector is stored. + * + * The inverted list object is required only after trainng. If none is + * set externally, an ArrayInvertedLists is used automatically. + * + * At search time, the vector to be searched is also quantized, and + * only the list corresponding to the quantization index is + * searched. This speeds up the search by making it + * non-exhaustive. This can be relaxed using multi-probe search: a few + * (nprobe) quantization indices are selected and several inverted + * lists are visited. + * + * Sub-classes implement a post-filtering of the index that refines + * the distance estimation from the query to databse vectors. 
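(Editorial aside: the comment above summarizes the IVF scheme, i.e. a coarse quantizer, one posting list per centroid, and an nprobe-list search. A minimal construction using the IndexIVFFlat subclass added later in this patch might look as follows; nlist, nprobe and the data pointers are placeholders:)

faiss::IndexFlatL2 quantizer(d);                                  // coarse quantizer
faiss::IndexIVFFlat index(&quantizer, d, /*nlist=*/1024, faiss::METRIC_L2);
index.train(nt, training_vectors);   // k-means over the training set (train_q1)
index.add(nb, database_vectors);     // each vector lands in one inverted list
index.nprobe = 16;                   // visit 16 of the 1024 lists per query
index.search(nq, queries, k, distances, labels);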
+ */ +struct IndexIVF: Index, Level1Quantizer { + /// Acess to the actual data + InvertedLists *invlists; + bool own_invlists; + + size_t code_size; ///< code size per vector in bytes + + size_t nprobe; ///< number of probes at query time + size_t max_codes; ///< max nb of codes to visit to do a query + + /** Parallel mode determines how queries are parallelized with OpenMP + * + * 0 (default): parallelize over queries + * 1: parallelize over over inverted lists + * 2: parallelize over both + */ + int parallel_mode; + + /// map for direct access to the elements. Enables reconstruct(). + bool maintain_direct_map; + std::vector direct_map; + + /** The Inverted file takes a quantizer (an Index) on input, + * which implements the function mapping a vector to a list + * identifier. The pointer is borrowed: the quantizer should not + * be deleted while the IndexIVF is in use. + */ + IndexIVF (Index * quantizer, size_t d, + size_t nlist, size_t code_size, + MetricType metric = METRIC_L2); + + void reset() override; + + /// Trains the quantizer and calls train_residual to train sub-quantizers + void train(idx_t n, const float* x) override; + + /// Calls add_with_ids with NULL ids + void add(idx_t n, const float* x) override; + + /// default implementation that calls encode_vectors + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + /** Encodes a set of vectors as they would appear in the inverted lists + * + * @param list_nos inverted list ids as returned by the + * quantizer (size n). -1s are ignored. + * @param codes output codes, size n * code_size + * @param include_listno + * include the list ids in the code (in this case add + * ceil(log8(nlist)) to the code size) + */ + virtual void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listno = false) const = 0; + + /// Sub-classes that encode the residuals can train their encoders here + /// does nothing by default + virtual void train_residual (idx_t n, const float *x); + + /** search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the corresponding heaps with the query + * results. The default implementation uses InvertedListScanners + * to do the search. + * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param distance + * output distances, size n * k + * @param labels output labels, size n * k + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). 
+ * @param params used to override the object's search parameters + */ + virtual void search_preassigned (idx_t n, const float *x, idx_t k, + const idx_t *assign, + const float *centroid_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params=nullptr + ) const; + + /** assign the vectors, then call search_preassign */ + void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const override; + + void range_search (idx_t n, const float* x, float radius, + RangeSearchResult* result) const override; + + void range_search_preassigned(idx_t nx, const float *x, float radius, + const idx_t *keys, const float *coarse_dis, + RangeSearchResult *result) const; + + /// get a scanner for this index (store_pairs means ignore labels) + virtual InvertedListScanner *get_InvertedListScanner ( + bool store_pairs=false) const; + + void reconstruct (idx_t key, float* recons) const override; + + /** Reconstruct a subset of the indexed vectors. + * + * Overrides default implementation to bypass reconstruct() which requires + * direct_map to be maintained. + * + * @param i0 first vector to reconstruct + * @param ni nb of vectors to reconstruct + * @param recons output array of reconstructed vectors, size ni * d + */ + void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; + + /** Similar to search, but also reconstructs the stored vectors (or an + * approximation in the case of lossy coding) for the search results. + * + * Overrides default implementation to avoid having to maintain direct_map + * and instead fetch the code offsets through the `store_pairs` flag in + * search_preassigned(). + * + * @param recons reconstructed vectors size (n, k, d) + */ + void search_and_reconstruct (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + float *recons) const override; + + /** Reconstruct a vector given the location in terms of (inv list index + + * inv list offset) instead of the id. + * + * Useful for reconstructing when the direct_map is not maintained and + * the inv list offset is computed by search_preassigned() with + * `store_pairs` set. + */ + virtual void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const; + + + /// Dataset manipulation functions + + size_t remove_ids(const IDSelector& sel) override; + + /** check that the two indexes are compatible (ie, they are + * trained in the same way and have the same + * parameters). Otherwise throw. */ + void check_compatible_for_merge (const IndexIVF &other) const; + + /** moves the entries from another dataset to self. On output, + * other is empty. 
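(Editorial aside: when store_pairs is set, the returned labels pack the inverted-list number and offset instead of ids; search_and_reconstruct() in IndexIVF.cpp decodes them exactly as in this sketch:)

faiss::Index::idx_t key = labels[i * k + j];
if (key >= 0) {
    int64_t list_no = key >> 32;          // inverted-list index (upper 32 bits)
    int64_t offset  = key & 0xffffffff;   // position inside that list (lower 32 bits)
    faiss::Index::idx_t id = index.invlists->get_single_id(list_no, offset);
}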
add_id is added to all moved ids (for + * sequential ids, this would be this->ntotal */ + virtual void merge_from (IndexIVF &other, idx_t add_id); + + /** copy a subset of the entries index to the other index + * + * if subset_type == 0: copies ids in [a1, a2) + * if subset_type == 1: copies ids if id % a1 == a2 + * if subset_type == 2: copies inverted lists such that a1 + * elements are left before and a2 elements are after + */ + virtual void copy_subset_to (IndexIVF & other, int subset_type, + idx_t a1, idx_t a2) const; + + virtual void to_readonly(); + virtual bool is_readonly() const; + + virtual void backup_quantizer(); + + virtual void restore_quantizer(); + + ~IndexIVF() override; + + size_t get_list_size (size_t list_no) const + { return invlists->list_size(list_no); } + + /** intialize a direct map + * + * @param new_maintain_direct_map if true, create a direct map, + * else clear it + */ + void make_direct_map (bool new_maintain_direct_map=true); + + /// replace the inverted lists, old one is deallocated if own_invlists + void replace_invlists (InvertedLists *il, bool own=false); + + /* The standalone codec interface (except sa_decode that is specific) */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void dump(); + + IndexIVF (); +}; + +struct RangeQueryResult; + +/** Object that handles a query. The inverted lists to scan are + * provided externally. The object has a lot of state, but + * distance_to_code and scan_codes can be called in multiple + * threads */ +struct InvertedListScanner { + + using idx_t = Index::idx_t; + + /// from now on we handle this query. + virtual void set_query (const float *query_vector) = 0; + + /// following codes come from this inverted list + virtual void set_list (idx_t list_no, float coarse_dis) = 0; + + /// compute a single query-to-code distance + virtual float distance_to_code (const uint8_t *code) const = 0; + + /** scan a set of codes, compute distances to current query and + * update heap of results if necessary. 
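(Editorial aside on merge_from() declared above: it moves the posting lists of a second, identically trained index into this one and leaves the donor empty. A hedged sketch of the sharded-build pattern, assuming both shards numbered their vectors from 0:)

index_a.check_compatible_for_merge(index_b);              // throws if d/nlist/code_size differ
index_a.merge_from(index_b, /*add_id=*/index_a.ntotal);   // shift donor ids past ours
// index_b is now empty; everything is searchable through index_a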
+ * + * @param n number of codes to scan + * @param codes codes to scan (n * code_size) + * @param ids corresponding ids (ignored if store_pairs) + * @param distances heap distances (size k) + * @param labels heap labels (size k) + * @param k heap size + * @return number of heap updates performed + */ + virtual size_t scan_codes (size_t n, + const uint8_t *codes, + const idx_t *ids, + float *distances, idx_t *labels, + size_t k) const = 0; + + /** scan a set of codes, compute distances to current query and + * update results if distances are below radius + * + * (default implementation fails) */ + virtual void scan_codes_range (size_t n, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult &result) const; + + virtual ~InvertedListScanner () {} + +}; + + +struct IndexIVFStats { + size_t nq; // nb of queries run + size_t nlist; // nb of inverted lists scanned + size_t ndis; // nb of distancs computed + size_t nheap_updates; // nb of times the heap was updated + double quantization_time; // time spent quantizing vectors (in ms) + double search_time; // time spent searching lists (in ms) + + IndexIVFStats () {reset (); } + void reset (); +}; + +// global var that collects them all +extern IndexIVFStats indexIVF_stats; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp new file mode 100644 index 0000000000..aafb32231b --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp @@ -0,0 +1,502 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include + +#include + +#include +#include +#include +#include + + +namespace faiss { + + +/***************************************** + * IndexIVFFlat implementation + ******************************************/ + +IndexIVFFlat::IndexIVFFlat (Index * quantizer, + size_t d, size_t nlist, MetricType metric): + IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric) +{ + code_size = sizeof(float) * d; +} + + +void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const idx_t *xids) +{ + add_core (n, x, xids, nullptr); +} + +void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids, + const int64_t *precomputed_idx) + +{ + FAISS_THROW_IF_NOT (is_trained); + assert (invlists); + FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids), + "cannot have direct map and add with ids"); + const int64_t * idx; + ScopeDeleter del; + + if (precomputed_idx) { + idx = precomputed_idx; + } else { + int64_t * idx0 = new int64_t [n]; + del.set (idx0); + quantizer->assign (n, x, idx0); + idx = idx0; + } + int64_t n_add = 0; + for (size_t i = 0; i < n; i++) { + int64_t id = xids ? 
xids[i] : ntotal + i; + int64_t list_no = idx [i]; + + if (list_no < 0) + continue; + const float *xi = x + i * d; + size_t offset = invlists->add_entry ( + list_no, id, (const uint8_t*) xi); + + if (maintain_direct_map) + direct_map.push_back (list_no << 32 | offset); + n_add++; + } + if (verbose) { + printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n", + n_add, n); + } + ntotal += n; +} + +void IndexIVFFlat::encode_vectors(idx_t n, const float* x, + const idx_t * list_nos, + uint8_t * codes, + bool include_listnos) const +{ + if (!include_listnos) { + memcpy (codes, x, code_size * n); + } else { + size_t coarse_size = coarse_code_size (); + for (size_t i = 0; i < n; i++) { + int64_t list_no = list_nos [i]; + uint8_t *code = codes + i * (code_size + coarse_size); + const float *xi = x + i * d; + if (list_no >= 0) { + encode_listno (list_no, code); + memcpy (code + coarse_size, xi, code_size); + } else { + memset (code, 0, code_size + coarse_size); + } + + } + } +} + +void IndexIVFFlat::sa_decode (idx_t n, const uint8_t *bytes, + float *x) const +{ + size_t coarse_size = coarse_code_size (); + for (size_t i = 0; i < n; i++) { + const uint8_t *code = bytes + i * (code_size + coarse_size); + float *xi = x + i * d; + memcpy (xi, code + coarse_size, code_size); + } +} + + +namespace { + + +template +struct IVFFlatScanner: InvertedListScanner { + size_t d; + bool store_pairs; + + IVFFlatScanner(size_t d, bool store_pairs): + d(d), store_pairs(store_pairs) {} + + const float *xi; + void set_query (const float *query) override { + this->xi = query; + } + + idx_t list_no; + void set_list (idx_t list_no, float /* coarse_dis */) override { + this->list_no = list_no; + } + + float distance_to_code (const uint8_t *code) const override { + const float *yj = (float*)code; + float dis = metric == METRIC_INNER_PRODUCT ? + fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d); + return dis; + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k) const override + { + const float *list_vecs = (const float*)codes; + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + const float * yj = list_vecs + d * j; + float dis = metric == METRIC_INNER_PRODUCT ? + fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d); + if (C::cmp (simi[0], dis)) { + heap_pop (k, simi, idxi); + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + heap_push (k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res) const override + { + const float *list_vecs = (const float*)codes; + for (size_t j = 0; j < list_size; j++) { + const float * yj = list_vecs + d * j; + float dis = metric == METRIC_INNER_PRODUCT ? + fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d); + if (C::cmp (radius, dis)) { + int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; + res.add (dis, id); + } + } + } + + +}; + + +} // anonymous namespace + + + +InvertedListScanner* IndexIVFFlat::get_InvertedListScanner + (bool store_pairs) const +{ + if (metric_type == METRIC_INNER_PRODUCT) { + return new IVFFlatScanner< + METRIC_INNER_PRODUCT, CMin > (d, store_pairs); + } else if (metric_type == METRIC_L2) { + return new IVFFlatScanner< + METRIC_L2, CMax >(d, store_pairs); + } else { + FAISS_THROW_MSG("metric type not supported"); + } + return nullptr; +} + + + +void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x) +{ + + FAISS_THROW_IF_NOT (maintain_direct_map); + FAISS_THROW_IF_NOT (is_trained); + std::vector assign (n); + quantizer->assign (n, x, assign.data()); + + for (size_t i = 0; i < n; i++) { + idx_t id = new_ids[i]; + FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal, + "id to update out of range"); + { // remove old one + int64_t dm = direct_map[id]; + int64_t ofs = dm & 0xffffffff; + int64_t il = dm >> 32; + size_t l = invlists->list_size (il); + if (ofs != l - 1) { // move l - 1 to ofs + int64_t id2 = invlists->get_single_id (il, l - 1); + direct_map[id2] = (il << 32) | ofs; + invlists->update_entry (il, ofs, id2, + invlists->get_single_code (il, l - 1)); + } + invlists->resize (il, l - 1); + } + { // insert new one + int64_t il = assign[i]; + size_t l = invlists->list_size (il); + int64_t dm = (il << 32) | l; + direct_map[id] = dm; + invlists->add_entry (il, id, (const uint8_t*)(x + i * d)); + } + } + +} + +void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const +{ + memcpy (recons, invlists->get_single_code (list_no, offset), code_size); +} + +/***************************************** + * IndexIVFFlatDedup implementation + ******************************************/ + +IndexIVFFlatDedup::IndexIVFFlatDedup ( + Index * quantizer, size_t d, size_t nlist_, + MetricType metric_type): + IndexIVFFlat (quantizer, d, nlist_, metric_type) +{} + + +void IndexIVFFlatDedup::train(idx_t n, const float* x) +{ + std::unordered_map map; + float * x2 = new float [n * d]; + ScopeDeleter del (x2); + + int64_t n2 = 0; + for (int64_t i = 0; i < n; i++) { + uint64_t hash = hash_bytes((uint8_t *)(x + i * d), code_size); + if (map.count(hash) && + !memcmp (x2 + map[hash] * d, x + i * d, code_size)) { + // is duplicate, skip + } else { + map [hash] = n2; + memcpy (x2 + n2 * d, x + i * d, code_size); + n2 ++; + } + } + if (verbose) { + printf ("IndexIVFFlatDedup::train: train on %ld points after dedup " + "(was %ld points)\n", n2, n); + } + IndexIVFFlat::train (n2, x2); +} + + + +void IndexIVFFlatDedup::add_with_ids( + idx_t na, const float* x, const idx_t* xids) +{ + + FAISS_THROW_IF_NOT (is_trained); + assert (invlists); + FAISS_THROW_IF_NOT_MSG ( + !maintain_direct_map, + "IVFFlatDedup not implemented with direct_map"); + int64_t * idx = new int64_t [na]; + ScopeDeleter del (idx); + quantizer->assign (na, x, idx); + + int64_t n_add = 0, n_dup = 0; + // TODO make a omp loop with this + for (size_t i = 0; i < na; i++) { + idx_t id = xids ? 
xids[i] : ntotal + i; + int64_t list_no = idx [i]; + + if (list_no < 0) { + continue; + } + const float *xi = x + i * d; + + // search if there is already an entry with that id + InvertedLists::ScopedCodes codes (invlists, list_no); + + int64_t n = invlists->list_size (list_no); + int64_t offset = -1; + for (int64_t o = 0; o < n; o++) { + if (!memcmp (codes.get() + o * code_size, + xi, code_size)) { + offset = o; + break; + } + } + + if (offset == -1) { // not found + invlists->add_entry (list_no, id, (const uint8_t*) xi); + } else { + // mark equivalence + idx_t id2 = invlists->get_single_id (list_no, offset); + std::pair pair (id2, id); + instances.insert (pair); + n_dup ++; + } + n_add++; + } + if (verbose) { + printf("IndexIVFFlat::add_with_ids: added %ld / %ld vectors" + " (out of which %ld are duplicates)\n", + n_add, na, n_dup); + } + ntotal += n_add; +} + +void IndexIVFFlatDedup::search_preassigned ( + idx_t n, const float *x, idx_t k, + const idx_t *assign, + const float *centroid_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params) const +{ + FAISS_THROW_IF_NOT_MSG ( + !store_pairs, "store_pairs not supported in IVFDedup"); + + IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis, + distances, labels, false, + params); + + std::vector labels2 (k); + std::vector dis2 (k); + + for (int64_t i = 0; i < n; i++) { + idx_t *labels1 = labels + i * k; + float *dis1 = distances + i * k; + int64_t j = 0; + for (; j < k; j++) { + if (instances.find (labels1[j]) != instances.end ()) { + // a duplicate: special handling + break; + } + } + if (j < k) { + // there are duplicates, special handling + int64_t j0 = j; + int64_t rp = j; + while (j < k) { + auto range = instances.equal_range (labels1[rp]); + float dis = dis1[rp]; + labels2[j] = labels1[rp]; + dis2[j] = dis; + j ++; + for (auto it = range.first; j < k && it != range.second; ++it) { + labels2[j] = it->second; + dis2[j] = dis; + j++; + } + rp++; + } + memcpy (labels1 + j0, labels2.data() + j0, + sizeof(labels1[0]) * (k - j0)); + memcpy (dis1 + j0, dis2.data() + j0, + sizeof(dis2[0]) * (k - j0)); + } + } + +} + + +size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel) +{ + std::unordered_map replace; + std::vector > toadd; + for (auto it = instances.begin(); it != instances.end(); ) { + if (sel.is_member(it->first)) { + // then we erase this entry + if (!sel.is_member(it->second)) { + // if the second is not erased + if (replace.count(it->first) == 0) { + replace[it->first] = it->second; + } else { // remember we should add an element + std::pair new_entry ( + replace[it->first], it->second); + toadd.push_back(new_entry); + } + } + it = instances.erase(it); + } else { + if (sel.is_member(it->second)) { + it = instances.erase(it); + } else { + ++it; + } + } + } + + instances.insert (toadd.begin(), toadd.end()); + + // mostly copied from IndexIVF.cpp + + FAISS_THROW_IF_NOT_MSG (!maintain_direct_map, + "direct map remove not implemented"); + + std::vector toremove(nlist); + +#pragma omp parallel for + for (int64_t i = 0; i < nlist; i++) { + int64_t l0 = invlists->list_size (i), l = l0, j = 0; + InvertedLists::ScopedIds idsi (invlists, i); + while (j < l) { + if (sel.is_member (idsi[j])) { + if (replace.count(idsi[j]) == 0) { + l--; + invlists->update_entry ( + i, j, + invlists->get_single_id (i, l), + InvertedLists::ScopedCodes (invlists, i, l).get()); + } else { + invlists->update_entry ( + i, j, + replace[idsi[j]], + InvertedLists::ScopedCodes (invlists, i, j).get()); + j++; + } + 
} else { + j++; + } + } + toremove[i] = l0 - l; + } + // this will not run well in parallel on ondisk because of possible shrinks + int64_t nremove = 0; + for (int64_t i = 0; i < nlist; i++) { + if (toremove[i] > 0) { + nremove += toremove[i]; + invlists->resize( + i, invlists->list_size(i) - toremove[i]); + } + } + ntotal -= nremove; + return nremove; +} + + +void IndexIVFFlatDedup::range_search( + idx_t , + const float* , + float , + RangeSearchResult* ) const +{ + FAISS_THROW_MSG ("not implemented"); +} + +void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *) +{ + FAISS_THROW_MSG ("not implemented"); +} + + +void IndexIVFFlatDedup::reconstruct_from_offset ( + int64_t , int64_t , float* ) const +{ + FAISS_THROW_MSG ("not implemented"); +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.h b/core/src/index/thirdparty/faiss/IndexIVFFlat.h new file mode 100644 index 0000000000..d79b099718 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.h @@ -0,0 +1,118 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_IVF_FLAT_H +#define FAISS_INDEX_IVF_FLAT_H + +#include +#include + +#include + + +namespace faiss { + +/** Inverted file with stored vectors. Here the inverted file + * pre-selects the vectors to be searched, but they are not otherwise + * encoded, the code array just contains the raw float entries. + */ +struct IndexIVFFlat: IndexIVF { + + IndexIVFFlat ( + Index * quantizer, size_t d, size_t nlist_, + MetricType = METRIC_L2); + + /// same as add_with_ids, with precomputed coarse quantizer + virtual void add_core (idx_t n, const float * x, const int64_t *xids, + const int64_t *precomputed_idx); + + /// implemented for all IndexIVF* classes + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos=false) const override; + + + InvertedListScanner *get_InvertedListScanner (bool store_pairs) + const override; + + /** Update a subset of vectors. + * + * The index must have a direct_map + * + * @param nv nb of vectors to update + * @param idx vector indices to update, size nv + * @param v vectors of new values, size nv*d + */ + virtual void update_vectors (int nv, idx_t *idx, const float *v); + + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + IndexIVFFlat () {} +}; + + +struct IndexIVFFlatDedup: IndexIVFFlat { + + /** Maps ids stored in the index to the ids of vectors that are + * the same. 
When a vector is unique, it does not appear in the + * instances map */ + std::unordered_multimap instances; + + IndexIVFFlatDedup ( + Index * quantizer, size_t d, size_t nlist_, + MetricType = METRIC_L2); + + /// also dedups the training set + void train(idx_t n, const float* x) override; + + /// implemented for all IndexIVF* classes + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + void search_preassigned (idx_t n, const float *x, idx_t k, + const idx_t *assign, + const float *centroid_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params=nullptr + ) const override; + + size_t remove_ids(const IDSelector& sel) override; + + /// not implemented + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result) const override; + + /// not implemented + void update_vectors (int nv, idx_t *idx, const float *v) override; + + + /// not implemented + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + IndexIVFFlatDedup () {} + + +}; + + + +} // namespace faiss + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp new file mode 100644 index 0000000000..fe0ed0c406 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp @@ -0,0 +1,1207 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace faiss { + +/***************************************** + * IndexIVFPQ implementation + ******************************************/ + +IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist, + size_t M, size_t nbits_per_idx): + IndexIVF (quantizer, d, nlist, 0, METRIC_L2), + pq (d, M, nbits_per_idx) +{ + FAISS_THROW_IF_NOT (nbits_per_idx <= 8); + code_size = pq.code_size; + invlists->code_size = code_size; + is_trained = false; + by_residual = true; + use_precomputed_table = 0; + scan_table_threshold = 0; + + polysemous_training = nullptr; + do_polysemous_training = false; + polysemous_ht = 0; + +} + + +/**************************************************************** + * training */ + +void IndexIVFPQ::train_residual (idx_t n, const float *x) +{ + train_residual_o (n, x, nullptr); +} + + +void IndexIVFPQ::train_residual_o (idx_t n, const float *x, float *residuals_2) +{ + const float * x_in = x; + + x = fvecs_maybe_subsample ( + d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub, + x, verbose, pq.cp.seed); + + ScopeDeleter del_x (x_in == x ? 
nullptr : x); + + const float *trainset; + ScopeDeleter del_residuals; + if (by_residual) { + if(verbose) printf("computing residuals\n"); + idx_t * assign = new idx_t [n]; // assignement to coarse centroids + ScopeDeleter del (assign); + quantizer->assign (n, x, assign); + float *residuals = new float [n * d]; + del_residuals.set (residuals); + for (idx_t i = 0; i < n; i++) + quantizer->compute_residual (x + i * d, residuals+i*d, assign[i]); + + trainset = residuals; + } else { + trainset = x; + } + if (verbose) + printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n", + pq.M, pq.ksub, n, d); + pq.verbose = verbose; + pq.train (n, trainset); + + if (do_polysemous_training) { + if (verbose) + printf("doing polysemous training for PQ\n"); + PolysemousTraining default_pt; + PolysemousTraining *pt = polysemous_training; + if (!pt) pt = &default_pt; + pt->optimize_pq_for_hamming (pq, n, trainset); + } + + // prepare second-level residuals for refine PQ + if (residuals_2) { + uint8_t *train_codes = new uint8_t [pq.code_size * n]; + ScopeDeleter del (train_codes); + pq.compute_codes (trainset, train_codes, n); + + for (idx_t i = 0; i < n; i++) { + const float *xx = trainset + i * d; + float * res = residuals_2 + i * d; + pq.decode (train_codes + i * pq.code_size, res); + for (int j = 0; j < d; j++) + res[j] = xx[j] - res[j]; + } + + } + + if (by_residual) { + precompute_table (); + } + +} + + + + + + +/**************************************************************** + * IVFPQ as codec */ + + +/* produce a binary signature based on the residual vector */ +void IndexIVFPQ::encode (idx_t key, const float * x, uint8_t * code) const +{ + if (by_residual) { + float residual_vec[d]; + quantizer->compute_residual (x, residual_vec, key); + pq.compute_code (residual_vec, code); + } + else pq.compute_code (x, code); +} + +void IndexIVFPQ::encode_multiple (size_t n, idx_t *keys, + const float * x, uint8_t * xcodes, + bool compute_keys) const +{ + if (compute_keys) + quantizer->assign (n, x, keys); + + encode_vectors (n, x, keys, xcodes); +} + +void IndexIVFPQ::decode_multiple (size_t n, const idx_t *keys, + const uint8_t * xcodes, float * x) const +{ + pq.decode (xcodes, x, n); + if (by_residual) { + std::vector centroid (d); + for (size_t i = 0; i < n; i++) { + quantizer->reconstruct (keys[i], centroid.data()); + float *xi = x + i * d; + for (size_t j = 0; j < d; j++) { + xi [j] += centroid [j]; + } + } + } +} + + + + +/**************************************************************** + * add */ + + +void IndexIVFPQ::add_with_ids (idx_t n, const float * x, const idx_t *xids) +{ + add_core_o (n, x, xids, nullptr); +} + + +static float * compute_residuals ( + const Index *quantizer, + Index::idx_t n, const float* x, + const Index::idx_t *list_nos) +{ + size_t d = quantizer->d; + float *residuals = new float [n * d]; + // TODO: parallelize? 
+ for (size_t i = 0; i < n; i++) { + if (list_nos[i] < 0) + memset (residuals + i * d, 0, sizeof(*residuals) * d); + else + quantizer->compute_residual ( + x + i * d, residuals + i * d, list_nos[i]); + } + return residuals; +} + +void IndexIVFPQ::encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos) const +{ + if (by_residual) { + float *to_encode = compute_residuals (quantizer, n, x, list_nos); + ScopeDeleter del (to_encode); + pq.compute_codes (to_encode, codes, n); + } else { + pq.compute_codes (x, codes, n); + } + + if (include_listnos) { + size_t coarse_size = coarse_code_size(); + for (idx_t i = n - 1; i >= 0; i--) { + uint8_t * code = codes + i * (coarse_size + code_size); + memmove (code + coarse_size, + codes + i * code_size, code_size); + encode_listno (list_nos[i], code); + } + } +} + + + +void IndexIVFPQ::sa_decode (idx_t n, const uint8_t *codes, + float *x) const +{ + size_t coarse_size = coarse_code_size (); + +#pragma omp parallel + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + const uint8_t *code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno (code); + float *xi = x + i * d; + pq.decode (code + coarse_size, xi); + if (by_residual) { + quantizer->reconstruct (list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } +} + + +void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids, + float *residuals_2, const idx_t *precomputed_idx) +{ + + idx_t bs = 32768; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(i0 + bs, n); + if (verbose) { + printf("IndexIVFPQ::add_core_o: adding %ld:%ld / %ld\n", + i0, i1, n); + } + add_core_o (i1 - i0, x + i0 * d, + xids ? xids + i0 : nullptr, + residuals_2 ? residuals_2 + i0 * d : nullptr, + precomputed_idx ? precomputed_idx + i0 : nullptr); + } + return; + } + + InterruptCallback::check(); + + FAISS_THROW_IF_NOT (is_trained); + double t0 = getmillisecs (); + const idx_t * idx; + ScopeDeleter del_idx; + + if (precomputed_idx) { + idx = precomputed_idx; + } else { + idx_t * idx0 = new idx_t [n]; + del_idx.set (idx0); + quantizer->assign (n, x, idx0); + idx = idx0; + } + + double t1 = getmillisecs (); + uint8_t * xcodes = new uint8_t [n * code_size]; + ScopeDeleter del_xcodes (xcodes); + + const float *to_encode = nullptr; + ScopeDeleter del_to_encode; + + if (by_residual) { + to_encode = compute_residuals (quantizer, n, x, idx); + del_to_encode.set (to_encode); + } else { + to_encode = x; + } + pq.compute_codes (to_encode, xcodes, n); + + double t2 = getmillisecs (); + // TODO: parallelize? + size_t n_ignore = 0; + for (size_t i = 0; i < n; i++) { + idx_t key = idx[i]; + if (key < 0) { + n_ignore ++; + if (residuals_2) + memset (residuals_2, 0, sizeof(*residuals_2) * d); + continue; + } + idx_t id = xids ? 
xids[i] : ntotal + i; + + uint8_t *code = xcodes + i * code_size; + size_t offset = invlists->add_entry (key, id, code); + + if (residuals_2) { + float *res2 = residuals_2 + i * d; + const float *xi = to_encode + i * d; + pq.decode (code, res2); + for (int j = 0; j < d; j++) + res2[j] = xi[j] - res2[j]; + } + + if (maintain_direct_map) + direct_map.push_back (key << 32 | offset); + } + + + double t3 = getmillisecs (); + if(verbose) { + char comment[100] = {0}; + if (n_ignore > 0) + snprintf (comment, 100, "(%ld vectors ignored)", n_ignore); + printf(" add_core times: %.3f %.3f %.3f %s\n", + t1 - t0, t2 - t1, t3 - t2, comment); + } + ntotal += n; +} + + +void IndexIVFPQ::reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const +{ + const uint8_t* code = invlists->get_single_code (list_no, offset); + + if (by_residual) { + std::vector centroid(d); + quantizer->reconstruct (list_no, centroid.data()); + + pq.decode (code, recons); + for (int i = 0; i < d; ++i) { + recons[i] += centroid[i]; + } + } else { + pq.decode (code, recons); + } +} + + + +/// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids +size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31; + +/** Precomputed tables for residuals + * + * During IVFPQ search with by_residual, we compute + * + * d = || x - y_C - y_R ||^2 + * + * where x is the query vector, y_C the coarse centroid, y_R the + * refined PQ centroid. The expression can be decomposed as: + * + * d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R) + * --------------- --------------------------- ------- + * term 1 term 2 term 3 + * + * When using multiprobe, we use the following decomposition: + * - term 1 is the distance to the coarse centroid, that is computed + * during the 1st stage search. + * - term 2 can be precomputed, as it does not involve x. However, + * because of the PQ, it needs nlist * M * ksub storage. This is why + * use_precomputed_table is off by default + * - term 3 is the classical non-residual distance table. + * + * Since y_R defined by a product quantizer, it is split across + * subvectors and stored separately for each subvector. If the coarse + * quantizer is a MultiIndexQuantizer then the table can be stored + * more compactly. + * + * At search time, the tables for term 2 and term 3 are added up. This + * is faster when the length of the lists is > ksub * M. 
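The decomposition spelled out in the comment above is plain algebra: || x - y_C - y_R ||^2 = || x - y_C ||^2 + (|| y_R ||^2 + 2*(y_C|y_R)) - 2*(x|y_R). A self-contained numeric check (illustrative only, no FAISS types involved) makes the three terms concrete; term 2 is the query-independent part that precompute_table() can store per (list, PQ centroid) pair, leaving only terms 1 and 3 at query time.

// Sketch: verify d = ||x - y_C - y_R||^2
//               = ||x - y_C||^2 + (||y_R||^2 + 2*(y_C|y_R)) - 2*(x|y_R)
// with random vectors. Helper names are illustrative, not FAISS API.
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

static float dot(const std::vector<float>& a, const std::vector<float>& b) {
    float s = 0;
    for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
    return s;
}

int main() {
    const int d = 16;
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(-1, 1);
    std::vector<float> x(d), yc(d), yr(d), xmc(d), full(d);
    for (int i = 0; i < d; i++) { x[i] = u(rng); yc[i] = u(rng); yr[i] = u(rng); }
    for (int i = 0; i < d; i++) { xmc[i] = x[i] - yc[i]; full[i] = xmc[i] - yr[i]; }

    float lhs   = dot(full, full);                 // || x - y_C - y_R ||^2
    float term1 = dot(xmc, xmc);                   // coarse distance (1st stage)
    float term2 = dot(yr, yr) + 2 * dot(yc, yr);   // query-independent, precomputable
    float term3 = -2 * dot(x, yr);                 // classical non-residual table
    std::printf("lhs=%f rhs=%f\n", lhs, term1 + term2 + term3);
    return std::fabs(lhs - (term1 + term2 + term3)) < 1e-4f ? 0 : 1;
}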
+ */ + +void IndexIVFPQ::precompute_table () +{ + if (use_precomputed_table == -1) + return; + + if (use_precomputed_table == 0) { // then choose the type of table + if (quantizer->metric_type == METRIC_INNER_PRODUCT) { + if (verbose) { + printf("IndexIVFPQ::precompute_table: precomputed " + "tables not needed for inner product quantizers\n"); + } + return; + } + const MultiIndexQuantizer *miq = + dynamic_cast (quantizer); + if (miq && pq.M % miq->pq.M == 0) + use_precomputed_table = 2; + else { + size_t table_size = pq.M * pq.ksub * nlist * sizeof(float); + if (table_size > precomputed_table_max_bytes) { + if (verbose) { + printf( + "IndexIVFPQ::precompute_table: not precomputing table, " + "it would be too big: %ld bytes (max %ld)\n", + table_size, precomputed_table_max_bytes); + use_precomputed_table = 0; + } + return; + } + use_precomputed_table = 1; + } + } // otherwise assume user has set appropriate flag on input + + if (verbose) { + printf ("precomputing IVFPQ tables type %d\n", + use_precomputed_table); + } + + // squared norms of the PQ centroids + std::vector r_norms (pq.M * pq.ksub, NAN); + for (int m = 0; m < pq.M; m++) + for (int j = 0; j < pq.ksub; j++) + r_norms [m * pq.ksub + j] = + fvec_norm_L2sqr (pq.get_centroids (m, j), pq.dsub); + + if (use_precomputed_table == 1) { + + precomputed_table.resize (nlist * pq.M * pq.ksub); + std::vector centroid (d); + + for (size_t i = 0; i < nlist; i++) { + quantizer->reconstruct (i, centroid.data()); + + float *tab = &precomputed_table[i * pq.M * pq.ksub]; + pq.compute_inner_prod_table (centroid.data(), tab); + fvec_madd (pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab); + } + } else if (use_precomputed_table == 2) { + const MultiIndexQuantizer *miq = + dynamic_cast (quantizer); + FAISS_THROW_IF_NOT (miq); + const ProductQuantizer &cpq = miq->pq; + FAISS_THROW_IF_NOT (pq.M % cpq.M == 0); + + precomputed_table.resize(cpq.ksub * pq.M * pq.ksub); + + // reorder PQ centroid table + std::vector centroids (d * cpq.ksub, NAN); + + for (int m = 0; m < cpq.M; m++) { + for (size_t i = 0; i < cpq.ksub; i++) { + memcpy (centroids.data() + i * d + m * cpq.dsub, + cpq.get_centroids (m, i), + sizeof (*centroids.data()) * cpq.dsub); + } + } + + pq.compute_inner_prod_tables (cpq.ksub, centroids.data (), + precomputed_table.data ()); + + for (size_t i = 0; i < cpq.ksub; i++) { + float *tab = &precomputed_table[i * pq.M * pq.ksub]; + fvec_madd (pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab); + } + + } + +} + +namespace { + +using idx_t = Index::idx_t; + + +#define TIC t0 = get_cycles() +#define TOC get_cycles () - t0 + + + +/** QueryTables manages the various ways of searching an + * IndexIVFPQ. The code contains a lot of branches, depending on: + * - metric_type: are we computing L2 or Inner product similarity? + * - by_residual: do we encode raw vectors or residuals? + * - use_precomputed_table: are x_R|x_C tables precomputed? + * - polysemous_ht: are we filtering with polysemous codes? 
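Of the branches listed above, the polysemous filter is the least obvious: when polysemous_ht != 0 the scanner first compares the query's PQ code against each stored code in Hamming space and only pays for the full table-based distance when the Hamming distance is under the threshold. A standalone sketch of that test follows (plain popcount loop for illustration; the patched sources dispatch to the HammingComputer classes instead).

// Hedged sketch of the Hamming pre-filter applied when polysemous_ht != 0.
#include <cstdint>
#include <cstdio>

static int hamming(const uint8_t* a, const uint8_t* b, int code_size) {
    int dist = 0;
    for (int i = 0; i < code_size; i++)
        dist += __builtin_popcount(a[i] ^ b[i]);   // GCC/Clang builtin
    return dist;
}

// true => candidate survives the filter and gets a full table-based distance
static bool passes_polysemous_filter(const uint8_t* q_code, const uint8_t* b_code,
                                     int code_size, int polysemous_ht) {
    return hamming(q_code, b_code, code_size) < polysemous_ht;
}

int main() {
    uint8_t q[4] = {0x0F, 0x00, 0xFF, 0x10};
    uint8_t b[4] = {0x0F, 0x01, 0xFF, 0x10};   // differs from q by a single bit
    std::printf("pass=%d\n", passes_polysemous_filter(q, b, 4, /*polysemous_ht=*/8));
    return 0;
}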
+ */ +struct QueryTables { + + /***************************************************** + * General data from the IVFPQ + *****************************************************/ + + const IndexIVFPQ & ivfpq; + const IVFSearchParameters *params; + + // copied from IndexIVFPQ for easier access + int d; + const ProductQuantizer & pq; + MetricType metric_type; + bool by_residual; + int use_precomputed_table; + int polysemous_ht; + + // pre-allocated data buffers + float * sim_table, * sim_table_2; + float * residual_vec, *decoded_vec; + + // single data buffer + std::vector mem; + + // for table pointers + std::vector sim_table_ptrs; + + explicit QueryTables (const IndexIVFPQ & ivfpq, + const IVFSearchParameters *params): + ivfpq(ivfpq), + d(ivfpq.d), + pq (ivfpq.pq), + metric_type (ivfpq.metric_type), + by_residual (ivfpq.by_residual), + use_precomputed_table (ivfpq.use_precomputed_table) + { + mem.resize (pq.ksub * pq.M * 2 + d * 2); + sim_table = mem.data (); + sim_table_2 = sim_table + pq.ksub * pq.M; + residual_vec = sim_table_2 + pq.ksub * pq.M; + decoded_vec = residual_vec + d; + + // for polysemous + polysemous_ht = ivfpq.polysemous_ht; + if (auto ivfpq_params = + dynamic_cast(params)) { + polysemous_ht = ivfpq_params->polysemous_ht; + } + if (polysemous_ht != 0) { + q_code.resize (pq.code_size); + } + init_list_cycles = 0; + sim_table_ptrs.resize (pq.M); + } + + /***************************************************** + * What we do when query is known + *****************************************************/ + + // field specific to query + const float * qi; + + // query-specific intialization + void init_query (const float * qi) { + this->qi = qi; + if (metric_type == METRIC_INNER_PRODUCT) + init_query_IP (); + else + init_query_L2 (); + if (!by_residual && polysemous_ht != 0) + pq.compute_code (qi, q_code.data()); + } + + void init_query_IP () { + // precompute some tables specific to the query qi + pq.compute_inner_prod_table (qi, sim_table); + } + + void init_query_L2 () { + if (!by_residual) { + pq.compute_distance_table (qi, sim_table); + } else if (use_precomputed_table) { + pq.compute_inner_prod_table (qi, sim_table_2); + } + } + + /***************************************************** + * When inverted list is known: prepare computations + *****************************************************/ + + // fields specific to list + Index::idx_t key; + float coarse_dis; + std::vector q_code; + + uint64_t init_list_cycles; + + /// once we know the query and the centroid, we can prepare the + /// sim_table that will be used for accumulation + /// and dis0, the initial value + float precompute_list_tables () { + float dis0 = 0; + uint64_t t0; TIC; + if (by_residual) { + if (metric_type == METRIC_INNER_PRODUCT) + dis0 = precompute_list_tables_IP (); + else + dis0 = precompute_list_tables_L2 (); + } + init_list_cycles += TOC; + return dis0; + } + + float precompute_list_table_pointers () { + float dis0 = 0; + uint64_t t0; TIC; + if (by_residual) { + if (metric_type == METRIC_INNER_PRODUCT) + FAISS_THROW_MSG ("not implemented"); + else + dis0 = precompute_list_table_pointers_L2 (); + } + init_list_cycles += TOC; + return dis0; + } + + /***************************************************** + * compute tables for inner prod + *****************************************************/ + + float precompute_list_tables_IP () + { + // prepare the sim_table that will be used for accumulation + // and dis0, the initial value + ivfpq.quantizer->reconstruct (key, decoded_vec); + // decoded_vec = centroid + 
float dis0 = fvec_inner_product (qi, decoded_vec, d); + + if (polysemous_ht) { + for (int i = 0; i < d; i++) { + residual_vec [i] = qi[i] - decoded_vec[i]; + } + pq.compute_code (residual_vec, q_code.data()); + } + return dis0; + } + + + /***************************************************** + * compute tables for L2 distance + *****************************************************/ + + float precompute_list_tables_L2 () + { + float dis0 = 0; + + if (use_precomputed_table == 0 || use_precomputed_table == -1) { + ivfpq.quantizer->compute_residual (qi, residual_vec, key); + pq.compute_distance_table (residual_vec, sim_table); + + if (polysemous_ht != 0) { + pq.compute_code (residual_vec, q_code.data()); + } + + } else if (use_precomputed_table == 1) { + dis0 = coarse_dis; + + fvec_madd (pq.M * pq.ksub, + &ivfpq.precomputed_table [key * pq.ksub * pq.M], + -2.0, sim_table_2, + sim_table); + + + if (polysemous_ht != 0) { + ivfpq.quantizer->compute_residual (qi, residual_vec, key); + pq.compute_code (residual_vec, q_code.data()); + } + + } else if (use_precomputed_table == 2) { + dis0 = coarse_dis; + + const MultiIndexQuantizer *miq = + dynamic_cast (ivfpq.quantizer); + FAISS_THROW_IF_NOT (miq); + const ProductQuantizer &cpq = miq->pq; + int Mf = pq.M / cpq.M; + + const float *qtab = sim_table_2; // query-specific table + float *ltab = sim_table; // (output) list-specific table + + long k = key; + for (int cm = 0; cm < cpq.M; cm++) { + // compute PQ index + int ki = k & ((uint64_t(1) << cpq.nbits) - 1); + k >>= cpq.nbits; + + // get corresponding table + const float *pc = &ivfpq.precomputed_table + [(ki * pq.M + cm * Mf) * pq.ksub]; + + if (polysemous_ht == 0) { + + // sum up with query-specific table + fvec_madd (Mf * pq.ksub, + pc, + -2.0, qtab, + ltab); + ltab += Mf * pq.ksub; + qtab += Mf * pq.ksub; + } else { + for (int m = cm * Mf; m < (cm + 1) * Mf; m++) { + q_code[m] = fvec_madd_and_argmin + (pq.ksub, pc, -2, qtab, ltab); + pc += pq.ksub; + ltab += pq.ksub; + qtab += pq.ksub; + } + } + + } + } + + return dis0; + } + + float precompute_list_table_pointers_L2 () + { + float dis0 = 0; + + if (use_precomputed_table == 1) { + dis0 = coarse_dis; + + const float * s = &ivfpq.precomputed_table [key * pq.ksub * pq.M]; + for (int m = 0; m < pq.M; m++) { + sim_table_ptrs [m] = s; + s += pq.ksub; + } + } else if (use_precomputed_table == 2) { + dis0 = coarse_dis; + + const MultiIndexQuantizer *miq = + dynamic_cast (ivfpq.quantizer); + FAISS_THROW_IF_NOT (miq); + const ProductQuantizer &cpq = miq->pq; + int Mf = pq.M / cpq.M; + + long k = key; + int m0 = 0; + for (int cm = 0; cm < cpq.M; cm++) { + int ki = k & ((uint64_t(1) << cpq.nbits) - 1); + k >>= cpq.nbits; + + const float *pc = &ivfpq.precomputed_table + [(ki * pq.M + cm * Mf) * pq.ksub]; + + for (int m = m0; m < m0 + Mf; m++) { + sim_table_ptrs [m] = pc; + pc += pq.ksub; + } + m0 += Mf; + } + } else { + FAISS_THROW_MSG ("need precomputed tables"); + } + + if (polysemous_ht) { + FAISS_THROW_MSG ("not implemented"); + // Not clear that it makes sense to implemente this, + // because it costs M * ksub, which is what we wanted to + // avoid with the tables pointers. + } + + return dis0; + } + + +}; + + + +template +struct KnnSearchResults { + idx_t key; + const idx_t *ids; + + // heap params + size_t k; + float * heap_sim; + idx_t * heap_ids; + + size_t nup; + + inline void add (idx_t j, float dis) { + if (C::cmp (heap_sim[0], dis)) { + heap_pop (k, heap_sim, heap_ids); + idx_t id = ids ? 
ids[j] : (key << 32 | j); + heap_push (k, heap_sim, heap_ids, dis, id); + nup++; + } + } + +}; + +template +struct RangeSearchResults { + idx_t key; + const idx_t *ids; + + // wrapped result structure + float radius; + RangeQueryResult & rres; + + inline void add (idx_t j, float dis) { + if (C::cmp (radius, dis)) { + idx_t id = ids ? ids[j] : (key << 32 | j); + rres.add (dis, id); + } + } +}; + + + +/***************************************************** + * Scaning the codes. + * The scanning functions call their favorite precompute_* + * function to precompute the tables they need. + *****************************************************/ +template +struct IVFPQScannerT: QueryTables { + + const uint8_t * list_codes; + const IDType * list_ids; + size_t list_size; + + IVFPQScannerT (const IndexIVFPQ & ivfpq, const IVFSearchParameters *params): + QueryTables (ivfpq, params) + { + FAISS_THROW_IF_NOT (pq.nbits == 8); + assert(METRIC_TYPE == metric_type); + } + + float dis0; + + void init_list (idx_t list_no, float coarse_dis, + int mode) { + this->key = list_no; + this->coarse_dis = coarse_dis; + + if (mode == 2) { + dis0 = precompute_list_tables (); + } else if (mode == 1) { + dis0 = precompute_list_table_pointers (); + } + } + + /***************************************************** + * Scaning the codes: simple PQ scan. + *****************************************************/ + + /// version of the scan where we use precomputed tables + template + void scan_list_with_table (size_t ncode, const uint8_t *codes, + SearchResultType & res) const + { + for (size_t j = 0; j < ncode; j++) { + + float dis = dis0; + const float *tab = sim_table; + + for (size_t m = 0; m < pq.M; m++) { + dis += tab[*codes++]; + tab += pq.ksub; + } + + res.add(j, dis); + } + } + + + /// tables are not precomputed, but pointers are provided to the + /// relevant X_c|x_r tables + template + void scan_list_with_pointer (size_t ncode, const uint8_t *codes, + SearchResultType & res) const + { + for (size_t j = 0; j < ncode; j++) { + + float dis = dis0; + const float *tab = sim_table_2; + + for (size_t m = 0; m < pq.M; m++) { + int ci = *codes++; + dis += sim_table_ptrs [m][ci] - 2 * tab [ci]; + tab += pq.ksub; + } + res.add (j, dis); + } + } + + + /// nothing is precomputed: access residuals on-the-fly + template + void scan_on_the_fly_dist (size_t ncode, const uint8_t *codes, + SearchResultType &res) const + { + const float *dvec; + float dis0 = 0; + if (by_residual) { + if (METRIC_TYPE == METRIC_INNER_PRODUCT) { + ivfpq.quantizer->reconstruct (key, residual_vec); + dis0 = fvec_inner_product (residual_vec, qi, d); + } else { + ivfpq.quantizer->compute_residual (qi, residual_vec, key); + } + dvec = residual_vec; + } else { + dvec = qi; + dis0 = 0; + } + + for (size_t j = 0; j < ncode; j++) { + + pq.decode (codes, decoded_vec); + codes += pq.code_size; + + float dis; + if (METRIC_TYPE == METRIC_INNER_PRODUCT) { + dis = dis0 + fvec_inner_product (decoded_vec, qi, d); + } else { + dis = fvec_L2sqr (decoded_vec, dvec, d); + } + res.add (j, dis); + } + } + + /***************************************************** + * Scanning codes with polysemous filtering + *****************************************************/ + + template + void scan_list_polysemous_hc ( + size_t ncode, const uint8_t *codes, + SearchResultType & res) const + { + int ht = ivfpq.polysemous_ht; + size_t n_hamming_pass = 0, nup = 0; + + int code_size = pq.code_size; + + HammingComputer hc (q_code.data(), code_size); + + for (size_t j = 0; j < ncode; j++) { + const 
uint8_t *b_code = codes; + int hd = hc.hamming (b_code); + if (hd < ht) { + n_hamming_pass ++; + + float dis = dis0; + const float *tab = sim_table; + + for (size_t m = 0; m < pq.M; m++) { + dis += tab[*b_code++]; + tab += pq.ksub; + } + + res.add (j, dis); + } + codes += code_size; + } +#pragma omp critical + { + indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; + } + } + + template + void scan_list_polysemous ( + size_t ncode, const uint8_t *codes, + SearchResultType &res) const + { + switch (pq.code_size) { +#define HANDLE_CODE_SIZE(cs) \ + case cs: \ + scan_list_polysemous_hc \ + \ + (ncode, codes, res); \ + break + HANDLE_CODE_SIZE(4); + HANDLE_CODE_SIZE(8); + HANDLE_CODE_SIZE(16); + HANDLE_CODE_SIZE(20); + HANDLE_CODE_SIZE(32); + HANDLE_CODE_SIZE(64); +#undef HANDLE_CODE_SIZE + default: + if (pq.code_size % 8 == 0) + scan_list_polysemous_hc + + (ncode, codes, res); + else + scan_list_polysemous_hc + + (ncode, codes, res); + break; + } + } + +}; + + +/* We put as many parameters as possible in template. Hopefully the + * gain in runtime is worth the code bloat. C is the comparator < or + * >, it is directly related to METRIC_TYPE. precompute_mode is how + * much we precompute (2 = precompute distance tables, 1 = precompute + * pointers to distances, 0 = compute distances one by one). + * Currently only 2 is supported */ +template +struct IVFPQScanner: + IVFPQScannerT, + InvertedListScanner +{ + bool store_pairs; + + IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs): + IVFPQScannerT(ivfpq, nullptr), + store_pairs(store_pairs) + { + } + + void set_query (const float *query) override { + this->init_query (query); + } + + void set_list (idx_t list_no, float coarse_dis) override { + this->init_list (list_no, coarse_dis, precompute_mode); + } + + float distance_to_code (const uint8_t *code) const override { + assert(precompute_mode == 2); + float dis = this->dis0; + const float *tab = this->sim_table; + + for (size_t m = 0; m < this->pq.M; m++) { + dis += tab[*code++]; + tab += this->pq.ksub; + } + return dis; + } + + size_t scan_codes (size_t ncode, + const uint8_t *codes, + const idx_t *ids, + float *heap_sim, idx_t *heap_ids, + size_t k) const override + { + KnnSearchResults res = { + /* key */ this->key, + /* ids */ this->store_pairs ? nullptr : ids, + /* k */ k, + /* heap_sim */ heap_sim, + /* heap_ids */ heap_ids, + /* nup */ 0 + }; + + if (this->polysemous_ht > 0) { + assert(precompute_mode == 2); + this->scan_list_polysemous (ncode, codes, res); + } else if (precompute_mode == 2) { + this->scan_list_with_table (ncode, codes, res); + } else if (precompute_mode == 1) { + this->scan_list_with_pointer (ncode, codes, res); + } else if (precompute_mode == 0) { + this->scan_on_the_fly_dist (ncode, codes, res); + } else { + FAISS_THROW_MSG("bad precomp mode"); + } + return res.nup; + } + + void scan_codes_range (size_t ncode, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & rres) const override + { + RangeSearchResults res = { + /* key */ this->key, + /* ids */ this->store_pairs ? 
nullptr : ids, + /* radius */ radius, + /* rres */ rres + }; + + if (this->polysemous_ht > 0) { + assert(precompute_mode == 2); + this->scan_list_polysemous (ncode, codes, res); + } else if (precompute_mode == 2) { + this->scan_list_with_table (ncode, codes, res); + } else if (precompute_mode == 1) { + this->scan_list_with_pointer (ncode, codes, res); + } else if (precompute_mode == 0) { + this->scan_on_the_fly_dist (ncode, codes, res); + } else { + FAISS_THROW_MSG("bad precomp mode"); + } + + } +}; + + + + +} // anonymous namespace + +InvertedListScanner * +IndexIVFPQ::get_InvertedListScanner (bool store_pairs) const +{ + if (metric_type == METRIC_INNER_PRODUCT) { + return new IVFPQScanner, 2> + (*this, store_pairs); + } else if (metric_type == METRIC_L2) { + return new IVFPQScanner, 2> + (*this, store_pairs); + } + return nullptr; + +} + + + +IndexIVFPQStats indexIVFPQ_stats; + +void IndexIVFPQStats::reset () { + memset (this, 0, sizeof (*this)); +} + + + +IndexIVFPQ::IndexIVFPQ () +{ + // initialize some runtime values + use_precomputed_table = 0; + scan_table_threshold = 0; + do_polysemous_training = false; + polysemous_ht = 0; + polysemous_training = nullptr; +} + + +struct CodeCmp { + const uint8_t *tab; + size_t code_size; + bool operator () (int a, int b) const { + return cmp (a, b) > 0; + } + int cmp (int a, int b) const { + return memcmp (tab + a * code_size, tab + b * code_size, + code_size); + } +}; + + +size_t IndexIVFPQ::find_duplicates (idx_t *dup_ids, size_t *lims) const +{ + size_t ngroup = 0; + lims[0] = 0; + for (size_t list_no = 0; list_no < nlist; list_no++) { + size_t n = invlists->list_size (list_no); + std::vector ord (n); + for (int i = 0; i < n; i++) ord[i] = i; + InvertedLists::ScopedCodes codes (invlists, list_no); + CodeCmp cs = { codes.get(), code_size }; + std::sort (ord.begin(), ord.end(), cs); + + InvertedLists::ScopedIds list_ids (invlists, list_no); + int prev = -1; // all elements from prev to i-1 are equal + for (int i = 0; i < n; i++) { + if (prev >= 0 && cs.cmp (ord [prev], ord [i]) == 0) { + // same as previous => remember + if (prev + 1 == i) { // start new group + ngroup++; + lims[ngroup] = lims[ngroup - 1]; + dup_ids [lims [ngroup]++] = list_ids [ord [prev]]; + } + dup_ids [lims [ngroup]++] = list_ids [ord [i]]; + } else { // not same as previous. + prev = i; + } + } + } + return ngroup; +} + + + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQ.h b/core/src/index/thirdparty/faiss/IndexIVFPQ.h new file mode 100644 index 0000000000..f556043087 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.h @@ -0,0 +1,161 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_IVFPQ_H +#define FAISS_INDEX_IVFPQ_H + + +#include + +#include +#include + + +namespace faiss { + +struct IVFPQSearchParameters: IVFSearchParameters { + size_t scan_table_threshold; ///< use table computation or on-the-fly? + int polysemous_ht; ///< Hamming thresh for polysemous filtering + ~IVFPQSearchParameters () {} +}; + + +/** Inverted file with Product Quantizer encoding. Each residual + * vector is encoded as a product quantizer code. + */ +struct IndexIVFPQ: IndexIVF { + bool by_residual; ///< Encode residual or plain vector? + + ProductQuantizer pq; ///< produces the codes + + bool do_polysemous_training; ///< reorder PQ centroids after training? 
+ PolysemousTraining *polysemous_training; ///< if NULL, use default + + // search-time parameters + size_t scan_table_threshold; ///< use table computation or on-the-fly? + int polysemous_ht; ///< Hamming thresh for polysemous filtering + + /** Precompute table that speed up query preprocessing at some + * memory cost + * =-1: force disable + * =0: decide heuristically (default: use tables only if they are + * < precomputed_tables_max_bytes) + * =1: tables that work for all quantizers (size 256 * nlist * M) + * =2: specific version for MultiIndexQuantizer (much more compact) + */ + int use_precomputed_table; ///< if by_residual, build precompute tables + static size_t precomputed_table_max_bytes; + + /// if use_precompute_table + /// size nlist * pq.M * pq.ksub + std::vector precomputed_table; + + IndexIVFPQ ( + Index * quantizer, size_t d, size_t nlist, + size_t M, size_t nbits_per_idx); + + void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr) + override; + + void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos = false) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + + /// same as add_core, also: + /// - output 2nd level residuals if residuals_2 != NULL + /// - use precomputed list numbers if precomputed_idx != NULL + void add_core_o (idx_t n, const float *x, + const idx_t *xids, float *residuals_2, + const idx_t *precomputed_idx = nullptr); + + /// trains the product quantizer + void train_residual(idx_t n, const float* x) override; + + /// same as train_residual, also output 2nd level residuals + void train_residual_o (idx_t n, const float *x, float *residuals_2); + + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + /** Find exact duplicates in the dataset. + * + * the duplicates are returned in pre-allocated arrays (see the + * max sizes). 
+ * + * @params lims limits between groups of duplicates + * (max size ntotal / 2 + 1) + * @params ids ids[lims[i]] : ids[lims[i+1]-1] is a group of + * duplicates (max size ntotal) + * @return n number of groups found + */ + size_t find_duplicates (idx_t *ids, size_t *lims) const; + + // map a vector to a binary code knowning the index + void encode (idx_t key, const float * x, uint8_t * code) const; + + /** Encode multiple vectors + * + * @param n nb vectors to encode + * @param keys posting list ids for those vectors (size n) + * @param x vectors (size n * d) + * @param codes output codes (size n * code_size) + * @param compute_keys if false, assume keys are precomputed, + * otherwise compute them + */ + void encode_multiple (size_t n, idx_t *keys, + const float * x, uint8_t * codes, + bool compute_keys = false) const; + + /// inverse of encode_multiple + void decode_multiple (size_t n, const idx_t *keys, + const uint8_t * xcodes, float * x) const; + + InvertedListScanner *get_InvertedListScanner (bool store_pairs) + const override; + + /// build precomputed table + void precompute_table (); + + IndexIVFPQ (); + +}; + + +/// statistics are robust to internal threading, but not if +/// IndexIVFPQ::search_preassigned is called by multiple threads +struct IndexIVFPQStats { + size_t nrefine; // nb of refines (IVFPQR) + + size_t n_hamming_pass; + // nb of passed Hamming distance tests (for polysemous) + + // timings measured with the CPU RTC + // on all threads + size_t search_cycles; + size_t refine_cycles; // only for IVFPQR + + IndexIVFPQStats () {reset (); } + void reset (); +}; + +// global var that collects them all +extern IndexIVFPQStats indexIVFPQ_stats; + + + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp new file mode 100644 index 0000000000..44562b0647 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp @@ -0,0 +1,219 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
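With the IndexIVFPQ interface above complete, a minimal usage sketch shows how it is typically driven through the generic train/add/search API. Parameter values and the faiss/ include prefix are assumptions for illustration; error handling is omitted.

// Hedged usage sketch for IndexIVFPQ (all sizes illustrative).
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <random>
#include <vector>

int main() {
    const int d = 64, nlist = 128, M = 8, nbits = 8;
    const size_t nb = 10000, nq = 5, k = 4;

    std::mt19937 rng(42);
    std::uniform_real_distribution<float> u(0, 1);
    std::vector<float> xb(nb * d), xq(nq * d);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    faiss::IndexFlatL2 quantizer(d);                      // coarse quantizer
    faiss::IndexIVFPQ index(&quantizer, d, nlist, M, nbits);
    index.train(nb, xb.data());                           // trains coarse lists + PQ
    index.add(nb, xb.data());

    index.nprobe = 8;                                     // inverted lists visited per query
    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}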
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include + +#include + + +namespace faiss { + +/***************************************** + * IndexIVFPQR implementation + ******************************************/ + +IndexIVFPQR::IndexIVFPQR ( + Index * quantizer, size_t d, size_t nlist, + size_t M, size_t nbits_per_idx, + size_t M_refine, size_t nbits_per_idx_refine): + IndexIVFPQ (quantizer, d, nlist, M, nbits_per_idx), + refine_pq (d, M_refine, nbits_per_idx_refine), + k_factor (4) +{ + by_residual = true; +} + +IndexIVFPQR::IndexIVFPQR (): + k_factor (1) +{ + by_residual = true; +} + + + +void IndexIVFPQR::reset() +{ + IndexIVFPQ::reset(); + refine_codes.clear(); +} + + + + +void IndexIVFPQR::train_residual (idx_t n, const float *x) +{ + + float * residual_2 = new float [n * d]; + ScopeDeleter del(residual_2); + + train_residual_o (n, x, residual_2); + + if (verbose) + printf ("training %zdx%zd 2nd level PQ quantizer on %ld %dD-vectors\n", + refine_pq.M, refine_pq.ksub, n, d); + + refine_pq.cp.max_points_per_centroid = 1000; + refine_pq.cp.verbose = verbose; + + refine_pq.train (n, residual_2); + +} + + +void IndexIVFPQR::add_with_ids (idx_t n, const float *x, const idx_t *xids) { + add_core (n, x, xids, nullptr); +} + +void IndexIVFPQR::add_core (idx_t n, const float *x, const idx_t *xids, + const idx_t *precomputed_idx) { + + float * residual_2 = new float [n * d]; + ScopeDeleter del(residual_2); + + idx_t n0 = ntotal; + + add_core_o (n, x, xids, residual_2, precomputed_idx); + + refine_codes.resize (ntotal * refine_pq.code_size); + + refine_pq.compute_codes ( + residual_2, &refine_codes[n0 * refine_pq.code_size], n); + + +} +#define TIC t0 = get_cycles() +#define TOC get_cycles () - t0 + + +void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k, + const idx_t *idx, + const float *L1_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params + ) const +{ + uint64_t t0; + TIC; + size_t k_coarse = long(k * k_factor); + idx_t *coarse_labels = new idx_t [k_coarse * n]; + ScopeDeleter del1 (coarse_labels); + { // query with quantizer levels 1 and 2. 
+ float *coarse_distances = new float [k_coarse * n]; + ScopeDeleter del(coarse_distances); + + IndexIVFPQ::search_preassigned ( + n, x, k_coarse, + idx, L1_dis, coarse_distances, coarse_labels, + true, params); + } + + + indexIVFPQ_stats.search_cycles += TOC; + + TIC; + + // 3rd level refinement + size_t n_refine = 0; +#pragma omp parallel reduction(+ : n_refine) + { + // tmp buffers + float *residual_1 = new float [2 * d]; + ScopeDeleter del (residual_1); + float *residual_2 = residual_1 + d; +#pragma omp for + for (idx_t i = 0; i < n; i++) { + const float *xq = x + i * d; + const idx_t * shortlist = coarse_labels + k_coarse * i; + float * heap_sim = distances + k * i; + idx_t * heap_ids = labels + k * i; + maxheap_heapify (k, heap_sim, heap_ids); + + for (int j = 0; j < k_coarse; j++) { + idx_t sl = shortlist[j]; + + if (sl == -1) continue; + + int list_no = sl >> 32; + int ofs = sl & 0xffffffff; + + assert (list_no >= 0 && list_no < nlist); + assert (ofs >= 0 && ofs < invlists->list_size (list_no)); + + // 1st level residual + quantizer->compute_residual (xq, residual_1, list_no); + + // 2nd level residual + const uint8_t * l2code = + invlists->get_single_code (list_no, ofs); + + pq.decode (l2code, residual_2); + for (int l = 0; l < d; l++) + residual_2[l] = residual_1[l] - residual_2[l]; + + // 3rd level residual's approximation + idx_t id = invlists->get_single_id (list_no, ofs); + assert (0 <= id && id < ntotal); + refine_pq.decode (&refine_codes [id * refine_pq.code_size], + residual_1); + + float dis = fvec_L2sqr (residual_1, residual_2, d); + + if (dis < heap_sim[0]) { + maxheap_pop (k, heap_sim, heap_ids); + idx_t id_or_pair = store_pairs ? sl : id; + maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair); + } + n_refine ++; + } + maxheap_reorder (k, heap_sim, heap_ids); + } + } + indexIVFPQ_stats.nrefine += n_refine; + indexIVFPQ_stats.refine_cycles += TOC; +} + +void IndexIVFPQR::reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const +{ + IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons); + + idx_t id = invlists->get_single_id (list_no, offset); + assert (0 <= id && id < ntotal); + + std::vector r3(d); + refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data()); + for (int i = 0; i < d; ++i) { + recons[i] += r3[i]; + } +} + +void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id) +{ + IndexIVFPQR *other = dynamic_cast (&other_in); + FAISS_THROW_IF_NOT(other); + + IndexIVF::merge_from (other_in, add_id); + + refine_codes.insert (refine_codes.end(), + other->refine_codes.begin(), + other->refine_codes.end()); + other->refine_codes.clear(); +} + +size_t IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) { + FAISS_THROW_MSG("not implemented"); + return 0; +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQR.h b/core/src/index/thirdparty/faiss/IndexIVFPQR.h new file mode 100644 index 0000000000..934b912d25 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFPQR.h @@ -0,0 +1,65 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
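Schematically, the refinement stage implemented above is a shortlist re-ranking: search_preassigned first asks the IVFPQ layer for k * k_factor candidates (with store_pairs, so each hit carries its list number and offset), then re-scores those candidates with the refine_pq reconstruction and keeps the best k. A generic sketch of that control flow, with a stand-in precise_dis callback in place of the residual decoding, is shown here.

// Hedged sketch of shortlist re-ranking; precise_dis stands in for the
// refine_pq-based distance recomputed per candidate in the loop above.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

struct Hit { float dis; int64_t id; };

std::vector<Hit> rerank(const std::vector<Hit>& shortlist, size_t k,
                        const std::function<float(int64_t)>& precise_dis) {
    std::vector<Hit> out;
    out.reserve(shortlist.size());
    for (const Hit& h : shortlist)
        out.push_back({precise_dis(h.id), h.id});          // refine every candidate
    size_t kk = std::min(k, out.size());
    std::partial_sort(out.begin(), out.begin() + kk, out.end(),
                      [](const Hit& a, const Hit& b) { return a.dis < b.dis; });
    out.resize(kk);
    return out;
}

int main() {
    std::vector<Hit> shortlist = {{10.f, 1}, {9.f, 2}, {8.f, 3}, {7.f, 4}};
    auto precise = [](int64_t id) { return float(id); };   // toy precise distance
    auto top = rerank(shortlist, 2, precise);
    return (top.size() == 2 && top[0].id == 1) ? 0 : 1;
}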
+ */ + +// -*- c++ -*- + +#pragma once + +#include + +#include + + +namespace faiss { + + + +/** Index with an additional level of PQ refinement */ +struct IndexIVFPQR: IndexIVFPQ { + ProductQuantizer refine_pq; ///< 3rd level quantizer + std::vector refine_codes; ///< corresponding codes + + /// factor between k requested in search and the k requested from the IVFPQ + float k_factor; + + IndexIVFPQR ( + Index * quantizer, size_t d, size_t nlist, + size_t M, size_t nbits_per_idx, + size_t M_refine, size_t nbits_per_idx_refine); + + void reset() override; + + size_t remove_ids(const IDSelector& sel) override; + + /// trains the two product quantizers + void train_residual(idx_t n, const float* x) override; + + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + /// same as add_with_ids, but optionally use the precomputed list ids + void add_core (idx_t n, const float *x, const idx_t *xids, + const idx_t *precomputed_idx = nullptr); + + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + void merge_from (IndexIVF &other, idx_t add_id) override; + + + void search_preassigned (idx_t n, const float *x, idx_t k, + const idx_t *assign, + const float *centroid_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params=nullptr + ) const override; + + IndexIVFPQR(); +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp new file mode 100644 index 0000000000..cab78d0f16 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp @@ -0,0 +1,331 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
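For completeness, constructing the IndexIVFPQR declared in the header above takes the first-level PQ size plus the refinement PQ size; k_factor then controls how many IVFPQ candidates get refined per query. A construction sketch follows (parameter values arbitrary, faiss/ include prefix assumed).

// Hedged construction sketch for IndexIVFPQR.
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQR.h>

int main() {
    const int d = 64;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFPQR index(&quantizer, d, /*nlist=*/256,
                             /*M=*/8, /*nbits_per_idx=*/8,
                             /*M_refine=*/16, /*nbits_per_idx_refine=*/8);
    index.k_factor = 4;   // refine 4*k shortlist entries before returning k results
    return 0;
}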
+ */ + +// -*- c++ -*- + + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace faiss { + + +IndexIVFSpectralHash::IndexIVFSpectralHash ( + Index * quantizer, size_t d, size_t nlist, + int nbit, float period): + IndexIVF (quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2), + nbit (nbit), period (period), threshold_type (Thresh_global) +{ + FAISS_THROW_IF_NOT (code_size % 4 == 0); + RandomRotationMatrix *rr = new RandomRotationMatrix (d, nbit); + rr->init (1234); + vt = rr; + own_fields = true; + is_trained = false; +} + +IndexIVFSpectralHash::IndexIVFSpectralHash(): + IndexIVF(), vt(nullptr), own_fields(false), + nbit(0), period(0), threshold_type(Thresh_global) +{} + +IndexIVFSpectralHash::~IndexIVFSpectralHash () +{ + if (own_fields) { + delete vt; + } +} + +namespace { + + +float median (size_t n, float *x) { + std::sort(x, x + n); + if (n % 2 == 1) { + return x [n / 2]; + } else { + return (x [n / 2 - 1] + x [n / 2]) / 2; + } +} + +} + + +void IndexIVFSpectralHash::train_residual (idx_t n, const float *x) +{ + if (!vt->is_trained) { + vt->train (n, x); + } + + if (threshold_type == Thresh_global) { + // nothing to do + return; + } else if (threshold_type == Thresh_centroid || + threshold_type == Thresh_centroid_half) { + // convert all centroids with vt + std::vector centroids (nlist * d); + quantizer->reconstruct_n (0, nlist, centroids.data()); + trained.resize(nlist * nbit); + vt->apply_noalloc (nlist, centroids.data(), trained.data()); + if (threshold_type == Thresh_centroid_half) { + for (size_t i = 0; i < nlist * nbit; i++) { + trained[i] -= 0.25 * period; + } + } + return; + } + // otherwise train medians + + // assign + std::unique_ptr idx (new idx_t [n]); + quantizer->assign (n, x, idx.get()); + + std::vector sizes(nlist + 1); + for (size_t i = 0; i < n; i++) { + FAISS_THROW_IF_NOT (idx[i] >= 0); + sizes[idx[i]]++; + } + + size_t ofs = 0; + for (int j = 0; j < nlist; j++) { + size_t o0 = ofs; + ofs += sizes[j]; + sizes[j] = o0; + } + + // transform + std::unique_ptr xt (vt->apply (n, x)); + + // transpose + reorder + std::unique_ptr xo (new float[n * nbit]); + + for (size_t i = 0; i < n; i++) { + size_t idest = sizes[idx[i]]++; + for (size_t j = 0; j < nbit; j++) { + xo[idest + n * j] = xt[i * nbit + j]; + } + } + + trained.resize (n * nbit); + // compute medians +#pragma omp for + for (int i = 0; i < nlist; i++) { + size_t i0 = i == 0 ? 
0 : sizes[i - 1]; + size_t i1 = sizes[i]; + for (int j = 0; j < nbit; j++) { + float *xoi = xo.get() + i0 + n * j; + if (i0 == i1) { // nothing to train + trained[i * nbit + j] = 0.0; + } else if (i1 == i0 + 1) { + trained[i * nbit + j] = xoi[0]; + } else { + trained[i * nbit + j] = median(i1 - i0, xoi); + } + } + } +} + + +namespace { + +void binarize_with_freq(size_t nbit, float freq, + const float *x, const float *c, + uint8_t *codes) +{ + memset (codes, 0, (nbit + 7) / 8); + for (size_t i = 0; i < nbit; i++) { + float xf = (x[i] - c[i]); + int xi = int(floor(xf * freq)); + int bit = xi & 1; + codes[i >> 3] |= bit << (i & 7); + } +} + + +}; + + + +void IndexIVFSpectralHash::encode_vectors(idx_t n, const float* x_in, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos) const +{ + FAISS_THROW_IF_NOT (is_trained); + float freq = 2.0 / period; + + FAISS_THROW_IF_NOT_MSG (!include_listnos, "listnos encoding not supported"); + + // transform with vt + std::unique_ptr x (vt->apply (n, x_in)); + +#pragma omp parallel + { + std::vector zero (nbit); + + // each thread takes care of a subset of lists +#pragma omp for + for (size_t i = 0; i < n; i++) { + int64_t list_no = list_nos [i]; + + if (list_no >= 0) { + const float *c; + if (threshold_type == Thresh_global) { + c = zero.data(); + } else { + c = trained.data() + list_no * nbit; + } + binarize_with_freq (nbit, freq, + x.get() + i * nbit, c, + codes + i * code_size) ; + } + } + } +} + +namespace { + + +template +struct IVFScanner: InvertedListScanner { + + // copied from index structure + const IndexIVFSpectralHash *index; + size_t code_size; + size_t nbit; + bool store_pairs; + + float period, freq; + std::vector q; + std::vector zero; + std::vector qcode; + HammingComputer hc; + + using idx_t = Index::idx_t; + + IVFScanner (const IndexIVFSpectralHash * index, + bool store_pairs): + index (index), + code_size(index->code_size), + nbit(index->nbit), + store_pairs(store_pairs), + period(index->period), freq(2.0 / index->period), + q(nbit), zero(nbit), qcode(code_size), + hc(qcode.data(), code_size) + { + } + + + void set_query (const float *query) override { + FAISS_THROW_IF_NOT(query); + FAISS_THROW_IF_NOT(q.size() == nbit); + index->vt->apply_noalloc (1, query, q.data()); + + if (index->threshold_type == + IndexIVFSpectralHash::Thresh_global) { + binarize_with_freq + (nbit, freq, q.data(), zero.data(), qcode.data()); + hc.set (qcode.data(), code_size); + } + } + + idx_t list_no; + + void set_list (idx_t list_no, float /*coarse_dis*/) override { + this->list_no = list_no; + if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) { + const float *c = index->trained.data() + list_no * nbit; + binarize_with_freq (nbit, freq, q.data(), c, qcode.data()); + hc.set (qcode.data(), code_size); + } + } + + float distance_to_code (const uint8_t *code) const final { + return hc.hamming (code); + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k) const override + { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + + float dis = hc.hamming (codes); + + if (dis < simi [0]) { + maxheap_pop (k, simi, idxi); + int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; + maxheap_push (k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res) const override + { + for (size_t j = 0; j < list_size; j++) { + float dis = hc.hamming (codes); + if (dis < radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add (dis, id); + } + codes += code_size; + } + } + + +}; + +} // anonymous namespace + +InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner + (bool store_pairs) const +{ + switch (code_size) { +#define HANDLE_CODE_SIZE(cs) \ + case cs: \ + return new IVFScanner (this, store_pairs) + HANDLE_CODE_SIZE(4); + HANDLE_CODE_SIZE(8); + HANDLE_CODE_SIZE(16); + HANDLE_CODE_SIZE(20); + HANDLE_CODE_SIZE(32); + HANDLE_CODE_SIZE(64); +#undef HANDLE_CODE_SIZE + default: + if (code_size % 8 == 0) { + return new IVFScanner(this, store_pairs); + } else if (code_size % 4 == 0) { + return new IVFScanner(this, store_pairs); + } else { + FAISS_THROW_MSG("not supported"); + } + } + +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.h b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.h new file mode 100644 index 0000000000..ee01ac81cd --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.h @@ -0,0 +1,75 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_IVFSH_H +#define FAISS_INDEX_IVFSH_H + + +#include + +#include + + +namespace faiss { + +struct VectorTransform; + +/** Inverted list that stores binary codes of size nbit. Before the + * binary conversion, the dimension of the vectors is transformed from + * dim d into dim nbit by vt (a random rotation by default). + * + * Each coordinate is subtracted from a value determined by + * threshold_type, and split into intervals of size period. Half of + * the interval is a 0 bit, the other half a 1. + */ +struct IndexIVFSpectralHash: IndexIVF { + + VectorTransform *vt; // transformation from d to nbit dim + bool own_fields; + + int nbit; + float period; + + enum ThresholdType { + Thresh_global, + Thresh_centroid, + Thresh_centroid_half, + Thresh_median + }; + ThresholdType threshold_type; + + // size nlist * nbit or 0 if Thresh_global + std::vector trained; + + IndexIVFSpectralHash (Index * quantizer, size_t d, size_t nlist, + int nbit, float period); + + IndexIVFSpectralHash (); + + void train_residual(idx_t n, const float* x) override; + + void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos = false) const override; + + InvertedListScanner *get_InvertedListScanner (bool store_pairs) + const override; + + ~IndexIVFSpectralHash () override; + +}; + + + + +}; // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexLSH.cpp b/core/src/index/thirdparty/faiss/IndexLSH.cpp new file mode 100644 index 0000000000..c6149f8ea8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexLSH.cpp @@ -0,0 +1,225 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
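The binarization rule used throughout IndexIVFSpectralHash above (binarize_with_freq) is simple enough to restate standalone: each transformed coordinate is shifted by its per-list threshold, scaled by 2/period, and the parity of the integer part gives the bit, so the bit flips every half period. A minimal sketch mirroring that function (names are illustrative, not part of the patched sources):

// Hedged sketch of the spectral-hash binarization; mirrors binarize_with_freq.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

void binarize(size_t nbit, float period, const float* x, const float* c,
              uint8_t* code) {
    float freq = 2.0f / period;
    std::memset(code, 0, (nbit + 7) / 8);
    for (size_t i = 0; i < nbit; i++) {
        int xi = (int)std::floor((x[i] - c[i]) * freq);
        code[i >> 3] |= (xi & 1) << (i & 7);       // parity of the half-period index
    }
}

int main() {
    std::vector<float> x = {0.1f, 0.6f, 1.1f, -0.4f};
    std::vector<float> c(4, 0.0f);                 // Thresh_global: threshold is 0
    uint8_t code[1];
    binarize(4, 1.0f, x.data(), c.data(), code);
    std::printf("code = 0x%02x\n", code[0]);       // bits 0,1,0,1 -> 0x0a
    return 0;
}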
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include + +#include +#include +#include + + +namespace faiss { + +/*************************************************************** + * IndexLSH + ***************************************************************/ + + +IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds): + Index(d), nbits(nbits), rotate_data(rotate_data), + train_thresholds (train_thresholds), rrot(d, nbits) +{ + is_trained = !train_thresholds; + + bytes_per_vec = (nbits + 7) / 8; + + if (rotate_data) { + rrot.init(5); + } else { + FAISS_THROW_IF_NOT (d >= nbits); + } +} + +IndexLSH::IndexLSH (): + nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false) +{ +} + + +const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const +{ + + float *xt = nullptr; + if (rotate_data) { + // also applies bias if exists + xt = rrot.apply (n, x); + } else if (d != nbits) { + assert (nbits < d); + xt = new float [nbits * n]; + float *xp = xt; + for (idx_t i = 0; i < n; i++) { + const float *xl = x + i * d; + for (int j = 0; j < nbits; j++) + *xp++ = xl [j]; + } + } + + if (train_thresholds) { + + if (xt == NULL) { + xt = new float [nbits * n]; + memcpy (xt, x, sizeof(*x) * n * nbits); + } + + float *xp = xt; + for (idx_t i = 0; i < n; i++) + for (int j = 0; j < nbits; j++) + *xp++ -= thresholds [j]; + } + + return xt ? xt : x; +} + + + +void IndexLSH::train (idx_t n, const float *x) +{ + if (train_thresholds) { + thresholds.resize (nbits); + train_thresholds = false; + const float *xt = apply_preprocess (n, x); + ScopeDeleter del (xt == x ? nullptr : xt); + train_thresholds = true; + + float * transposed_x = new float [n * nbits]; + ScopeDeleter del2 (transposed_x); + + for (idx_t i = 0; i < n; i++) + for (idx_t j = 0; j < nbits; j++) + transposed_x [j * n + i] = xt [i * nbits + j]; + + for (idx_t i = 0; i < nbits; i++) { + float *xi = transposed_x + i * n; + // std::nth_element + std::sort (xi, xi + n); + if (n % 2 == 1) + thresholds [i] = xi [n / 2]; + else + thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2; + + } + } + is_trained = true; +} + + +void IndexLSH::add (idx_t n, const float *x) +{ + FAISS_THROW_IF_NOT (is_trained); + codes.resize ((ntotal + n) * bytes_per_vec); + + sa_encode (n, x, &codes[ntotal * bytes_per_vec]); + + ntotal += n; +} + + +void IndexLSH::search ( + idx_t n, + const float *x, + idx_t k, + float *distances, + idx_t *labels) const +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_preprocess (n, x); + ScopeDeleter del (xt == x ? 
nullptr : xt); + + uint8_t * qcodes = new uint8_t [n * bytes_per_vec]; + ScopeDeleter del2 (qcodes); + + fvecs2bitvecs (xt, qcodes, nbits, n); + + int * idistances = new int [n * k]; + ScopeDeleter del3 (idistances); + + int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances}; + + hammings_knn_hc (&res, qcodes, codes.data(), + ntotal, bytes_per_vec, true); + + + // convert distances to floats + for (int i = 0; i < k * n; i++) + distances[i] = idistances[i]; + +} + + +void IndexLSH::transfer_thresholds (LinearTransform *vt) { + if (!train_thresholds) return; + FAISS_THROW_IF_NOT (nbits == vt->d_out); + if (!vt->have_bias) { + vt->b.resize (nbits, 0); + vt->have_bias = true; + } + for (int i = 0; i < nbits; i++) + vt->b[i] -= thresholds[i]; + train_thresholds = false; + thresholds.clear(); +} + +void IndexLSH::reset() { + codes.clear(); + ntotal = 0; +} + + +size_t IndexLSH::sa_code_size () const +{ + return bytes_per_vec; +} + +void IndexLSH::sa_encode (idx_t n, const float *x, + uint8_t *bytes) const +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_preprocess (n, x); + ScopeDeleter del (xt == x ? nullptr : xt); + fvecs2bitvecs (xt, bytes, nbits, n); +} + +void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes, + float *x) const +{ + float *xt = x; + ScopeDeleter del; + if (rotate_data || nbits != d) { + xt = new float [n * nbits]; + del.set(xt); + } + bitvecs2fvecs (bytes, xt, nbits, n); + + if (train_thresholds) { + float *xp = xt; + for (idx_t i = 0; i < n; i++) { + for (int j = 0; j < nbits; j++) { + *xp++ += thresholds [j]; + } + } + } + + if (rotate_data) { + rrot.reverse_transform (n, xt, x); + } else if (nbits != d) { + for (idx_t i = 0; i < n; i++) { + memcpy (x + i * d, xt + i * nbits, + nbits * sizeof(xt[0])); + } + } +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexLSH.h b/core/src/index/thirdparty/faiss/IndexLSH.h new file mode 100644 index 0000000000..1b45022809 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexLSH.h @@ -0,0 +1,87 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef INDEX_LSH_H +#define INDEX_LSH_H + +#include + +#include +#include + +namespace faiss { + + +/** The sign of each vector component is put in a binary signature */ +struct IndexLSH:Index { + typedef unsigned char uint8_t; + + int nbits; ///< nb of bits per vector + int bytes_per_vec; ///< nb of 8-bits per encoded vector + bool rotate_data; ///< whether to apply a random rotation to input + bool train_thresholds; ///< whether we train thresholds or use 0 + + RandomRotationMatrix rrot; ///< optional random rotation + + std::vector thresholds; ///< thresholds to compare with + + /// encoded dataset + std::vector codes; + + IndexLSH ( + idx_t d, int nbits, + bool rotate_data = true, + bool train_thresholds = false); + + /** Preprocesses and resizes the input to the size required to + * binarize the data + * + * @param x input vectors, size n * d + * @return output vectors, size n * bits. 
May be the same pointer + * as x, otherwise it should be deleted by the caller + */ + const float *apply_preprocess (idx_t n, const float *x) const; + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void reset() override; + + /// transfer the thresholds to a pre-processing stage (and unset + /// train_thresholds) + void transfer_thresholds (LinearTransform * vt); + + ~IndexLSH() override {} + + IndexLSH (); + + /* standalone codec interface */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + +}; + + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexLattice.cpp b/core/src/index/thirdparty/faiss/IndexLattice.cpp new file mode 100644 index 0000000000..83ceb12778 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexLattice.cpp @@ -0,0 +1,143 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + + +#include +#include // for the bitstring routines +#include +#include + +namespace faiss { + + +IndexLattice::IndexLattice (idx_t d, int nsq, int scale_nbit, int r2): + Index (d), + nsq (nsq), + dsq (d / nsq), + zn_sphere_codec (dsq, r2), + scale_nbit (scale_nbit) +{ + FAISS_THROW_IF_NOT (d % nsq == 0); + + lattice_nbit = 0; + while (!( ((uint64_t)1 << lattice_nbit) >= zn_sphere_codec.nv)) { + lattice_nbit++; + } + + int total_nbit = (lattice_nbit + scale_nbit) * nsq; + + code_size = (total_nbit + 7) / 8; + + is_trained = false; +} + +void IndexLattice::train(idx_t n, const float* x) +{ + // compute ranges per sub-block + trained.resize (nsq * 2); + float * mins = trained.data(); + float * maxs = trained.data() + nsq; + for (int sq = 0; sq < nsq; sq++) { + mins[sq] = HUGE_VAL; + maxs[sq] = -1; + } + + for (idx_t i = 0; i < n; i++) { + for (int sq = 0; sq < nsq; sq++) { + float norm2 = fvec_norm_L2sqr (x + i * d + sq * dsq, dsq); + if (norm2 > maxs[sq]) maxs[sq] = norm2; + if (norm2 < mins[sq]) mins[sq] = norm2; + } + } + + for (int sq = 0; sq < nsq; sq++) { + mins[sq] = sqrtf (mins[sq]); + maxs[sq] = sqrtf (maxs[sq]); + } + + is_trained = true; +} + +/* The standalone codec interface */ +size_t IndexLattice::sa_code_size () const +{ + return code_size; +} + + + +void IndexLattice::sa_encode (idx_t n, const float *x, uint8_t *codes) const +{ + + const float * mins = trained.data(); + const float * maxs = mins + nsq; + int64_t sc = int64_t(1) << scale_nbit; + +#pragma omp parallel for + for (idx_t i = 0; i < n; i++) { + BitstringWriter wr(codes + i * code_size, code_size); + const float *xi = x + i * d; + for (int j = 0; j < nsq; j++) { + float nj = + (sqrtf(fvec_norm_L2sqr(xi, dsq)) - mins[j]) + * sc / (maxs[j] - mins[j]); + if (nj < 0) nj = 0; + if (nj >= sc) nj = sc - 1; + wr.write((int64_t)nj, scale_nbit); + wr.write(zn_sphere_codec.encode(xi), lattice_nbit); + xi += dsq; + } + } +} + +void IndexLattice::sa_decode (idx_t n, const uint8_t *codes, float *x) const +{ + const float * mins = trained.data(); + const float * maxs = mins + nsq; + float sc = int64_t(1) << scale_nbit; + float r = sqrtf(zn_sphere_codec.r2); + +#pragma omp parallel for + for (idx_t i = 0; i < n; i++) { + BitstringReader rd(codes + i * 
code_size, code_size); + float *xi = x + i * d; + for (int j = 0; j < nsq; j++) { + float norm = + (rd.read (scale_nbit) + 0.5) * + (maxs[j] - mins[j]) / sc + mins[j]; + norm /= r; + zn_sphere_codec.decode (rd.read (lattice_nbit), xi); + for (int l = 0; l < dsq; l++) { + xi[l] *= norm; + } + xi += dsq; + } + } +} + +void IndexLattice::add(idx_t , const float* ) +{ + FAISS_THROW_MSG("not implemented"); +} + + +void IndexLattice::search(idx_t , const float* , idx_t , + float* , idx_t* ) const +{ + FAISS_THROW_MSG("not implemented"); +} + + +void IndexLattice::reset() +{ + FAISS_THROW_MSG("not implemented"); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexLattice.h b/core/src/index/thirdparty/faiss/IndexLattice.h new file mode 100644 index 0000000000..7a150d035b --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexLattice.h @@ -0,0 +1,68 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_LATTICE_H +#define FAISS_INDEX_LATTICE_H + + +#include + +#include +#include + +namespace faiss { + + + + + +/** Index that encodes a vector with a series of Zn lattice quantizers + */ +struct IndexLattice: Index { + + /// number of sub-vectors + int nsq; + /// dimension of sub-vectors + size_t dsq; + + /// the lattice quantizer + ZnSphereCodecAlt zn_sphere_codec; + + /// nb bits used to encode the scale, per subvector + int scale_nbit, lattice_nbit; + /// total, in bytes + size_t code_size; + + /// mins and maxes of the vector norms, per subquantizer + std::vector trained; + + IndexLattice (idx_t d, int nsq, int scale_nbit, int r2); + + void train(idx_t n, const float* x) override; + + /* The standalone codec interface */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + /// not implemented + void add(idx_t n, const float* x) override; + void search(idx_t n, const float* x, idx_t k, + float* distances, idx_t* labels) const override; + void reset() override; + +}; + +} // namespace faiss + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexPQ.cpp b/core/src/index/thirdparty/faiss/IndexPQ.cpp new file mode 100644 index 0000000000..5357518ae0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexPQ.cpp @@ -0,0 +1,1188 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace faiss { + +/********************************************************* + * IndexPQ implementation + ********************************************************/ + + +IndexPQ::IndexPQ (int d, size_t M, size_t nbits, MetricType metric): + Index(d, metric), pq(d, M, nbits) +{ + is_trained = false; + do_polysemous_training = false; + polysemous_ht = nbits * M + 1; + search_type = ST_PQ; + encode_signs = false; +} + +IndexPQ::IndexPQ () +{ + metric_type = METRIC_L2; + is_trained = false; + do_polysemous_training = false; + polysemous_ht = pq.nbits * pq.M + 1; + search_type = ST_PQ; + encode_signs = false; +} + + +void IndexPQ::train (idx_t n, const float *x) +{ + if (!do_polysemous_training) { // standard training + pq.train(n, x); + } else { + idx_t ntrain_perm = polysemous_training.ntrain_permutation; + + if (ntrain_perm > n / 4) + ntrain_perm = n / 4; + if (verbose) { + printf ("PQ training on %ld points, remains %ld points: " + "training polysemous on %s\n", + n - ntrain_perm, ntrain_perm, + ntrain_perm == 0 ? "centroids" : "these"); + } + pq.train(n - ntrain_perm, x); + + polysemous_training.optimize_pq_for_hamming ( + pq, ntrain_perm, x + (n - ntrain_perm) * d); + } + is_trained = true; +} + + +void IndexPQ::add (idx_t n, const float *x) +{ + FAISS_THROW_IF_NOT (is_trained); + codes.resize ((n + ntotal) * pq.code_size); + pq.compute_codes (x, &codes[ntotal * pq.code_size], n); + ntotal += n; +} + + +size_t IndexPQ::remove_ids (const IDSelector & sel) +{ + idx_t j = 0; + for (idx_t i = 0; i < ntotal; i++) { + if (sel.is_member (i)) { + // should be removed + } else { + if (i > j) { + memmove (&codes[pq.code_size * j], &codes[pq.code_size * i], pq.code_size); + } + j++; + } + } + size_t nremove = ntotal - j; + if (nremove > 0) { + ntotal = j; + codes.resize (ntotal * pq.code_size); + } + return nremove; +} + + +void IndexPQ::reset() +{ + codes.clear(); + ntotal = 0; +} + +void IndexPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const +{ + FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal)); + for (idx_t i = 0; i < ni; i++) { + const uint8_t * code = &codes[(i0 + i) * pq.code_size]; + pq.decode (code, recons + i * d); + } +} + + +void IndexPQ::reconstruct (idx_t key, float * recons) const +{ + FAISS_THROW_IF_NOT (key >= 0 && key < ntotal); + pq.decode (&codes[key * pq.code_size], recons); +} + + +namespace { + + +struct PQDis: DistanceComputer { + size_t d; + Index::idx_t nb; + const uint8_t *codes; + size_t code_size; + const ProductQuantizer & pq; + const float *sdc; + std::vector precomputed_table; + size_t ndis; + + float operator () (idx_t i) override + { + const uint8_t *code = codes + i * code_size; + const float *dt = precomputed_table.data(); + float accu = 0; + for (int j = 0; j < pq.M; j++) { + accu += dt[*code++]; + dt += 256; + } + ndis++; + return accu; + } + + float symmetric_dis(idx_t i, idx_t j) override + { + const float * sdci = sdc; + float accu = 0; + const uint8_t *codei = codes + i * code_size; + const uint8_t *codej = codes + j * code_size; + + for (int l = 0; l < pq.M; l++) { + accu += sdci[(*codei++) + (*codej++) * 256]; + sdci += 256 * 256; + } + return accu; + } + + explicit PQDis(const IndexPQ& storage, const float* /*q*/ = nullptr) + : pq(storage.pq) { + precomputed_table.resize(pq.M * pq.ksub); + nb = storage.ntotal; + d = storage.d; + codes = storage.codes.data(); + code_size = pq.code_size; + 
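+        // Note (descriptive comment, not in the original FAISS source):
+        // the asymmetric distance in operator() walks the precomputed table
+        // in blocks of 256 entries, so 8-bit sub-quantizer codes are assumed
+        // (ksub == 256); symmetric_dis() additionally needs the precomputed
+        // SDC table of size M * 256 * 256, which the asserts below verify.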
FAISS_ASSERT(pq.ksub == 256); + FAISS_ASSERT(pq.sdc_table.size() == pq.ksub * pq.ksub * pq.M); + sdc = pq.sdc_table.data(); + ndis = 0; + } + + void set_query(const float *x) override { + pq.compute_distance_table(x, precomputed_table.data()); + } +}; + + +} // namespace + + +DistanceComputer * IndexPQ::get_distance_computer() const { + FAISS_THROW_IF_NOT(pq.nbits == 8); + return new PQDis(*this); +} + + +/***************************************** + * IndexPQ polysemous search routines + ******************************************/ + + + + + +void IndexPQ::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + FAISS_THROW_IF_NOT (is_trained); + if (search_type == ST_PQ) { // Simple PQ search + + if (metric_type == METRIC_L2) { + float_maxheap_array_t res = { + size_t(n), size_t(k), labels, distances }; + pq.search (x, n, codes.data(), ntotal, &res, true); + } else { + float_minheap_array_t res = { + size_t(n), size_t(k), labels, distances }; + pq.search_ip (x, n, codes.data(), ntotal, &res, true); + } + indexPQ_stats.nq += n; + indexPQ_stats.ncode += n * ntotal; + + } else if (search_type == ST_polysemous || + search_type == ST_polysemous_generalize) { + + FAISS_THROW_IF_NOT (metric_type == METRIC_L2); + + search_core_polysemous (n, x, k, distances, labels); + + } else { // code-to-code distances + + uint8_t * q_codes = new uint8_t [n * pq.code_size]; + ScopeDeleter del (q_codes); + + + if (!encode_signs) { + pq.compute_codes (x, q_codes, n); + } else { + FAISS_THROW_IF_NOT (d == pq.nbits * pq.M); + memset (q_codes, 0, n * pq.code_size); + for (size_t i = 0; i < n; i++) { + const float *xi = x + i * d; + uint8_t *code = q_codes + i * pq.code_size; + for (int j = 0; j < d; j++) + if (xi[j] > 0) code [j>>3] |= 1 << (j & 7); + } + } + + if (search_type == ST_SDC) { + + float_maxheap_array_t res = { + size_t(n), size_t(k), labels, distances}; + + pq.search_sdc (q_codes, n, codes.data(), ntotal, &res, true); + + } else { + int * idistances = new int [n * k]; + ScopeDeleter del (idistances); + + int_maxheap_array_t res = { + size_t (n), size_t (k), labels, idistances}; + + if (search_type == ST_HE) { + + hammings_knn_hc (&res, q_codes, codes.data(), + ntotal, pq.code_size, true); + + } else if (search_type == ST_generalized_HE) { + + generalized_hammings_knn_hc (&res, q_codes, codes.data(), + ntotal, pq.code_size, true); + } + + // convert distances to floats + for (int i = 0; i < k * n; i++) + distances[i] = idistances[i]; + + } + + + indexPQ_stats.nq += n; + indexPQ_stats.ncode += n * ntotal; + } +} + + + + + +void IndexPQStats::reset() +{ + nq = ncode = n_hamming_pass = 0; +} + +IndexPQStats indexPQ_stats; + + +template +static size_t polysemous_inner_loop ( + const IndexPQ & index, + const float *dis_table_qi, const uint8_t *q_code, + size_t k, float *heap_dis, int64_t *heap_ids) +{ + + int M = index.pq.M; + int code_size = index.pq.code_size; + int ksub = index.pq.ksub; + size_t ntotal = index.ntotal; + int ht = index.polysemous_ht; + + const uint8_t *b_code = index.codes.data(); + + size_t n_pass_i = 0; + + HammingComputer hc (q_code, code_size); + + for (int64_t bi = 0; bi < ntotal; bi++) { + int hd = hc.hamming (b_code); + + if (hd < ht) { + n_pass_i ++; + + float dis = 0; + const float * dis_table = dis_table_qi; + for (int m = 0; m < M; m++) { + dis += dis_table [b_code[m]]; + dis_table += ksub; + } + + if (dis < heap_dis[0]) { + maxheap_pop (k, heap_dis, heap_ids); + maxheap_push (k, heap_dis, heap_ids, dis, bi); + } + } + b_code += code_size; + } + 
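+    // (comment added for clarity) n_pass_i counts the database codes whose
+    // Hamming distance to the query code fell below polysemous_ht, i.e. the
+    // codes for which the full asymmetric PQ distance was evaluated above.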
return n_pass_i; +} + + +void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + FAISS_THROW_IF_NOT (pq.nbits == 8); + + // PQ distance tables + float * dis_tables = new float [n * pq.ksub * pq.M]; + ScopeDeleter del (dis_tables); + pq.compute_distance_tables (n, x, dis_tables); + + // Hamming embedding queries + uint8_t * q_codes = new uint8_t [n * pq.code_size]; + ScopeDeleter del2 (q_codes); + + if (false) { + pq.compute_codes (x, q_codes, n); + } else { +#pragma omp parallel for + for (idx_t qi = 0; qi < n; qi++) { + pq.compute_code_from_distance_table + (dis_tables + qi * pq.M * pq.ksub, + q_codes + qi * pq.code_size); + } + } + + size_t n_pass = 0; + +#pragma omp parallel for reduction (+: n_pass) + for (idx_t qi = 0; qi < n; qi++) { + const uint8_t * q_code = q_codes + qi * pq.code_size; + + const float * dis_table_qi = dis_tables + qi * pq.M * pq.ksub; + + int64_t * heap_ids = labels + qi * k; + float *heap_dis = distances + qi * k; + maxheap_heapify (k, heap_dis, heap_ids); + + if (search_type == ST_polysemous) { + + switch (pq.code_size) { + case 4: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 8: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 16: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 32: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 20: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + default: + if (pq.code_size % 8 == 0) { + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + } else if (pq.code_size % 4 == 0) { + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + } else { + FAISS_THROW_FMT( + "code size %zd not supported for polysemous", + pq.code_size); + } + break; + } + } else { + switch (pq.code_size) { + case 8: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 16: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + case 32: + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + break; + default: + if (pq.code_size % 8 == 0) { + n_pass += polysemous_inner_loop + (*this, dis_table_qi, q_code, k, heap_dis, heap_ids); + } else { + FAISS_THROW_FMT( + "code size %zd not supported for polysemous", + pq.code_size); + } + break; + } + } + maxheap_reorder (k, heap_dis, heap_ids); + } + + indexPQ_stats.nq += n; + indexPQ_stats.ncode += n * ntotal; + indexPQ_stats.n_hamming_pass += n_pass; + + +} + + +/* The standalone codec interface (just remaps to the PQ functions) */ +size_t IndexPQ::sa_code_size () const +{ + return pq.code_size; +} + +void IndexPQ::sa_encode (idx_t n, const float *x, uint8_t *bytes) const +{ + pq.compute_codes (x, bytes, n); +} + +void IndexPQ::sa_decode (idx_t n, const uint8_t *bytes, float *x) const +{ + pq.decode (bytes, x, n); +} + + + + +/***************************************** + * Stats of IndexPQ codes + ******************************************/ + + + + +void IndexPQ::hamming_distance_table (idx_t n, const float *x, + int32_t *dis) const +{ + uint8_t * q_codes = new uint8_t [n * pq.code_size]; + ScopeDeleter del (q_codes); + + pq.compute_codes (x, q_codes, n); + + hammings 
(q_codes, codes.data(), n, ntotal, pq.code_size, dis); +} + + +void IndexPQ::hamming_distance_histogram (idx_t n, const float *x, + idx_t nb, const float *xb, + int64_t *hist) +{ + FAISS_THROW_IF_NOT (metric_type == METRIC_L2); + FAISS_THROW_IF_NOT (pq.code_size % 8 == 0); + FAISS_THROW_IF_NOT (pq.nbits == 8); + + // Hamming embedding queries + uint8_t * q_codes = new uint8_t [n * pq.code_size]; + ScopeDeleter del (q_codes); + pq.compute_codes (x, q_codes, n); + + uint8_t * b_codes ; + ScopeDeleter del_b_codes; + + if (xb) { + b_codes = new uint8_t [nb * pq.code_size]; + del_b_codes.set (b_codes); + pq.compute_codes (xb, b_codes, nb); + } else { + nb = ntotal; + b_codes = codes.data(); + } + int nbits = pq.M * pq.nbits; + memset (hist, 0, sizeof(*hist) * (nbits + 1)); + size_t bs = 256; + +#pragma omp parallel + { + std::vector histi (nbits + 1); + hamdis_t *distances = new hamdis_t [nb * bs]; + ScopeDeleter del (distances); +#pragma omp for + for (size_t q0 = 0; q0 < n; q0 += bs) { + // printf ("dis stats: %ld/%ld\n", q0, n); + size_t q1 = q0 + bs; + if (q1 > n) q1 = n; + + hammings (q_codes + q0 * pq.code_size, b_codes, + q1 - q0, nb, + pq.code_size, distances); + + for (size_t i = 0; i < nb * (q1 - q0); i++) + histi [distances [i]]++; + } +#pragma omp critical + { + for (int i = 0; i <= nbits; i++) + hist[i] += histi[i]; + } + } + +} + + + + + + + + + + + + + + + + + + + + +/***************************************** + * MultiIndexQuantizer + ******************************************/ + +namespace { + +template +struct PreSortedArray { + + const T * x; + int N; + + explicit PreSortedArray (int N): N(N) { + } + void init (const T*x) { + this->x = x; + } + // get smallest value + T get_0 () { + return x[0]; + } + + // get delta between n-smallest and n-1 -smallest + T get_diff (int n) { + return x[n] - x[n - 1]; + } + + // remap orders counted from smallest to indices in array + int get_ord (int n) { + return n; + } + +}; + +template +struct ArgSort { + const T * x; + bool operator() (size_t i, size_t j) { + return x[i] < x[j]; + } +}; + + +/** Array that maintains a permutation of its elements so that the + * array's elements are sorted + */ +template +struct SortedArray { + const T * x; + int N; + std::vector perm; + + explicit SortedArray (int N) { + this->N = N; + perm.resize (N); + } + + void init (const T*x) { + this->x = x; + for (int n = 0; n < N; n++) + perm[n] = n; + ArgSort cmp = {x }; + std::sort (perm.begin(), perm.end(), cmp); + } + + // get smallest value + T get_0 () { + return x[perm[0]]; + } + + // get delta between n-smallest and n-1 -smallest + T get_diff (int n) { + return x[perm[n]] - x[perm[n - 1]]; + } + + // remap orders counted from smallest to indices in array + int get_ord (int n) { + return perm[n]; + } +}; + + + +/** Array has n values. Sort the k first ones and copy the other ones + * into elements k..n-1 + */ +template +void partial_sort (int k, int n, + const typename C::T * vals, typename C::TI * perm) { + // insert first k elts in heap + for (int i = 1; i < k; i++) { + indirect_heap_push (i + 1, vals, perm, perm[i]); + } + + // insert next n - k elts in heap + for (int i = k; i < n; i++) { + typename C::TI id = perm[i]; + typename C::TI top = perm[0]; + + if (C::cmp(vals[top], vals[id])) { + indirect_heap_pop (k, vals, perm); + indirect_heap_push (k, vals, perm, id); + perm[i] = top; + } else { + // nothing, elt at i is good where it is. 
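+                // (clarifying note) either way, perm[0..k-1] still holds,
+                // as a heap under comparator C, the k best elements seen so
+                // far, and the displaced element stays parked at position i.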
+ } + } + + // order the k first elements in heap + for (int i = k - 1; i > 0; i--) { + typename C::TI top = perm[0]; + indirect_heap_pop (i + 1, vals, perm); + perm[i] = top; + } +} + +/** same as SortedArray, but only the k first elements are sorted */ +template +struct SemiSortedArray { + const T * x; + int N; + + // type of the heap: CMax = sort ascending + typedef CMax HC; + std::vector perm; + + int k; // k elements are sorted + + int initial_k, k_factor; + + explicit SemiSortedArray (int N) { + this->N = N; + perm.resize (N); + perm.resize (N); + initial_k = 3; + k_factor = 4; + } + + void init (const T*x) { + this->x = x; + for (int n = 0; n < N; n++) + perm[n] = n; + k = 0; + grow (initial_k); + } + + /// grow the sorted part of the array to size next_k + void grow (int next_k) { + if (next_k < N) { + partial_sort (next_k - k, N - k, x, &perm[k]); + k = next_k; + } else { // full sort of remainder of array + ArgSort cmp = {x }; + std::sort (perm.begin() + k, perm.end(), cmp); + k = N; + } + } + + // get smallest value + T get_0 () { + return x[perm[0]]; + } + + // get delta between n-smallest and n-1 -smallest + T get_diff (int n) { + if (n >= k) { + // want to keep powers of 2 - 1 + int next_k = (k + 1) * k_factor - 1; + grow (next_k); + } + return x[perm[n]] - x[perm[n - 1]]; + } + + // remap orders counted from smallest to indices in array + int get_ord (int n) { + assert (n < k); + return perm[n]; + } +}; + + + +/***************************************** + * Find the k smallest sums of M terms, where each term is taken in a + * table x of n values. + * + * A combination of terms is encoded as a scalar 0 <= t < n^M. The + * combination t0 ... t(M-1) that correspond to the sum + * + * sum = x[0, t0] + x[1, t1] + .... + x[M-1, t(M-1)] + * + * is encoded as + * + * t = t0 + t1 * n + t2 * n^2 + ... + t(M-1) * n^(M-1) + * + * MinSumK is an object rather than a function, so that storage can be + * re-used over several computations with the same sizes. use_seen is + * good when there may be ties in the x array and it is a concern if + * occasionally several t's are returned. + * + * @param x size M * n, values to add up + * @parms k nb of results to retrieve + * @param M nb of terms + * @param n nb of distinct values + * @param sums output, size k, sorted + * @prarm terms output, size k, with encoding as above + * + ******************************************/ +template +struct MinSumK { + int K; ///< nb of sums to return + int M; ///< nb of elements to sum up + int nbit; ///< nb of bits to encode one entry + int N; ///< nb of possible elements for each of the M terms + + /** the heap. + * We use a heap to maintain a queue of sums, with the associated + * terms involved in the sum. + */ + typedef CMin HC; + size_t heap_capacity, heap_size; + T *bh_val; + int64_t *bh_ids; + + std::vector ssx; + + // all results get pushed several times. When there are ties, they + // are popped interleaved with others, so it is not easy to + // identify them. Therefore, this bit array just marks elements + // that were seen before. 
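+    // (clarifying note) one bit per possible term combination: the array is
+    // sized from weight(M) = 1 << (M * nbit) in the constructor below.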
+ std::vector seen; + + MinSumK (int K, int M, int nbit, int N): + K(K), M(M), nbit(nbit), N(N) { + heap_capacity = K * M; + assert (N <= (1 << nbit)); + + // we'll do k steps, each step pushes at most M vals + bh_val = new T[heap_capacity]; + bh_ids = new int64_t[heap_capacity]; + + if (use_seen) { + int64_t n_ids = weight(M); + seen.resize ((n_ids + 7) / 8); + } + + for (int m = 0; m < M; m++) + ssx.push_back (SSA(N)); + + } + + int64_t weight (int i) { + return 1 << (i * nbit); + } + + bool is_seen (int64_t i) { + return (seen[i >> 3] >> (i & 7)) & 1; + } + + void mark_seen (int64_t i) { + if (use_seen) + seen [i >> 3] |= 1 << (i & 7); + } + + void run (const T *x, int64_t ldx, + T * sums, int64_t * terms) { + heap_size = 0; + + for (int m = 0; m < M; m++) { + ssx[m].init(x); + x += ldx; + } + + { // intial result: take min for all elements + T sum = 0; + terms[0] = 0; + mark_seen (0); + for (int m = 0; m < M; m++) { + sum += ssx[m].get_0(); + } + sums[0] = sum; + for (int m = 0; m < M; m++) { + heap_push (++heap_size, bh_val, bh_ids, + sum + ssx[m].get_diff(1), + weight(m)); + } + } + + for (int k = 1; k < K; k++) { + // pop smallest value from heap + if (use_seen) {// skip already seen elements + while (is_seen (bh_ids[0])) { + assert (heap_size > 0); + heap_pop (heap_size--, bh_val, bh_ids); + } + } + assert (heap_size > 0); + + T sum = sums[k] = bh_val[0]; + int64_t ti = terms[k] = bh_ids[0]; + + if (use_seen) { + mark_seen (ti); + heap_pop (heap_size--, bh_val, bh_ids); + } else { + do { + heap_pop (heap_size--, bh_val, bh_ids); + } while (heap_size > 0 && bh_ids[0] == ti); + } + + // enqueue followers + int64_t ii = ti; + for (int m = 0; m < M; m++) { + int64_t n = ii & ((1L << nbit) - 1); + ii >>= nbit; + if (n + 1 >= N) continue; + + enqueue_follower (ti, m, n, sum); + } + } + + /* + for (int k = 0; k < K; k++) + for (int l = k + 1; l < K; l++) + assert (terms[k] != terms[l]); + */ + + // convert indices by applying permutation + for (int k = 0; k < K; k++) { + int64_t ii = terms[k]; + if (use_seen) { + // clear seen for reuse at next loop + seen[ii >> 3] = 0; + } + int64_t ti = 0; + for (int m = 0; m < M; m++) { + int64_t n = ii & ((1L << nbit) - 1); + ti += int64_t(ssx[m].get_ord(n)) << (nbit * m); + ii >>= nbit; + } + terms[k] = ti; + } + } + + + void enqueue_follower (int64_t ti, int m, int n, T sum) { + T next_sum = sum + ssx[m].get_diff(n + 1); + int64_t next_ti = ti + weight(m); + heap_push (++heap_size, bh_val, bh_ids, next_sum, next_ti); + } + + ~MinSumK () { + delete [] bh_ids; + delete [] bh_val; + } +}; + +} // anonymous namespace + + +MultiIndexQuantizer::MultiIndexQuantizer (int d, + size_t M, + size_t nbits): + Index(d, METRIC_L2), pq(d, M, nbits) +{ + is_trained = false; + pq.verbose = verbose; +} + + + +void MultiIndexQuantizer::train(idx_t n, const float *x) +{ + pq.verbose = verbose; + pq.train (n, x); + is_trained = true; + // count virtual elements in index + ntotal = 1; + for (int m = 0; m < pq.M; m++) + ntotal *= pq.ksub; +} + + +void MultiIndexQuantizer::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const { + if (n == 0) return; + + // the allocation just below can be severe... 
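+    // (clarifying note) ... so large query batches are processed in slices
+    // of bs = 32768 queries, recursing below; the distance tables alone
+    // take n * pq.ksub * pq.M floats per call.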
+ idx_t bs = 32768; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(i0 + bs, n); + if (verbose) { + printf("MultiIndexQuantizer::search: %ld:%ld / %ld\n", + i0, i1, n); + } + search (i1 - i0, x + i0 * d, k, + distances + i0 * k, + labels + i0 * k); + } + return; + } + + float * dis_tables = new float [n * pq.ksub * pq.M]; + ScopeDeleter del (dis_tables); + + pq.compute_distance_tables (n, x, dis_tables); + + if (k == 1) { + // simple version that just finds the min in each table + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + const float * dis_table = dis_tables + i * pq.ksub * pq.M; + float dis = 0; + idx_t label = 0; + + for (int s = 0; s < pq.M; s++) { + float vmin = HUGE_VALF; + idx_t lmin = -1; + + for (idx_t j = 0; j < pq.ksub; j++) { + if (dis_table[j] < vmin) { + vmin = dis_table[j]; + lmin = j; + } + } + dis += vmin; + label |= lmin << (s * pq.nbits); + dis_table += pq.ksub; + } + + distances [i] = dis; + labels [i] = label; + } + + + } else { + +#pragma omp parallel if(n > 1) + { + MinSumK , false> + msk(k, pq.M, pq.nbits, pq.ksub); +#pragma omp for + for (int i = 0; i < n; i++) { + msk.run (dis_tables + i * pq.ksub * pq.M, pq.ksub, + distances + i * k, labels + i * k); + + } + } + } + +} + + +void MultiIndexQuantizer::reconstruct (idx_t key, float * recons) const +{ + + int64_t jj = key; + for (int m = 0; m < pq.M; m++) { + int64_t n = jj & ((1L << pq.nbits) - 1); + jj >>= pq.nbits; + memcpy(recons, pq.get_centroids(m, n), sizeof(recons[0]) * pq.dsub); + recons += pq.dsub; + } +} + +void MultiIndexQuantizer::add(idx_t /*n*/, const float* /*x*/) { + FAISS_THROW_MSG( + "This index has virtual elements, " + "it does not support add"); +} + +void MultiIndexQuantizer::reset () +{ + FAISS_THROW_MSG ( "This index has virtual elements, " + "it does not support reset"); +} + + + + + + + + + + +/***************************************** + * MultiIndexQuantizer2 + ******************************************/ + + + +MultiIndexQuantizer2::MultiIndexQuantizer2 ( + int d, size_t M, size_t nbits, + Index **indexes): + MultiIndexQuantizer (d, M, nbits) +{ + assign_indexes.resize (M); + for (int i = 0; i < M; i++) { + FAISS_THROW_IF_NOT_MSG( + indexes[i]->d == pq.dsub, + "Provided sub-index has incorrect size"); + assign_indexes[i] = indexes[i]; + } + own_fields = false; +} + +MultiIndexQuantizer2::MultiIndexQuantizer2 ( + int d, size_t nbits, + Index *assign_index_0, + Index *assign_index_1): + MultiIndexQuantizer (d, 2, nbits) +{ + FAISS_THROW_IF_NOT_MSG( + assign_index_0->d == pq.dsub && + assign_index_1->d == pq.dsub, + "Provided sub-index has incorrect size"); + assign_indexes.resize (2); + assign_indexes [0] = assign_index_0; + assign_indexes [1] = assign_index_1; + own_fields = false; +} + +void MultiIndexQuantizer2::train(idx_t n, const float* x) +{ + MultiIndexQuantizer::train(n, x); + // add centroids to sub-indexes + for (int i = 0; i < pq.M; i++) { + assign_indexes[i]->add(pq.ksub, pq.get_centroids(i, 0)); + } +} + + +void MultiIndexQuantizer2::search( + idx_t n, const float* x, idx_t K, + float* distances, idx_t* labels) const +{ + + if (n == 0) return; + + int k2 = std::min(K, int64_t(pq.ksub)); + + int64_t M = pq.M; + int64_t dsub = pq.dsub, ksub = pq.ksub; + + // size (M, n, k2) + std::vector sub_ids(n * M * k2); + std::vector sub_dis(n * M * k2); + std::vector xsub(n * dsub); + + for (int m = 0; m < M; m++) { + float *xdest = xsub.data(); + const float *xsrc = x + m * dsub; + for (int j = 0; j < n; j++) { + memcpy(xdest, xsrc, dsub * 
sizeof(xdest[0])); + xsrc += d; + xdest += dsub; + } + + assign_indexes[m]->search( + n, xsub.data(), k2, + &sub_dis[k2 * n * m], + &sub_ids[k2 * n * m]); + } + + if (K == 1) { + // simple version that just finds the min in each table + assert (k2 == 1); + + for (int i = 0; i < n; i++) { + float dis = 0; + idx_t label = 0; + + for (int m = 0; m < M; m++) { + float vmin = sub_dis[i + m * n]; + idx_t lmin = sub_ids[i + m * n]; + dis += vmin; + label |= lmin << (m * pq.nbits); + } + distances [i] = dis; + labels [i] = label; + } + + } else { + +#pragma omp parallel if(n > 1) + { + MinSumK , false> + msk(K, pq.M, pq.nbits, k2); +#pragma omp for + for (int i = 0; i < n; i++) { + idx_t *li = labels + i * K; + msk.run (&sub_dis[i * k2], k2 * n, + distances + i * K, li); + + // remap ids + + const idx_t *idmap0 = sub_ids.data() + i * k2; + int64_t ld_idmap = k2 * n; + int64_t mask1 = ksub - 1L; + + for (int k = 0; k < K; k++) { + const idx_t *idmap = idmap0; + int64_t vin = li[k]; + int64_t vout = 0; + int bs = 0; + for (int m = 0; m < M; m++) { + int64_t s = vin & mask1; + vin >>= pq.nbits; + vout |= idmap[s] << bs; + bs += pq.nbits; + idmap += ld_idmap; + } + li[k] = vout; + } + } + } + } +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexPQ.h b/core/src/index/thirdparty/faiss/IndexPQ.h new file mode 100644 index 0000000000..840b31a03c --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexPQ.h @@ -0,0 +1,199 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_PQ_H +#define FAISS_INDEX_PQ_H + +#include + +#include + +#include +#include +#include + +namespace faiss { + + +/** Index based on a product quantizer. Stored vectors are + * approximated by PQ codes. */ +struct IndexPQ: Index { + + /// The product quantizer used to encode the vectors + ProductQuantizer pq; + + /// Codes. Size ntotal * pq.code_size + std::vector codes; + + /** Constructor. 
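+ * (Added note) The index starts untrained, with search_type = ST_PQ and
+ * polysemous training disabled; see do_polysemous_training below.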
+ * + * @param d dimensionality of the input vectors + * @param M number of subquantizers + * @param nbits number of bit per subvector index + */ + IndexPQ (int d, ///< dimensionality of the input vectors + size_t M, ///< number of subquantizers + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2); + + IndexPQ (); + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void reset() override; + + void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; + + void reconstruct(idx_t key, float* recons) const override; + + size_t remove_ids(const IDSelector& sel) override; + + /* The standalone codec interface */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + + DistanceComputer * get_distance_computer() const override; + + /****************************************************** + * Polysemous codes implementation + ******************************************************/ + bool do_polysemous_training; ///< false = standard PQ + + /// parameters used for the polysemous training + PolysemousTraining polysemous_training; + + /// how to perform the search in search_core + enum Search_type_t { + ST_PQ, ///< asymmetric product quantizer (default) + ST_HE, ///< Hamming distance on codes + ST_generalized_HE, ///< nb of same codes + ST_SDC, ///< symmetric product quantizer (SDC) + ST_polysemous, ///< HE filter (using ht) + PQ combination + ST_polysemous_generalize, ///< Filter on generalized Hamming + }; + + Search_type_t search_type; + + // just encode the sign of the components, instead of using the PQ encoder + // used only for the queries + bool encode_signs; + + /// Hamming threshold used for polysemy + int polysemous_ht; + + // actual polysemous search + void search_core_polysemous (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const; + + /// prepare query for a polysemous search, but instead of + /// computing the result, just get the histogram of Hamming + /// distances. May be computed on a provided dataset if xb != NULL + /// @param dist_histogram (M * nbits + 1) + void hamming_distance_histogram (idx_t n, const float *x, + idx_t nb, const float *xb, + int64_t *dist_histogram); + + /** compute pairwise distances between queries and database + * + * @param n nb of query vectors + * @param x query vector, size n * d + * @param dis output distances, size n * ntotal + */ + void hamming_distance_table (idx_t n, const float *x, + int32_t *dis) const; + +}; + + +/// statistics are robust to internal threading, but not if +/// IndexPQ::search is called by multiple threads +struct IndexPQStats { + size_t nq; // nb of queries run + size_t ncode; // nb of codes visited + + size_t n_hamming_pass; // nb of passed Hamming distance tests (for polysemy) + + IndexPQStats () {reset (); } + void reset (); +}; + +extern IndexPQStats indexPQ_stats; + + + +/** Quantizer where centroids are virtual: they are the Cartesian + * product of sub-centroids. 
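+ *
+ * (Added note) A label returned by search() packs the M sub-centroid ids
+ * into a single integer, pq.nbits bits per sub-quantizer, so the index
+ * exposes ntotal = ksub^M virtual centroids without materializing them.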
*/ +struct MultiIndexQuantizer: Index { + ProductQuantizer pq; + + MultiIndexQuantizer (int d, ///< dimension of the input vectors + size_t M, ///< number of subquantizers + size_t nbits); ///< number of bit per subvector index + + void train(idx_t n, const float* x) override; + + void search( + idx_t n, const float* x, idx_t k, + float* distances, idx_t* labels) const override; + + /// add and reset will crash at runtime + void add(idx_t n, const float* x) override; + void reset() override; + + MultiIndexQuantizer () {} + + void reconstruct(idx_t key, float* recons) const override; +}; + + +/** MultiIndexQuantizer where the PQ assignmnet is performed by sub-indexes + */ +struct MultiIndexQuantizer2: MultiIndexQuantizer { + + /// M Indexes on d / M dimensions + std::vector assign_indexes; + bool own_fields; + + MultiIndexQuantizer2 ( + int d, size_t M, size_t nbits, + Index **indexes); + + MultiIndexQuantizer2 ( + int d, size_t nbits, + Index *assign_index_0, + Index *assign_index_1); + + void train(idx_t n, const float* x) override; + + void search( + idx_t n, const float* x, idx_t k, + float* distances, idx_t* labels) const override; + +}; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexPreTransform.cpp b/core/src/index/thirdparty/faiss/IndexPreTransform.cpp new file mode 100644 index 0000000000..c27ce266c0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexPreTransform.cpp @@ -0,0 +1,288 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include +#include + +namespace faiss { + +/********************************************* + * IndexPreTransform + *********************************************/ + +IndexPreTransform::IndexPreTransform (): + index(nullptr), own_fields (false) +{ +} + + +IndexPreTransform::IndexPreTransform ( + Index * index): + Index (index->d, index->metric_type), + index (index), own_fields (false) +{ + is_trained = index->is_trained; + ntotal = index->ntotal; +} + + +IndexPreTransform::IndexPreTransform ( + VectorTransform * ltrans, + Index * index): + Index (index->d, index->metric_type), + index (index), own_fields (false) +{ + is_trained = index->is_trained; + ntotal = index->ntotal; + prepend_transform (ltrans); +} + +void IndexPreTransform::prepend_transform (VectorTransform *ltrans) +{ + FAISS_THROW_IF_NOT (ltrans->d_out == d); + is_trained = is_trained && ltrans->is_trained; + chain.insert (chain.begin(), ltrans); + d = ltrans->d_in; +} + + +IndexPreTransform::~IndexPreTransform () +{ + if (own_fields) { + for (int i = 0; i < chain.size(); i++) + delete chain[i]; + delete index; + } +} + + + + +void IndexPreTransform::train (idx_t n, const float *x) +{ + int last_untrained = 0; + if (!index->is_trained) { + last_untrained = chain.size(); + } else { + for (int i = chain.size() - 1; i >= 0; i--) { + if (!chain[i]->is_trained) { + last_untrained = i; + break; + } + } + } + const float *prev_x = x; + ScopeDeleter del; + + if (verbose) { + printf("IndexPreTransform::train: training chain 0 to %d\n", + last_untrained); + } + + for (int i = 0; i <= last_untrained; i++) { + + if (i < chain.size()) { + VectorTransform *ltrans = chain [i]; + if (!ltrans->is_trained) { + if (verbose) { + printf(" Training chain component %d/%zd\n", + i, chain.size()); + if (OPQMatrix *opqm = dynamic_cast(ltrans)) { + 
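+                    // (comment added for clarity) forward the verbosity
+                    // flag so OPQ training progress is printed as well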
opqm->verbose = true; + } + } + ltrans->train (n, prev_x); + } + } else { + if (verbose) { + printf(" Training sub-index\n"); + } + index->train (n, prev_x); + } + if (i == last_untrained) break; + if (verbose) { + printf(" Applying transform %d/%zd\n", + i, chain.size()); + } + + float * xt = chain[i]->apply (n, prev_x); + + if (prev_x != x) delete [] prev_x; + prev_x = xt; + del.set(xt); + } + + is_trained = true; +} + + +const float *IndexPreTransform::apply_chain (idx_t n, const float *x) const +{ + const float *prev_x = x; + ScopeDeleter del; + + for (int i = 0; i < chain.size(); i++) { + float * xt = chain[i]->apply (n, prev_x); + ScopeDeleter del2 (xt); + del2.swap (del); + prev_x = xt; + } + del.release (); + return prev_x; +} + +void IndexPreTransform::reverse_chain (idx_t n, const float* xt, float* x) const +{ + const float* next_x = xt; + ScopeDeleter del; + + for (int i = chain.size() - 1; i >= 0; i--) { + float* prev_x = (i == 0) ? x : new float [n * chain[i]->d_in]; + ScopeDeleter del2 ((prev_x == x) ? nullptr : prev_x); + chain [i]->reverse_transform (n, next_x, prev_x); + del2.swap (del); + next_x = prev_x; + } +} + +void IndexPreTransform::add (idx_t n, const float *x) +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_chain (n, x); + ScopeDeleter del(xt == x ? nullptr : xt); + index->add (n, xt); + ntotal = index->ntotal; +} + +void IndexPreTransform::add_with_ids (idx_t n, const float * x, + const idx_t *xids) +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_chain (n, x); + ScopeDeleter del(xt == x ? nullptr : xt); + index->add_with_ids (n, xt, xids); + ntotal = index->ntotal; +} + + + + +void IndexPreTransform::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_chain (n, x); + ScopeDeleter del(xt == x ? nullptr : xt); + index->search (n, xt, k, distances, labels); +} + +void IndexPreTransform::range_search (idx_t n, const float* x, float radius, + RangeSearchResult* result) const +{ + FAISS_THROW_IF_NOT (is_trained); + const float *xt = apply_chain (n, x); + ScopeDeleter del(xt == x ? nullptr : xt); + index->range_search (n, xt, radius, result); +} + + + +void IndexPreTransform::reset () { + index->reset(); + ntotal = 0; +} + +size_t IndexPreTransform::remove_ids (const IDSelector & sel) { + size_t nremove = index->remove_ids (sel); + ntotal = index->ntotal; + return nremove; +} + + +void IndexPreTransform::reconstruct (idx_t key, float * recons) const +{ + float *x = chain.empty() ? recons : new float [index->d]; + ScopeDeleter del (recons == x ? nullptr : x); + // Initial reconstruction + index->reconstruct (key, x); + + // Revert transformations from last to first + reverse_chain (1, x, recons); +} + + +void IndexPreTransform::reconstruct_n (idx_t i0, idx_t ni, float *recons) const +{ + float *x = chain.empty() ? recons : new float [ni * index->d]; + ScopeDeleter del (recons == x ? nullptr : x); + // Initial reconstruction + index->reconstruct_n (i0, ni, x); + + // Revert transformations from last to first + reverse_chain (ni, x, recons); +} + + +void IndexPreTransform::search_and_reconstruct ( + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, float* recons) const +{ + FAISS_THROW_IF_NOT (is_trained); + + const float* xt = apply_chain (n, x); + ScopeDeleter del ((xt == x) ? nullptr : xt); + + float* recons_temp = chain.empty() ? recons : new float [n * k * index->d]; + ScopeDeleter del2 ((recons_temp == recons) ? 
nullptr : recons_temp); + index->search_and_reconstruct (n, xt, k, distances, labels, recons_temp); + + // Revert transformations from last to first + reverse_chain (n * k, recons_temp, recons); +} + +size_t IndexPreTransform::sa_code_size () const +{ + return index->sa_code_size (); +} + +void IndexPreTransform::sa_encode (idx_t n, const float *x, + uint8_t *bytes) const +{ + if (chain.empty()) { + index->sa_encode (n, x, bytes); + } else { + const float *xt = apply_chain (n, x); + ScopeDeleter del(xt == x ? nullptr : xt); + index->sa_encode (n, xt, bytes); + } +} + +void IndexPreTransform::sa_decode (idx_t n, const uint8_t *bytes, + float *x) const +{ + if (chain.empty()) { + index->sa_decode (n, bytes, x); + } else { + std::unique_ptr x1 (new float [index->d * n]); + index->sa_decode (n, bytes, x1.get()); + // Revert transformations from last to first + reverse_chain (n, x1.get(), x); + } +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexPreTransform.h b/core/src/index/thirdparty/faiss/IndexPreTransform.h new file mode 100644 index 0000000000..a3becc9188 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexPreTransform.h @@ -0,0 +1,91 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + + + +#include +#include + +namespace faiss { + +/** Index that applies a LinearTransform transform on vectors before + * handing them over to a sub-index */ +struct IndexPreTransform: Index { + + std::vector chain; ///! chain of tranforms + Index * index; ///! the sub-index + + bool own_fields; ///! whether pointers are deleted in destructor + + explicit IndexPreTransform (Index *index); + + IndexPreTransform (); + + /// ltrans is the last transform before the index + IndexPreTransform (VectorTransform * ltrans, Index * index); + + void prepend_transform (VectorTransform * ltrans); + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + void reset() override; + + /** removes IDs from the index. Not supported by all indexes. + */ + size_t remove_ids(const IDSelector& sel) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + + /* range search, no attempt is done to change the radius */ + void range_search (idx_t n, const float* x, float radius, + RangeSearchResult* result) const override; + + + void reconstruct (idx_t key, float * recons) const override; + + void reconstruct_n (idx_t i0, idx_t ni, float *recons) + const override; + + void search_and_reconstruct (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + float *recons) const override; + + /// apply the transforms in the chain. The returned float * may be + /// equal to x, otherwise it should be deallocated. + const float * apply_chain (idx_t n, const float *x) const; + + /// Reverse the transforms in the chain. May not be implemented for + /// all transforms in the chain or may return approximate results. 
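+    /// (Added note) Used by reconstruct(), reconstruct_n(),
+    /// search_and_reconstruct() and sa_decode() to map results back to the
+    /// original input space.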
+ void reverse_chain (idx_t n, const float* xt, float* x) const; + + + /* standalone codec interface */ + size_t sa_code_size () const override; + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + ~IndexPreTransform() override; +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexReplicas.cpp b/core/src/index/thirdparty/faiss/IndexReplicas.cpp new file mode 100644 index 0000000000..5aa392271e --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexReplicas.cpp @@ -0,0 +1,123 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { + +template +IndexReplicasTemplate::IndexReplicasTemplate(bool threaded) + : ThreadedIndex(threaded) { +} + +template +IndexReplicasTemplate::IndexReplicasTemplate(idx_t d, bool threaded) + : ThreadedIndex(d, threaded) { +} + +template +IndexReplicasTemplate::IndexReplicasTemplate(int d, bool threaded) + : ThreadedIndex(d, threaded) { +} + +template +void +IndexReplicasTemplate::onAfterAddIndex(IndexT* index) { + // Make sure that the parameters are the same for all prior indices, unless + // we're the first index to be added + if (this->count() > 0 && this->at(0) != index) { + auto existing = this->at(0); + + FAISS_THROW_IF_NOT_FMT(index->ntotal == existing->ntotal, + "IndexReplicas: newly added index does " + "not have same number of vectors as prior index; " + "prior index has %ld vectors, new index has %ld", + existing->ntotal, index->ntotal); + + FAISS_THROW_IF_NOT_MSG(index->is_trained == existing->is_trained, + "IndexReplicas: newly added index does " + "not have same train status as prior index"); + } else { + // Set our parameters based on the first index we're adding + // (dimension is handled in ThreadedIndex) + this->ntotal = index->ntotal; + this->verbose = index->verbose; + this->is_trained = index->is_trained; + this->metric_type = index->metric_type; + } +} + +template +void +IndexReplicasTemplate::train(idx_t n, const component_t* x) { + this->runOnIndex([n, x](int, IndexT* index){ index->train(n, x); }); +} + +template +void +IndexReplicasTemplate::add(idx_t n, const component_t* x) { + this->runOnIndex([n, x](int, IndexT* index){ index->add(n, x); }); + this->ntotal += n; +} + +template +void +IndexReplicasTemplate::reconstruct(idx_t n, component_t* x) const { + FAISS_THROW_IF_NOT_MSG(this->count() > 0, "no replicas in index"); + + // Just pass to the first replica + this->at(0)->reconstruct(n, x); +} + +template +void +IndexReplicasTemplate::search(idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels) const { + FAISS_THROW_IF_NOT_MSG(this->count() > 0, "no replicas in index"); + + if (n == 0) { + return; + } + + auto dim = this->d; + size_t componentsPerVec = + sizeof(component_t) == 1 ? 
(dim + 7) / 8 : dim; + + // Partition the query by the number of indices we have + faiss::Index::idx_t queriesPerIndex = + (faiss::Index::idx_t) (n + this->count() - 1) / + (faiss::Index::idx_t) this->count(); + FAISS_ASSERT(n / queriesPerIndex <= this->count()); + + auto fn = + [queriesPerIndex, componentsPerVec, + n, x, k, distances, labels](int i, const IndexT* index) { + faiss::Index::idx_t base = (faiss::Index::idx_t) i * queriesPerIndex; + + if (base < n) { + auto numForIndex = std::min(queriesPerIndex, n - base); + + index->search(numForIndex, + x + base * componentsPerVec, + k, + distances + base * k, + labels + base * k); + } + }; + + this->runOnIndex(fn); +} + +// explicit instantiations +template struct IndexReplicasTemplate; +template struct IndexReplicasTemplate; + +} // namespace diff --git a/core/src/index/thirdparty/faiss/IndexReplicas.h b/core/src/index/thirdparty/faiss/IndexReplicas.h new file mode 100644 index 0000000000..f61ff19b2d --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexReplicas.h @@ -0,0 +1,76 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace faiss { + +/// Takes individual faiss::Index instances, and splits queries for +/// sending to each Index instance, and joins the results together +/// when done. +/// Each index is managed by a separate CPU thread. +template +class IndexReplicasTemplate : public ThreadedIndex { + public: + using idx_t = typename IndexT::idx_t; + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + /// The dimension that all sub-indices must share will be the dimension of the + /// first sub-index added + /// @param threaded do we use one thread per sub-index or do queries + /// sequentially? + explicit IndexReplicasTemplate(bool threaded = true); + + /// @param d the dimension that all sub-indices must share + /// @param threaded do we use one thread per sub index or do queries + /// sequentially? 
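+  /// Illustrative usage sketch (not part of the original FAISS sources;
+  /// index_a / index_b are placeholder sub-indexes holding the same data):
+  ///
+  ///   faiss::IndexReplicas replicas(d);
+  ///   replicas.add_replica(&index_a);
+  ///   replicas.add_replica(&index_b);
+  ///   replicas.search(n, queries, k, distances, labels);
+  ///
+  /// Each query slice of size ceil(n / #replicas) is routed to one replica.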
+ explicit IndexReplicasTemplate(idx_t d, bool threaded = true); + + /// int version due to the implicit bool conversion ambiguity of int as + /// dimension + explicit IndexReplicasTemplate(int d, bool threaded = true); + + /// Alias for addIndex() + void add_replica(IndexT* index) { this->addIndex(index); } + + /// Alias for removeIndex() + void remove_replica(IndexT* index) { this->removeIndex(index); } + + /// faiss::Index API + /// All indices receive the same call + void train(idx_t n, const component_t* x) override; + + /// faiss::Index API + /// All indices receive the same call + void add(idx_t n, const component_t* x) override; + + /// faiss::Index API + /// Query is partitioned into a slice for each sub-index + /// split by ceil(n / #indices) for our sub-indices + void search(idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels) const override; + + /// reconstructs from the first index + void reconstruct(idx_t, component_t *v) const override; + + protected: + /// Called just after an index is added + void onAfterAddIndex(IndexT* index) override; +}; + +using IndexReplicas = IndexReplicasTemplate; +using IndexBinaryReplicas = IndexReplicasTemplate; + +} // namespace diff --git a/core/src/index/thirdparty/faiss/IndexSQHybrid.cpp b/core/src/index/thirdparty/faiss/IndexSQHybrid.cpp new file mode 100644 index 0000000000..1f6367d7cf --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexSQHybrid.cpp @@ -0,0 +1,183 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +/******************************************************************* + * IndexIVFSQHybrid implementation + ********************************************************************/ + +IndexIVFSQHybrid::IndexIVFSQHybrid ( + Index *quantizer, size_t d, size_t nlist, + ScalarQuantizer::QuantizerType qtype, + MetricType metric, bool encode_residual) + : IndexIVF(quantizer, d, nlist, 0, metric), + sq(d, qtype), + by_residual(encode_residual) +{ + code_size = sq.code_size; + // was not known at construction time + invlists->code_size = code_size; + is_trained = false; +} + +IndexIVFSQHybrid::IndexIVFSQHybrid (): + IndexIVF(), + by_residual(true) +{ +} + +void IndexIVFSQHybrid::train_residual (idx_t n, const float *x) +{ + sq.train_residual(n, x, quantizer, by_residual, verbose); +} + +void IndexIVFSQHybrid::encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos) const +{ + std::unique_ptr squant (sq.select_quantizer ()); + size_t coarse_size = include_listnos ? 
coarse_code_size () : 0; + memset(codes, 0, (code_size + coarse_size) * n); + +#pragma omp parallel if(n > 1) + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + int64_t list_no = list_nos [i]; + if (list_no >= 0) { + const float *xi = x + i * d; + uint8_t *code = codes + i * (code_size + coarse_size); + if (by_residual) { + quantizer->compute_residual ( + xi, residual.data(), list_no); + xi = residual.data (); + } + if (coarse_size) { + encode_listno (list_no, code); + } + squant->encode_vector (xi, code + coarse_size); + } + } + } +} + +void IndexIVFSQHybrid::sa_decode (idx_t n, const uint8_t *codes, + float *x) const +{ + std::unique_ptr squant (sq.select_quantizer ()); + size_t coarse_size = coarse_code_size (); + +#pragma omp parallel if(n > 1) + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + const uint8_t *code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno (code); + float *xi = x + i * d; + squant->decode_vector (code + coarse_size, xi); + if (by_residual) { + quantizer->reconstruct (list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } +} + + + +void IndexIVFSQHybrid::add_with_ids + (idx_t n, const float * x, const idx_t *xids) +{ + FAISS_THROW_IF_NOT (is_trained); + std::unique_ptr idx (new int64_t [n]); + quantizer->assign (n, x, idx.get()); + size_t nadd = 0; + std::unique_ptr squant(sq.select_quantizer ()); + +#pragma omp parallel reduction(+: nadd) + { + std::vector residual (d); + std::vector one_code (code_size); + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + + // each thread takes care of a subset of lists + for (size_t i = 0; i < n; i++) { + int64_t list_no = idx [i]; + if (list_no >= 0 && list_no % nt == rank) { + int64_t id = xids ? xids[i] : ntotal + i; + + const float * xi = x + i * d; + if (by_residual) { + quantizer->compute_residual (xi, residual.data(), list_no); + xi = residual.data(); + } + + memset (one_code.data(), 0, code_size); + squant->encode_vector (xi, one_code.data()); + + invlists->add_entry (list_no, id, one_code.data()); + + nadd++; + + } + } + } + ntotal += n; +} + + + + + +InvertedListScanner* IndexIVFSQHybrid::get_InvertedListScanner + (bool store_pairs) const +{ + return sq.select_InvertedListScanner (metric_type, quantizer, store_pairs, + by_residual); +} + + +void IndexIVFSQHybrid::reconstruct_from_offset (int64_t list_no, + int64_t offset, + float* recons) const +{ + std::vector centroid(d); + quantizer->reconstruct (list_no, centroid.data()); + + const uint8_t* code = invlists->get_single_code (list_no, offset); + sq.decode (code, recons, 1); + for (int i = 0; i < d; ++i) { + recons[i] += centroid[i]; + } +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexSQHybrid.h b/core/src/index/thirdparty/faiss/IndexSQHybrid.h new file mode 100644 index 0000000000..1fe31d21d1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexSQHybrid.h @@ -0,0 +1,65 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_SQ_HYBRID_H +#define FAISS_INDEX_SQ_HYBRID_H + +#include +#include + +#include +#include + + +namespace faiss { + + /** An IVF implementation where the components of the residuals are + * encoded with a scalar uniform quantizer. 
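A hedged sketch of driving the IndexIVFSQHybrid implementation above (not from the patch; the coarse quantizer choice, sizes, nprobe value, and random data are assumptions): train the coarse k-means and the scalar quantizer, add encoded residuals, then search a few inverted lists per query.

```cpp
#include <random>
#include <vector>

#include <faiss/IndexFlat.h>
#include <faiss/IndexSQHybrid.h>

int main() {
    const size_t d = 32, nlist = 100, nb = 10000;
    std::vector<float> xb(nb * d);
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> u(0.f, 1.f);
    for (auto& v : xb) v = u(rng);

    faiss::IndexFlatL2 coarse(d);          // coarse quantizer over nlist cells
    faiss::IndexIVFSQHybrid index(&coarse, d, nlist,
                                  faiss::ScalarQuantizer::QT_8bit);
    index.train(nb, xb.data());            // k-means for the lists + SQ trained on residuals
    index.add(nb, xb.data());              // encodes each vector into its inverted list

    const size_t nq = 4, k = 10;
    std::vector<float> dist(nq * k);
    std::vector<faiss::Index::idx_t> ids(nq * k);
    index.nprobe = 8;                      // visit 8 inverted lists per query
    index.search(nq, xb.data(), k, dist.data(), ids.data());
    return 0;
}
```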
All distance computations + * are asymmetric, so the encoded vectors are decoded and approximate + * distances are computed. + */ + +struct IndexIVFSQHybrid: IndexIVF { + ScalarQuantizer sq; + bool by_residual; + + IndexIVFSQHybrid(Index *quantizer, size_t d, size_t nlist, + ScalarQuantizer::QuantizerType qtype, + MetricType metric = METRIC_L2, + bool encode_residual = true); + + IndexIVFSQHybrid(); + + void train_residual(idx_t n, const float* x) override; + + void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos=false) const override; + + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + InvertedListScanner *get_InvertedListScanner (bool store_pairs) + const override; + + + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + /* standalone codec interface */ + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + +}; + + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp new file mode 100644 index 0000000000..658b744bb9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp @@ -0,0 +1,317 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace faiss { + + + +/******************************************************************* + * IndexScalarQuantizer implementation + ********************************************************************/ + +IndexScalarQuantizer::IndexScalarQuantizer + (int d, ScalarQuantizer::QuantizerType qtype, + MetricType metric): + Index(d, metric), + sq (d, qtype) +{ + is_trained = + qtype == ScalarQuantizer::QT_fp16 || + qtype == ScalarQuantizer::QT_8bit_direct; + code_size = sq.code_size; +} + + +IndexScalarQuantizer::IndexScalarQuantizer (): + IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit) +{} + +void IndexScalarQuantizer::train(idx_t n, const float* x) +{ + sq.train(n, x); + is_trained = true; +} + +void IndexScalarQuantizer::add(idx_t n, const float* x) +{ + FAISS_THROW_IF_NOT (is_trained); + codes.resize ((n + ntotal) * code_size); + sq.compute_codes (x, &codes[ntotal * code_size], n); + ntotal += n; +} + + +void IndexScalarQuantizer::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const +{ + FAISS_THROW_IF_NOT (is_trained); + FAISS_THROW_IF_NOT (metric_type == METRIC_L2 || + metric_type == METRIC_INNER_PRODUCT); + +#pragma omp parallel + { + InvertedListScanner* scanner = sq.select_InvertedListScanner + (metric_type, nullptr, true); + ScopeDeleter1 del(scanner); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + float * D = distances + k * i; + idx_t * I = labels + k * i; + // re-order heap + if (metric_type == METRIC_L2) { + maxheap_heapify (k, D, I); + } else { + minheap_heapify (k, D, I); + } + scanner->set_query (x + i * d); + scanner->scan_codes (ntotal, codes.data(), + nullptr, D, I, k); + + // re-order heap + if (metric_type == METRIC_L2) { + maxheap_reorder (k, D, I); + } else { + minheap_reorder (k, D, I); + } + } + } + +} + + +DistanceComputer *IndexScalarQuantizer::get_distance_computer () const +{ + ScalarQuantizer::SQDistanceComputer *dc = + sq.get_distance_computer (metric_type); + 
dc->code_size = sq.code_size; + dc->codes = codes.data(); + return dc; +} + + +void IndexScalarQuantizer::reset() +{ + codes.clear(); + ntotal = 0; +} + +void IndexScalarQuantizer::reconstruct_n( + idx_t i0, idx_t ni, float* recons) const +{ + std::unique_ptr squant(sq.select_quantizer ()); + for (size_t i = 0; i < ni; i++) { + squant->decode_vector(&codes[(i + i0) * code_size], recons + i * d); + } +} + +void IndexScalarQuantizer::reconstruct(idx_t key, float* recons) const +{ + reconstruct_n(key, 1, recons); +} + +/* Codec interface */ +size_t IndexScalarQuantizer::sa_code_size () const +{ + return sq.code_size; +} + +void IndexScalarQuantizer::sa_encode (idx_t n, const float *x, + uint8_t *bytes) const +{ + FAISS_THROW_IF_NOT (is_trained); + sq.compute_codes (x, bytes, n); +} + +void IndexScalarQuantizer::sa_decode (idx_t n, const uint8_t *bytes, + float *x) const +{ + FAISS_THROW_IF_NOT (is_trained); + sq.decode(bytes, x, n); +} + + + +/******************************************************************* + * IndexIVFScalarQuantizer implementation + ********************************************************************/ + +IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ( + Index *quantizer, size_t d, size_t nlist, + ScalarQuantizer::QuantizerType qtype, + MetricType metric, bool encode_residual) + : IndexIVF(quantizer, d, nlist, 0, metric), + sq(d, qtype), + by_residual(encode_residual) +{ + code_size = sq.code_size; + // was not known at construction time + invlists->code_size = code_size; + is_trained = false; +} + +IndexIVFScalarQuantizer::IndexIVFScalarQuantizer (): + IndexIVF(), + by_residual(true) +{ +} + +void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x) +{ + sq.train_residual(n, x, quantizer, by_residual, verbose); +} + +void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos) const +{ + std::unique_ptr squant (sq.select_quantizer ()); + size_t coarse_size = include_listnos ? 
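A minimal sketch of the flat IndexScalarQuantizer in use (illustrative only; sizes and data are made up): train the per-dimension quantizer, add compressed codes, run a brute-force search over the codes, and reconstruct an approximation of a stored vector.

```cpp
#include <random>
#include <vector>

#include <faiss/IndexScalarQuantizer.h>

int main() {
    const int d = 16, nb = 2000, nq = 3, k = 5;
    std::vector<float> xb(nb * d);
    std::mt19937 rng(7);
    std::uniform_real_distribution<float> u(0.f, 1.f);
    for (auto& v : xb) v = u(rng);

    faiss::IndexScalarQuantizer index(d, faiss::ScalarQuantizer::QT_8bit);
    index.train(nb, xb.data());            // learns the [vmin, vmax] range per dimension
    index.add(nb, xb.data());              // stores one byte per component

    std::vector<float> dist(nq * k);
    std::vector<faiss::Index::idx_t> ids(nq * k);
    index.search(nq, xb.data(), k, dist.data(), ids.data());

    std::vector<float> recons(d);
    index.reconstruct(0, recons.data());   // approximate decode of stored vector 0
    return 0;
}
```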
coarse_code_size () : 0; + memset(codes, 0, (code_size + coarse_size) * n); + +#pragma omp parallel if(n > 1) + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + int64_t list_no = list_nos [i]; + if (list_no >= 0) { + const float *xi = x + i * d; + uint8_t *code = codes + i * (code_size + coarse_size); + if (by_residual) { + quantizer->compute_residual ( + xi, residual.data(), list_no); + xi = residual.data (); + } + if (coarse_size) { + encode_listno (list_no, code); + } + squant->encode_vector (xi, code + coarse_size); + } + } + } +} + +void IndexIVFScalarQuantizer::sa_decode (idx_t n, const uint8_t *codes, + float *x) const +{ + std::unique_ptr squant (sq.select_quantizer ()); + size_t coarse_size = coarse_code_size (); + +#pragma omp parallel if(n > 1) + { + std::vector residual (d); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + const uint8_t *code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno (code); + float *xi = x + i * d; + squant->decode_vector (code + coarse_size, xi); + if (by_residual) { + quantizer->reconstruct (list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } +} + + + +void IndexIVFScalarQuantizer::add_with_ids + (idx_t n, const float * x, const idx_t *xids) +{ + FAISS_THROW_IF_NOT (is_trained); + std::unique_ptr idx (new int64_t [n]); + quantizer->assign (n, x, idx.get()); + size_t nadd = 0; + std::unique_ptr squant(sq.select_quantizer ()); + +#pragma omp parallel reduction(+: nadd) + { + std::vector residual (d); + std::vector one_code (code_size); + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + + // each thread takes care of a subset of lists + for (size_t i = 0; i < n; i++) { + int64_t list_no = idx [i]; + if (list_no >= 0 && list_no % nt == rank) { + int64_t id = xids ? xids[i] : ntotal + i; + + const float * xi = x + i * d; + if (by_residual) { + quantizer->compute_residual (xi, residual.data(), list_no); + xi = residual.data(); + } + + memset (one_code.data(), 0, code_size); + squant->encode_vector (xi, one_code.data()); + + invlists->add_entry (list_no, id, one_code.data()); + + nadd++; + + } + } + } + ntotal += n; +} + + + + + +InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner + (bool store_pairs) const +{ + return sq.select_InvertedListScanner (metric_type, quantizer, store_pairs, + by_residual); +} + + +void IndexIVFScalarQuantizer::reconstruct_from_offset (int64_t list_no, + int64_t offset, + float* recons) const +{ + std::vector centroid(d); + quantizer->reconstruct (list_no, centroid.data()); + + const uint8_t* code = invlists->get_single_code (list_no, offset); + sq.decode (code, recons, 1); + for (int i = 0; i < d; ++i) { + recons[i] += centroid[i]; + } +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h new file mode 100644 index 0000000000..bb0e20b65f --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h @@ -0,0 +1,127 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INDEX_SCALAR_QUANTIZER_H +#define FAISS_INDEX_SCALAR_QUANTIZER_H + +#include +#include + +#include +#include + + +namespace faiss { + +/** + * The uniform quantizer has a range [vmin, vmax]. 
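The standalone codec interface (sa_code_size / sa_encode / sa_decode) implemented above can also be used without keeping vectors in the index. A short sketch, assuming an already-trained IndexScalarQuantizer like the one in the previous example; the function name and buffer layout are assumptions:

```cpp
#include <cstdint>
#include <vector>

#include <faiss/IndexScalarQuantizer.h>

void sq_roundtrip(const faiss::IndexScalarQuantizer& index,
                  const float* x, size_t n, size_t d) {
    std::vector<uint8_t> codes(n * index.sa_code_size());
    index.sa_encode(n, x, codes.data());               // sq.code_size bytes per vector

    std::vector<float> decoded(n * d);
    index.sa_decode(n, codes.data(), decoded.data());  // approximate reconstruction
}
```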
The range can be + * the same for all dimensions (uniform) or specific per dimension + * (default). + */ + + + + +struct IndexScalarQuantizer: Index { + /// Used to encode the vectors + ScalarQuantizer sq; + + /// Codes. Size ntotal * pq.code_size + std::vector codes; + + size_t code_size; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param M number of subquantizers + * @param nbits number of bit per subvector index + */ + IndexScalarQuantizer (int d, + ScalarQuantizer::QuantizerType qtype, + MetricType metric = METRIC_L2); + + IndexScalarQuantizer (); + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void reset() override; + + void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; + + void reconstruct(idx_t key, float* recons) const override; + + DistanceComputer *get_distance_computer () const override; + + /* standalone codec interface */ + size_t sa_code_size () const override; + + void sa_encode (idx_t n, const float *x, + uint8_t *bytes) const override; + + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + + +}; + + + /** An IVF implementation where the components of the residuals are + * encoded with a scalar uniform quantizer. All distance computations + * are asymmetric, so the encoded vectors are decoded and approximate + * distances are computed. + */ + +struct IndexIVFScalarQuantizer: IndexIVF { + ScalarQuantizer sq; + bool by_residual; + + IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist, + ScalarQuantizer::QuantizerType qtype, + MetricType metric = METRIC_L2, + bool encode_residual = true); + + IndexIVFScalarQuantizer(); + + void train_residual(idx_t n, const float* x) override; + + void encode_vectors(idx_t n, const float* x, + const idx_t *list_nos, + uint8_t * codes, + bool include_listnos=false) const override; + + void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + + InvertedListScanner *get_InvertedListScanner (bool store_pairs) + const override; + + + void reconstruct_from_offset (int64_t list_no, int64_t offset, + float* recons) const override; + + /* standalone codec interface */ + void sa_decode (idx_t n, const uint8_t *bytes, + float *x) const override; + +}; + + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/IndexShards.cpp b/core/src/index/thirdparty/faiss/IndexShards.cpp new file mode 100644 index 0000000000..ac6c605d7c --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexShards.cpp @@ -0,0 +1,317 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include +#include + +namespace faiss { + +// subroutines +namespace { + +typedef Index::idx_t idx_t; + + +// add translation to all valid labels +void translate_labels (long n, idx_t *labels, long translation) +{ + if (translation == 0) return; + for (long i = 0; i < n; i++) { + if(labels[i] < 0) continue; + labels[i] += translation; + } +} + + +/** merge result tables from several shards. 
+ * @param all_distances size nshard * n * k + * @param all_labels idem + * @param translartions label translations to apply, size nshard + */ + +template +void +merge_tables(long n, long k, long nshard, + typename IndexClass::distance_t *distances, + idx_t *labels, + const std::vector& all_distances, + const std::vector& all_labels, + const std::vector& translations) { + if (k == 0) { + return; + } + using distance_t = typename IndexClass::distance_t; + + long stride = n * k; +#pragma omp parallel + { + std::vector buf (2 * nshard); + int * pointer = buf.data(); + int * shard_ids = pointer + nshard; + std::vector buf2 (nshard); + distance_t * heap_vals = buf2.data(); +#pragma omp for + for (long i = 0; i < n; i++) { + // the heap maps values to the shard where they are + // produced. + const distance_t *D_in = all_distances.data() + i * k; + const idx_t *I_in = all_labels.data() + i * k; + int heap_size = 0; + + for (long s = 0; s < nshard; s++) { + pointer[s] = 0; + if (I_in[stride * s] >= 0) { + heap_push (++heap_size, heap_vals, shard_ids, + D_in[stride * s], s); + } + } + + distance_t *D = distances + i * k; + idx_t *I = labels + i * k; + + for (int j = 0; j < k; j++) { + if (heap_size == 0) { + I[j] = -1; + D[j] = C::neutral(); + } else { + // pop best element + int s = shard_ids[0]; + int & p = pointer[s]; + D[j] = heap_vals[0]; + I[j] = I_in[stride * s + p] + translations[s]; + + heap_pop (heap_size--, heap_vals, shard_ids); + p++; + if (p < k && I_in[stride * s + p] >= 0) { + heap_push (++heap_size, heap_vals, shard_ids, + D_in[stride * s + p], s); + } + } + } + } + } +} + +} // anonymous namespace + +template +IndexShardsTemplate::IndexShardsTemplate(idx_t d, + bool threaded, + bool successive_ids) + : ThreadedIndex(d, threaded), + successive_ids(successive_ids) { +} + +template +IndexShardsTemplate::IndexShardsTemplate(int d, + bool threaded, + bool successive_ids) + : ThreadedIndex(d, threaded), + successive_ids(successive_ids) { +} + +template +IndexShardsTemplate::IndexShardsTemplate(bool threaded, + bool successive_ids) + : ThreadedIndex(threaded), + successive_ids(successive_ids) { +} + +template +void +IndexShardsTemplate::onAfterAddIndex(IndexT* index /* unused */) { + sync_with_shard_indexes(); +} + +template +void +IndexShardsTemplate::onAfterRemoveIndex(IndexT* index /* unused */) { + sync_with_shard_indexes(); +} + +template +void +IndexShardsTemplate::sync_with_shard_indexes() { + if (!this->count()) { + this->is_trained = false; + this->ntotal = 0; + + return; + } + + auto firstIndex = this->at(0); + this->metric_type = firstIndex->metric_type; + this->is_trained = firstIndex->is_trained; + this->ntotal = firstIndex->ntotal; + + for (int i = 1; i < this->count(); ++i) { + auto index = this->at(i); + FAISS_THROW_IF_NOT(this->metric_type == index->metric_type); + FAISS_THROW_IF_NOT(this->d == index->d); + + this->ntotal += index->ntotal; + } +} + +template +void +IndexShardsTemplate::train(idx_t n, + const component_t *x) { + auto fn = + [n, x](int no, IndexT *index) { + if (index->verbose) { + printf("begin train shard %d on %ld points\n", no, n); + } + + index->train(n, x); + + if (index->verbose) { + printf("end train shard %d\n", no); + } + }; + + this->runOnIndex(fn); + sync_with_shard_indexes(); +} + +template +void +IndexShardsTemplate::add(idx_t n, + const component_t *x) { + add_with_ids(n, x, nullptr); +} + +template +void +IndexShardsTemplate::add_with_ids(idx_t n, + const component_t * x, + const idx_t *xids) { + + FAISS_THROW_IF_NOT_MSG(!(successive_ids 
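merge_tables above combines the per-shard top-k lists into one global top-k using a small heap keyed by shard. The same idea written with std::priority_queue, purely as an illustration for a single query (this is not the faiss implementation; names and layout are assumptions):

```cpp
#include <cstdint>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

// Merge nshard sorted top-k result lists (ascending distances, e.g. L2) into
// one global top-k. dist/label hold shard s in entries [s * k, (s + 1) * k);
// out_dist/out_label must already have size k.
void merge_topk(size_t nshard, size_t k,
                const std::vector<float>& dist,
                const std::vector<int64_t>& label,
                std::vector<float>& out_dist,
                std::vector<int64_t>& out_label) {
    using Entry = std::pair<float, size_t>;                       // (distance, shard)
    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;
    std::vector<size_t> pos(nshard, 0);                           // cursor into each shard

    for (size_t s = 0; s < nshard; s++)
        if (label[s * k] >= 0) heap.push({dist[s * k], s});

    for (size_t j = 0; j < k; j++) {
        if (heap.empty()) {                                       // ran out of valid results
            out_dist[j] = std::numeric_limits<float>::max();
            out_label[j] = -1;
            continue;
        }
        auto [dmin, s] = heap.top();
        heap.pop();
        out_dist[j] = dmin;
        out_label[j] = label[s * k + pos[s]];
        if (++pos[s] < k && label[s * k + pos[s]] >= 0)           // advance that shard
            heap.push({dist[s * k + pos[s]], s});
    }
}
```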
&& xids), + "It makes no sense to pass in ids and " + "request them to be shifted"); + + if (successive_ids) { + FAISS_THROW_IF_NOT_MSG(!xids, + "It makes no sense to pass in ids and " + "request them to be shifted"); + FAISS_THROW_IF_NOT_MSG(this->ntotal == 0, + "when adding to IndexShards with sucessive_ids, " + "only add() in a single pass is supported"); + } + + idx_t nshard = this->count(); + const idx_t *ids = xids; + + std::vector aids; + + if (!ids && !successive_ids) { + aids.resize(n); + + for (idx_t i = 0; i < n; i++) { + aids[i] = this->ntotal + i; + } + + ids = aids.data(); + } + + size_t components_per_vec = + sizeof(component_t) == 1 ? (this->d + 7) / 8 : this->d; + + auto fn = + [n, ids, x, nshard, components_per_vec](int no, IndexT *index) { + idx_t i0 = (idx_t) no * n / nshard; + idx_t i1 = ((idx_t) no + 1) * n / nshard; + auto x0 = x + i0 * components_per_vec; + + if (index->verbose) { + printf ("begin add shard %d on %ld points\n", no, n); + } + + if (ids) { + index->add_with_ids (i1 - i0, x0, ids + i0); + } else { + index->add (i1 - i0, x0); + } + + if (index->verbose) { + printf ("end add shard %d on %ld points\n", no, i1 - i0); + } + }; + + this->runOnIndex(fn); + + // This is safe to do here because the current thread controls execution in + // all threads, and nothing else is happening + this->ntotal += n; +} + +template +void +IndexShardsTemplate::search(idx_t n, + const component_t *x, + idx_t k, + distance_t *distances, + idx_t *labels) const { + long nshard = this->count(); + + std::vector all_distances(nshard * k * n); + std::vector all_labels(nshard * k * n); + + auto fn = + [n, k, x, &all_distances, &all_labels](int no, const IndexT *index) { + if (index->verbose) { + printf ("begin query shard %d on %ld points\n", no, n); + } + + index->search (n, x, k, + all_distances.data() + no * k * n, + all_labels.data() + no * k * n); + + if (index->verbose) { + printf ("end query shard %d\n", no); + } + }; + + this->runOnIndex(fn); + + std::vector translations(nshard, 0); + + // Because we just called runOnIndex above, it is safe to access the sub-index + // ntotal here + if (successive_ids) { + translations[0] = 0; + + for (int s = 0; s + 1 < nshard; s++) { + translations[s + 1] = translations[s] + this->at(s)->ntotal; + } + } + + if (this->metric_type == METRIC_L2) { + merge_tables>( + n, k, nshard, distances, labels, + all_distances, all_labels, translations); + } else { + merge_tables>( + n, k, nshard, distances, labels, + all_distances, all_labels, translations); + } +} + +// explicit instanciations +template struct IndexShardsTemplate; +template struct IndexShardsTemplate; + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexShards.h b/core/src/index/thirdparty/faiss/IndexShards.h new file mode 100644 index 0000000000..1bbc664b0a --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexShards.h @@ -0,0 +1,100 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include + +namespace faiss { + +/** + * Index that concatenates the results from several sub-indexes + */ +template +struct IndexShardsTemplate : public ThreadedIndex { + using idx_t = typename IndexT::idx_t; + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + /** + * The dimension that all sub-indices must share will be the dimension of the + * first sub-index added + * + * @param threaded do we use one thread per sub_index or do + * queries sequentially? + * @param successive_ids should we shift the returned ids by + * the size of each sub-index or return them + * as they are? + */ + explicit IndexShardsTemplate(bool threaded = false, + bool successive_ids = true); + + /** + * @param threaded do we use one thread per sub_index or do + * queries sequentially? + * @param successive_ids should we shift the returned ids by + * the size of each sub-index or return them + * as they are? + */ + explicit IndexShardsTemplate(idx_t d, + bool threaded = false, + bool successive_ids = true); + + /// int version due to the implicit bool conversion ambiguity of int as + /// dimension + explicit IndexShardsTemplate(int d, + bool threaded = false, + bool successive_ids = true); + + /// Alias for addIndex() + void add_shard(IndexT* index) { this->addIndex(index); } + + /// Alias for removeIndex() + void remove_shard(IndexT* index) { this->removeIndex(index); } + + /// supported only for sub-indices that implement add_with_ids + void add(idx_t n, const component_t* x) override; + + /** + * Cases (successive_ids, xids): + * - true, non-NULL ERROR: it makes no sense to pass in ids and + * request them to be shifted + * - true, NULL OK, but should be called only once (calls add() + * on sub-indexes). + * - false, non-NULL OK: will call add_with_ids with passed in xids + * distributed evenly over shards + * - false, NULL OK: will call add_with_ids on each sub-index, + * starting at ntotal + */ + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override; + + void search(idx_t n, const component_t* x, idx_t k, + distance_t* distances, idx_t* labels) const override; + + void train(idx_t n, const component_t* x) override; + + // update metric_type and ntotal. Call if you changes something in + // the shard indexes. + void sync_with_shard_indexes(); + + bool successive_ids; + + protected: + /// Called just after an index is added + void onAfterAddIndex(IndexT* index) override; + + /// Called just after an index is removed + void onAfterRemoveIndex(IndexT* index) override; +}; + +using IndexShards = IndexShardsTemplate; +using IndexBinaryShards = IndexShardsTemplate; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/InvertedLists.cpp b/core/src/index/thirdparty/faiss/InvertedLists.cpp new file mode 100644 index 0000000000..a438020fe9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/InvertedLists.cpp @@ -0,0 +1,805 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
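A minimal usage sketch of the shard wrapper declared in the header above (not part of the patch; sizes and data are assumptions): the dataset is distributed across two IndexFlatL2 sub-indexes, and with successive_ids = true the ids returned by search are shifted by each shard's size so they stay globally unique.

```cpp
#include <vector>

#include <faiss/IndexFlat.h>
#include <faiss/IndexShards.h>

int main() {
    const int d = 64, nb = 1000, nq = 4, k = 5;
    std::vector<float> xb(nb * d, 0.5f);

    faiss::IndexFlatL2 s0(d), s1(d);
    faiss::IndexShards shards(d, /*threaded=*/true, /*successive_ids=*/true);
    shards.add_shard(&s0);
    shards.add_shard(&s1);
    shards.add(nb, xb.data());             // split evenly: each shard receives nb / 2 vectors

    std::vector<float> dist(nq * k);
    std::vector<faiss::Index::idx_t> ids(nq * k);
    shards.search(nq, xb.data(), k, dist.data(), ids.data());   // per-shard results merged
    return 0;
}
```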
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include +#include "gpu/utils/DeviceUtils.h" +#include "cuda.h" +#include "cuda_runtime.h" + +namespace faiss { + +PageLockMemory::PageLockMemory(size_t size) : nbytes(size) { + CUDA_VERIFY(cudaHostAlloc(&data, size, 0)); +} + +PageLockMemory::~PageLockMemory() { + CUDA_VERIFY(cudaFreeHost((void*)data)); +} + +PageLockMemory::PageLockMemory(const PageLockMemory& other) { + CUDA_VERIFY(cudaHostAlloc(&data, other.nbytes, 0)); + memcpy(data, other.data, other.nbytes); + nbytes = other.nbytes; +} + +PageLockMemory::PageLockMemory(PageLockMemory &&other) { + data = other.data; + nbytes = other.nbytes; + other.data = nullptr; + other.nbytes = 0; +} +} + +namespace faiss { + +using ScopedIds = InvertedLists::ScopedIds; +using ScopedCodes = InvertedLists::ScopedCodes; + + +/***************************************** + * InvertedLists implementation + ******************************************/ + +InvertedLists::InvertedLists (size_t nlist, size_t code_size): + nlist (nlist), code_size (code_size) +{ +} + +InvertedLists::~InvertedLists () +{} + +InvertedLists::idx_t InvertedLists::get_single_id ( + size_t list_no, size_t offset) const +{ + assert (offset < list_size (list_no)); + return get_ids(list_no)[offset]; +} + + +void InvertedLists::release_codes (size_t, const uint8_t *) const +{} + +void InvertedLists::release_ids (size_t, const idx_t *) const +{} + +void InvertedLists::prefetch_lists (const idx_t *, int) const +{} + +const uint8_t * InvertedLists::get_single_code ( + size_t list_no, size_t offset) const +{ + assert (offset < list_size (list_no)); + return get_codes(list_no) + offset * code_size; +} + +size_t InvertedLists::add_entry (size_t list_no, idx_t theid, + const uint8_t *code) +{ + return add_entries (list_no, 1, &theid, code); +} + +void InvertedLists::update_entry (size_t list_no, size_t offset, + idx_t id, const uint8_t *code) +{ + update_entries (list_no, offset, 1, &id, code); +} + +InvertedLists* InvertedLists::to_readonly() { + return nullptr; +} + +bool InvertedLists::is_readonly() const { + return false; +} + +void InvertedLists::reset () { + for (size_t i = 0; i < nlist; i++) { + resize (i, 0); + } +} + +void InvertedLists::merge_from (InvertedLists *oivf, size_t add_id) { + +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + size_t list_size = oivf->list_size (i); + ScopedIds ids (oivf, i); + if (add_id == 0) { + add_entries (i, list_size, ids.get (), + ScopedCodes (oivf, i).get()); + } else { + std::vector new_ids (list_size); + + for (size_t j = 0; j < list_size; j++) { + new_ids [j] = ids[j] + add_id; + } + add_entries (i, list_size, new_ids.data(), + ScopedCodes (oivf, i).get()); + } + oivf->resize (i, 0); + } +} + +double InvertedLists::imbalance_factor () const { + std::vector hist(nlist); + + for (size_t i = 0; i < nlist; i++) { + hist[i] = list_size(i); + } + + return faiss::imbalance_factor(nlist, hist.data()); +} + +void InvertedLists::print_stats () const { + std::vector sizes(40); + for (size_t i = 0; i < nlist; i++) { + for (size_t j = 0; j < sizes.size(); j++) { + if ((list_size(i) >> j) == 0) { + sizes[j]++; + break; + } + } + } + for (size_t i = 0; i < sizes.size(); i++) { + if (sizes[i]) { + printf("list size in < %d: %d instances\n", 1 << i, sizes[i]); + } + } +} + +size_t InvertedLists::compute_ntotal () const { + size_t tot = 0; + for (size_t i = 0; i < nlist; i++) { + tot += list_size(i); + } + return tot; +} + +/***************************************** + * 
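PageLockMemory above wraps a buffer allocated with cudaHostAlloc (page-locked host memory) so the read-only inverted lists can be copied to the GPU without an extra staging step. A hedged sketch of pinning a code buffer the same way ReadOnlyArrayInvertedLists does below; it requires the CUDA runtime, and the function name is an assumption:

```cpp
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include <faiss/InvertedLists.h>

// Copy an arbitrary code buffer into pinned host memory.
faiss::PageLockMemoryPtr pin_codes(const std::vector<uint8_t>& codes) {
    auto pinned = std::make_shared<faiss::PageLockMemory>(codes.size());
    std::memcpy(pinned->data, codes.data(), codes.size());
    return pinned;   // cudaMemcpy from pinned->data can now take the faster pinned path
}
```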
ArrayInvertedLists implementation + ******************************************/ + +ArrayInvertedLists::ArrayInvertedLists (size_t nlist, size_t code_size): + InvertedLists (nlist, code_size) +{ + ids.resize (nlist); + codes.resize (nlist); +} + +size_t ArrayInvertedLists::add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids_in, const uint8_t *code) +{ + if (n_entry == 0) return 0; + assert (list_no < nlist); + size_t o = ids [list_no].size(); + ids [list_no].resize (o + n_entry); + memcpy (&ids[list_no][o], ids_in, sizeof (ids_in[0]) * n_entry); + codes [list_no].resize ((o + n_entry) * code_size); + memcpy (&codes[list_no][o * code_size], code, code_size * n_entry); + return o; +} + +size_t ArrayInvertedLists::list_size(size_t list_no) const +{ + assert (list_no < nlist); + return ids[list_no].size(); +} + +const uint8_t * ArrayInvertedLists::get_codes (size_t list_no) const +{ + assert (list_no < nlist); + return codes[list_no].data(); +} + + +const InvertedLists::idx_t * ArrayInvertedLists::get_ids (size_t list_no) const +{ + assert (list_no < nlist); + return ids[list_no].data(); +} + +void ArrayInvertedLists::resize (size_t list_no, size_t new_size) +{ + ids[list_no].resize (new_size); + codes[list_no].resize (new_size * code_size); +} + +void ArrayInvertedLists::update_entries ( + size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids_in, const uint8_t *codes_in) +{ + assert (list_no < nlist); + assert (n_entry + offset <= ids[list_no].size()); + memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); + memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry); +} + +InvertedLists* ArrayInvertedLists::to_readonly() { + ReadOnlyArrayInvertedLists* readonly = new ReadOnlyArrayInvertedLists(*this); + return readonly; +} + +ArrayInvertedLists::~ArrayInvertedLists () +{} + +/***************************************************************** + * ReadOnlyArrayInvertedLists implementations + *****************************************************************/ + +ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(size_t nlist, + size_t code_size, const std::vector& list_length) + : InvertedLists (nlist, code_size), + readonly_length(list_length) { + valid = readonly_length.size() == nlist; + if (!valid) { + FAISS_THROW_MSG ("Invalid list_length"); + return; + } + auto total_size = std::accumulate(readonly_length.begin(), readonly_length.end(), 0); + readonly_offset.reserve(nlist); + + size_t offset = 0; + for (auto i=0; i readonly_codes; + std::vector readonly_ids; + readonly_length.reserve(nlist); + size_t offset = 0; + for (auto& list_ids : other.ids) { + readonly_length.emplace_back(list_ids.size()); + readonly_offset.emplace_back(offset); + offset += list_ids.size(); + readonly_ids.insert(readonly_ids.end(), list_ids.begin(), list_ids.end()); + } + + for(auto& list_codes : other.codes) { + readonly_codes.insert(readonly_codes.end(), list_codes.begin(), list_codes.end()); + } + + // convert to page-lock memory + { + size_t size = readonly_codes.size() * sizeof(uint8_t); + pin_readonly_codes = std::make_shared(size); + memcpy(pin_readonly_codes->data, readonly_codes.data(), size); + } + { + size_t size = readonly_ids.size() * sizeof(idx_t); + pin_readonly_ids = std::make_shared(size); + memcpy(pin_readonly_ids->data, readonly_ids.data(), size); + } + + valid = true; +} + +//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &other) +// : InvertedLists (other.nlist, other.code_size) { +// 
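An illustrative sketch of ArrayInvertedLists used directly and then frozen into the read-only variant via to_readonly() (not from the patch; list numbers, ids, and code bytes are made up, and the read-only variant pins its buffers with CUDA, so this needs the CUDA runtime in this vendored tree):

```cpp
#include <cstdint>
#include <memory>
#include <vector>

#include <faiss/InvertedLists.h>

int main() {
    const size_t nlist = 4, code_size = 8;
    faiss::ArrayInvertedLists lists(nlist, code_size);

    std::vector<uint8_t> code(code_size, 0x2a);
    lists.add_entry(/*list_no=*/1, /*id=*/42, code.data());      // one vector in list 1

    std::unique_ptr<faiss::InvertedLists> readonly(lists.to_readonly());
    return (readonly->is_readonly() && readonly->list_size(1) == 1) ? 0 : 1;
}
```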
readonly_length = other.readonly_length; +// readonly_offset = other.readonly_offset; +// pin_readonly_codes = std::make_shared(*other.pin_readonly_codes); +// pin_readonly_ids = std::make_shared(*other.pin_readonly_ids); +// valid = true; +//} + +//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(ReadOnlyArrayInvertedLists &&other) +// : InvertedLists (other.nlist, other.code_size) { +// readonly_length = std::move(other.readonly_length); +// readonly_offset = std::move(other.readonly_offset); +// pin_readonly_codes = other.pin_readonly_codes; +// pin_readonly_ids = other.pin_readonly_ids; +// +// other.pin_readonly_codes = nullptr; +// other.pin_readonly_ids = nullptr; +// valid = true; +//} + +ReadOnlyArrayInvertedLists::~ReadOnlyArrayInvertedLists() { +} + +bool +ReadOnlyArrayInvertedLists::is_valid() { + return valid; +} + +size_t ReadOnlyArrayInvertedLists::add_entries ( + size_t , size_t , + const idx_t* , const uint8_t *) +{ + FAISS_THROW_MSG ("not implemented"); +} + +void ReadOnlyArrayInvertedLists::update_entries (size_t, size_t , size_t , + const idx_t *, const uint8_t *) +{ + FAISS_THROW_MSG ("not implemented"); +} + +void ReadOnlyArrayInvertedLists::resize (size_t , size_t ) +{ + FAISS_THROW_MSG ("not implemented"); +} + +size_t ReadOnlyArrayInvertedLists::list_size(size_t list_no) const +{ + FAISS_ASSERT(list_no < nlist && valid); + return readonly_length[list_no]; +} + +const uint8_t * ReadOnlyArrayInvertedLists::get_codes (size_t list_no) const +{ + FAISS_ASSERT(list_no < nlist && valid); + uint8_t *pcodes = (uint8_t *)(pin_readonly_codes->data); + return pcodes + readonly_offset[list_no] * code_size; +} + +const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_ids (size_t list_no) const +{ + FAISS_ASSERT(list_no < nlist && valid); + idx_t *pids = (idx_t *)pin_readonly_ids->data; + return pids + readonly_offset[list_no]; +} + +const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_all_ids() const { + FAISS_ASSERT(valid); + return (idx_t *)(pin_readonly_ids->data); +} + +const uint8_t* ReadOnlyArrayInvertedLists::get_all_codes() const { + FAISS_ASSERT(valid); + return (uint8_t *)(pin_readonly_codes->data); +} + +const std::vector& ReadOnlyArrayInvertedLists::get_list_length() const { + FAISS_ASSERT(valid); + return readonly_length; +} + +bool ReadOnlyArrayInvertedLists::is_readonly() const { + FAISS_ASSERT(valid); + return true; +} + +/***************************************************************** + * Meta-inverted list implementations + *****************************************************************/ + + +size_t ReadOnlyInvertedLists::add_entries ( + size_t , size_t , + const idx_t* , const uint8_t *) +{ + FAISS_THROW_MSG ("not implemented"); +} + +void ReadOnlyInvertedLists::update_entries (size_t, size_t , size_t , + const idx_t *, const uint8_t *) +{ + FAISS_THROW_MSG ("not implemented"); +} + +void ReadOnlyInvertedLists::resize (size_t , size_t ) +{ + FAISS_THROW_MSG ("not implemented"); +} + + + +/***************************************** + * HStackInvertedLists implementation + ******************************************/ + +HStackInvertedLists::HStackInvertedLists ( + int nil, const InvertedLists **ils_in): + ReadOnlyInvertedLists (nil > 0 ? ils_in[0]->nlist : 0, + nil > 0 ? 
ils_in[0]->code_size : 0) +{ + FAISS_THROW_IF_NOT (nil > 0); + for (int i = 0; i < nil; i++) { + ils.push_back (ils_in[i]); + FAISS_THROW_IF_NOT (ils_in[i]->code_size == code_size && + ils_in[i]->nlist == nlist); + } +} + +size_t HStackInvertedLists::list_size(size_t list_no) const +{ + size_t sz = 0; + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + sz += il->list_size (list_no); + } + return sz; +} + +const uint8_t * HStackInvertedLists::get_codes (size_t list_no) const +{ + uint8_t *codes = new uint8_t [code_size * list_size(list_no)], *c = codes; + + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + size_t sz = il->list_size(list_no) * code_size; + if (sz > 0) { + memcpy (c, ScopedCodes (il, list_no).get(), sz); + c += sz; + } + } + return codes; +} + +const uint8_t * HStackInvertedLists::get_single_code ( + size_t list_no, size_t offset) const +{ + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + size_t sz = il->list_size (list_no); + if (offset < sz) { + // here we have to copy the code, otherwise it will crash at dealloc + uint8_t * code = new uint8_t [code_size]; + memcpy (code, ScopedCodes (il, list_no, offset).get(), code_size); + return code; + } + offset -= sz; + } + FAISS_THROW_FMT ("offset %ld unknown", offset); +} + + +void HStackInvertedLists::release_codes (size_t, const uint8_t *codes) const { + delete [] codes; +} + +const Index::idx_t * HStackInvertedLists::get_ids (size_t list_no) const +{ + idx_t *ids = new idx_t [list_size(list_no)], *c = ids; + + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + size_t sz = il->list_size(list_no); + if (sz > 0) { + memcpy (c, ScopedIds (il, list_no).get(), sz * sizeof(idx_t)); + c += sz; + } + } + return ids; +} + +Index::idx_t HStackInvertedLists::get_single_id ( + size_t list_no, size_t offset) const +{ + + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + size_t sz = il->list_size (list_no); + if (offset < sz) { + return il->get_single_id (list_no, offset); + } + offset -= sz; + } + FAISS_THROW_FMT ("offset %ld unknown", offset); +} + + +void HStackInvertedLists::release_ids (size_t, const idx_t *ids) const { + delete [] ids; +} + +void HStackInvertedLists::prefetch_lists (const idx_t *list_nos, int nlist) const +{ + for (int i = 0; i < ils.size(); i++) { + const InvertedLists *il = ils[i]; + il->prefetch_lists (list_nos, nlist); + } +} + +/***************************************** + * SliceInvertedLists implementation + ******************************************/ + + +namespace { + + using idx_t = InvertedLists::idx_t; + + idx_t translate_list_no (const SliceInvertedLists *sil, + idx_t list_no) { + FAISS_THROW_IF_NOT (list_no >= 0 && list_no < sil->nlist); + return list_no + sil->i0; + } + +}; + + + +SliceInvertedLists::SliceInvertedLists ( + const InvertedLists *il, idx_t i0, idx_t i1): + ReadOnlyInvertedLists (i1 - i0, il->code_size), + il (il), i0(i0), i1(i1) +{ + +} + +size_t SliceInvertedLists::list_size(size_t list_no) const +{ + return il->list_size (translate_list_no (this, list_no)); +} + +const uint8_t * SliceInvertedLists::get_codes (size_t list_no) const +{ + return il->get_codes (translate_list_no (this, list_no)); +} + +const uint8_t * SliceInvertedLists::get_single_code ( + size_t list_no, size_t offset) const +{ + return il->get_single_code (translate_list_no (this, list_no), offset); +} + + +void SliceInvertedLists::release_codes ( + size_t list_no, const uint8_t *codes) const 
{ + return il->release_codes (translate_list_no (this, list_no), codes); +} + +const Index::idx_t * SliceInvertedLists::get_ids (size_t list_no) const +{ + return il->get_ids (translate_list_no (this, list_no)); +} + +Index::idx_t SliceInvertedLists::get_single_id ( + size_t list_no, size_t offset) const +{ + return il->get_single_id (translate_list_no (this, list_no), offset); +} + + +void SliceInvertedLists::release_ids (size_t list_no, const idx_t *ids) const { + return il->release_ids (translate_list_no (this, list_no), ids); +} + +void SliceInvertedLists::prefetch_lists (const idx_t *list_nos, int nlist) const +{ + std::vector translated_list_nos; + for (int j = 0; j < nlist; j++) { + idx_t list_no = list_nos[j]; + if (list_no < 0) continue; + translated_list_nos.push_back (translate_list_no (this, list_no)); + } + il->prefetch_lists (translated_list_nos.data(), + translated_list_nos.size()); +} + + +/***************************************** + * VStackInvertedLists implementation + ******************************************/ + +namespace { + + using idx_t = InvertedLists::idx_t; + + // find the invlist this number belongs to + int translate_list_no (const VStackInvertedLists *vil, + idx_t list_no) { + FAISS_THROW_IF_NOT (list_no >= 0 && list_no < vil->nlist); + int i0 = 0, i1 = vil->ils.size(); + const idx_t *cumsz = vil->cumsz.data(); + while (i0 + 1 < i1) { + int imed = (i0 + i1) / 2; + if (list_no >= cumsz[imed]) { + i0 = imed; + } else { + i1 = imed; + } + } + assert(list_no >= cumsz[i0] && list_no < cumsz[i0 + 1]); + return i0; + } + + idx_t sum_il_sizes (int nil, const InvertedLists **ils_in) { + idx_t tot = 0; + for (int i = 0; i < nil; i++) { + tot += ils_in[i]->nlist; + } + return tot; + } + +}; + + + +VStackInvertedLists::VStackInvertedLists ( + int nil, const InvertedLists **ils_in): + ReadOnlyInvertedLists (sum_il_sizes(nil, ils_in), + nil > 0 ? 
ils_in[0]->code_size : 0) +{ + FAISS_THROW_IF_NOT (nil > 0); + cumsz.resize (nil + 1); + for (int i = 0; i < nil; i++) { + ils.push_back (ils_in[i]); + FAISS_THROW_IF_NOT (ils_in[i]->code_size == code_size); + cumsz[i + 1] = cumsz[i] + ils_in[i]->nlist; + } +} + +size_t VStackInvertedLists::list_size(size_t list_no) const +{ + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->list_size (list_no); +} + +const uint8_t * VStackInvertedLists::get_codes (size_t list_no) const +{ + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->get_codes (list_no); +} + +const uint8_t * VStackInvertedLists::get_single_code ( + size_t list_no, size_t offset) const +{ + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->get_single_code (list_no, offset); +} + + +void VStackInvertedLists::release_codes ( + size_t list_no, const uint8_t *codes) const { + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->release_codes (list_no, codes); +} + +const Index::idx_t * VStackInvertedLists::get_ids (size_t list_no) const +{ + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->get_ids (list_no); +} + +Index::idx_t VStackInvertedLists::get_single_id ( + size_t list_no, size_t offset) const +{ + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->get_single_id (list_no, offset); +} + + +void VStackInvertedLists::release_ids (size_t list_no, const idx_t *ids) const { + int i = translate_list_no (this, list_no); + list_no -= cumsz[i]; + return ils[i]->release_ids (list_no, ids); +} + +void VStackInvertedLists::prefetch_lists ( + const idx_t *list_nos, int nlist) const +{ + std::vector ilno (nlist, -1); + std::vector n_per_il (ils.size(), 0); + for (int j = 0; j < nlist; j++) { + idx_t list_no = list_nos[j]; + if (list_no < 0) continue; + int i = ilno[j] = translate_list_no (this, list_no); + n_per_il[i]++; + } + std::vector cum_n_per_il (ils.size() + 1, 0); + for (int j = 0; j < ils.size(); j++) { + cum_n_per_il[j + 1] = cum_n_per_il[j] + n_per_il[j]; + } + std::vector sorted_list_nos (cum_n_per_il.back()); + for (int j = 0; j < nlist; j++) { + idx_t list_no = list_nos[j]; + if (list_no < 0) continue; + int i = ilno[j]; + list_no -= cumsz[i]; + sorted_list_nos[cum_n_per_il[i]++] = list_no; + } + + int i0 = 0; + for (int j = 0; j < ils.size(); j++) { + int i1 = i0 + n_per_il[j]; + if (i1 > i0) { + ils[j]->prefetch_lists (sorted_list_nos.data() + i0, + i1 - i0); + } + i0 = i1; + } +} + + + +/***************************************** + * MaskedInvertedLists implementation + ******************************************/ + + +MaskedInvertedLists::MaskedInvertedLists (const InvertedLists *il0, + const InvertedLists *il1): + ReadOnlyInvertedLists (il0->nlist, il0->code_size), + il0 (il0), il1 (il1) +{ + FAISS_THROW_IF_NOT (il1->nlist == nlist); + FAISS_THROW_IF_NOT (il1->code_size == code_size); +} + +size_t MaskedInvertedLists::list_size(size_t list_no) const +{ + size_t sz = il0->list_size(list_no); + return sz ? sz : il1->list_size(list_no); +} + +const uint8_t * MaskedInvertedLists::get_codes (size_t list_no) const +{ + size_t sz = il0->list_size(list_no); + return (sz ? il0 : il1)->get_codes(list_no); +} + +const idx_t * MaskedInvertedLists::get_ids (size_t list_no) const +{ + size_t sz = il0->list_size (list_no); + return (sz ? 
il0 : il1)->get_ids (list_no); +} + +void MaskedInvertedLists::release_codes ( + size_t list_no, const uint8_t *codes) const +{ + size_t sz = il0->list_size (list_no); + (sz ? il0 : il1)->release_codes (list_no, codes); +} + +void MaskedInvertedLists::release_ids (size_t list_no, const idx_t *ids) const +{ + size_t sz = il0->list_size (list_no); + (sz ? il0 : il1)->release_ids (list_no, ids); +} + +idx_t MaskedInvertedLists::get_single_id (size_t list_no, size_t offset) const +{ + size_t sz = il0->list_size (list_no); + return (sz ? il0 : il1)->get_single_id (list_no, offset); +} + +const uint8_t * MaskedInvertedLists::get_single_code ( + size_t list_no, size_t offset) const +{ + size_t sz = il0->list_size (list_no); + return (sz ? il0 : il1)->get_single_code (list_no, offset); +} + +void MaskedInvertedLists::prefetch_lists ( + const idx_t *list_nos, int nlist) const +{ + std::vector list0, list1; + for (int i = 0; i < nlist; i++) { + idx_t list_no = list_nos[i]; + if (list_no < 0) continue; + size_t sz = il0->list_size(list_no); + (sz ? list0 : list1).push_back (list_no); + } + il0->prefetch_lists (list0.data(), list0.size()); + il1->prefetch_lists (list1.data(), list1.size()); +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/InvertedLists.h b/core/src/index/thirdparty/faiss/InvertedLists.h new file mode 100644 index 0000000000..1e0d84d5d0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/InvertedLists.h @@ -0,0 +1,402 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_INVERTEDLISTS_IVF_H +#define FAISS_INVERTEDLISTS_IVF_H + +/** + * Definition of inverted lists + a few common classes that implement + * the interface. 
+ */ + +#include +#include +#include + +namespace faiss { + +struct PageLockMemory { +public: + PageLockMemory() : data(nullptr), nbytes(0) {} + + PageLockMemory(size_t size); + + ~PageLockMemory(); + + PageLockMemory(const PageLockMemory& other); + + PageLockMemory(PageLockMemory &&other); + + inline size_t size() { + return nbytes; + } + + void *data; + size_t nbytes; +}; +using PageLockMemoryPtr = std::shared_ptr; +} + +namespace faiss { + +/** Table of inverted lists + * multithreading rules: + * - concurrent read accesses are allowed + * - concurrent update accesses are allowed + * - for resize and add_entries, only concurrent access to different lists + * are allowed + */ +struct InvertedLists { + typedef Index::idx_t idx_t; + + size_t nlist; ///< number of possible key values + size_t code_size; ///< code size per vector in bytes + + InvertedLists (size_t nlist, size_t code_size); + + /************************* + * Read only functions */ + + /// get the size of a list + virtual size_t list_size(size_t list_no) const = 0; + + /** get the codes for an inverted list + * must be released by release_codes + * + * @return codes size list_size * code_size + */ + virtual const uint8_t * get_codes (size_t list_no) const = 0; + + /** get the ids for an inverted list + * must be released by release_ids + * + * @return ids size list_size + */ + virtual const idx_t * get_ids (size_t list_no) const = 0; + + /// release codes returned by get_codes (default implementation is nop + virtual void release_codes (size_t list_no, const uint8_t *codes) const; + + /// release ids returned by get_ids + virtual void release_ids (size_t list_no, const idx_t *ids) const; + + /// @return a single id in an inverted list + virtual idx_t get_single_id (size_t list_no, size_t offset) const; + + /// @return a single code in an inverted list + /// (should be deallocated with release_codes) + virtual const uint8_t * get_single_code ( + size_t list_no, size_t offset) const; + + /// prepare the following lists (default does nothing) + /// a list can be -1 hence the signed long + virtual void prefetch_lists (const idx_t *list_nos, int nlist) const; + + /************************* + * writing functions */ + + /// add one entry to an inverted list + virtual size_t add_entry (size_t list_no, idx_t theid, + const uint8_t *code); + + virtual size_t add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) = 0; + + virtual void update_entry (size_t list_no, size_t offset, + idx_t id, const uint8_t *code); + + virtual void update_entries (size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids, const uint8_t *code) = 0; + + virtual void resize (size_t list_no, size_t new_size) = 0; + + virtual void reset (); + + virtual InvertedLists* to_readonly(); + + virtual bool is_readonly() const; + + /// move all entries from oivf (empty on output) + void merge_from (InvertedLists *oivf, size_t add_id); + + virtual ~InvertedLists (); + + /************************* + * statistics */ + + /// 1= perfectly balanced, >1: imbalanced + double imbalance_factor () const; + + /// display some stats about the inverted lists + void print_stats () const; + + /// sum up list sizes + size_t compute_ntotal () const; + + /************************************** + * Scoped inverted lists (for automatic deallocation) + * + * instead of writing: + * + * uint8_t * codes = invlists->get_codes (10); + * ... use codes + * invlists->release_codes(10, codes) + * + * write: + * + * ScopedCodes codes (invlists, 10); + * ... 
use codes.get() + * // release called automatically when codes goes out of scope + * + * the following function call also works: + * + * foo (123, ScopedCodes (invlists, 10).get(), 456); + * + */ + + struct ScopedIds { + const InvertedLists *il; + const idx_t *ids; + size_t list_no; + + ScopedIds (const InvertedLists *il, size_t list_no): + il (il), ids (il->get_ids (list_no)), list_no (list_no) + {} + + const idx_t *get() {return ids; } + + idx_t operator [] (size_t i) const { + return ids[i]; + } + + ~ScopedIds () { + il->release_ids (list_no, ids); + } + }; + + struct ScopedCodes { + const InvertedLists *il; + const uint8_t *codes; + size_t list_no; + + ScopedCodes (const InvertedLists *il, size_t list_no): + il (il), codes (il->get_codes (list_no)), list_no (list_no) + {} + + ScopedCodes (const InvertedLists *il, size_t list_no, size_t offset): + il (il), codes (il->get_single_code (list_no, offset)), + list_no (list_no) + {} + + const uint8_t *get() {return codes; } + + ~ScopedCodes () { + il->release_codes (list_no, codes); + } + }; + + +}; + + +/// simple (default) implementation as an array of inverted lists +struct ArrayInvertedLists: InvertedLists { + std::vector < std::vector > codes; // binary codes, size nlist + std::vector < std::vector > ids; ///< Inverted lists for indexes + + ArrayInvertedLists (size_t nlist, size_t code_size); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + size_t add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) override; + + void update_entries (size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids, const uint8_t *code) override; + + void resize (size_t list_no, size_t new_size) override; + + InvertedLists* to_readonly() override; + + virtual ~ArrayInvertedLists (); +}; + +struct ReadOnlyArrayInvertedLists: InvertedLists { + PageLockMemoryPtr pin_readonly_codes; + PageLockMemoryPtr pin_readonly_ids; +// std::vector readonly_codes; +// std::vector readonly_ids; + std::vector readonly_length; + std::vector readonly_offset; + bool valid; + + ReadOnlyArrayInvertedLists(size_t nlist, size_t code_size, const std::vector& list_length); + explicit ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other); + + // Use default copy construct, just copy pointer, DON'T COPY pin_readonly_codes AND pin_readonly_ids +// explicit ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &); +// explicit ReadOnlyArrayInvertedLists(ReadOnlyArrayInvertedLists &&); + virtual ~ReadOnlyArrayInvertedLists(); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + const uint8_t * get_all_codes() const; + const idx_t * get_all_ids() const; + const std::vector& get_list_length() const; + + size_t add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) override; + + void update_entries (size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids, const uint8_t *code) override; + + void resize (size_t list_no, size_t new_size) override; + + bool is_readonly() const override; + + bool is_valid(); +}; +/***************************************************************** + * Meta-inverted lists + * + * About terminology: the inverted lists are seen as a sparse matrix, + * that can be stacked horizontally, vertically and sliced. 
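A short sketch of the scoped accessors documented above (the function name and output format are assumptions): visit every (id, code) pair of one inverted list, with release_ids / release_codes handled by the Scoped* destructors.

```cpp
#include <cstdio>

#include <faiss/InvertedLists.h>

void dump_list(const faiss::InvertedLists& il, size_t list_no) {
    faiss::InvertedLists::ScopedIds ids(&il, list_no);
    faiss::InvertedLists::ScopedCodes codes(&il, list_no);
    for (size_t i = 0; i < il.list_size(list_no); i++) {
        std::printf("id %lld, first code byte %u\n",
                    (long long)ids[i], (unsigned)codes.get()[i * il.code_size]);
    }
}   // release_ids / release_codes run here via the Scoped* destructors
```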
+ *****************************************************************/ + +struct ReadOnlyInvertedLists: InvertedLists { + + ReadOnlyInvertedLists (size_t nlist, size_t code_size): + InvertedLists (nlist, code_size) {} + + size_t add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) override; + + void update_entries (size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids, const uint8_t *code) override; + + void resize (size_t list_no, size_t new_size) override; + +}; + + +/// Horizontal stack of inverted lists +struct HStackInvertedLists: ReadOnlyInvertedLists { + + std::vectorils; + + /// build InvertedLists by concatenating nil of them + HStackInvertedLists (int nil, const InvertedLists **ils); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + void prefetch_lists (const idx_t *list_nos, int nlist) const override; + + void release_codes (size_t list_no, const uint8_t *codes) const override; + void release_ids (size_t list_no, const idx_t *ids) const override; + + idx_t get_single_id (size_t list_no, size_t offset) const override; + + const uint8_t * get_single_code ( + size_t list_no, size_t offset) const override; + +}; + +using ConcatenatedInvertedLists = HStackInvertedLists; + + +/// vertical slice of indexes in another InvertedLists +struct SliceInvertedLists: ReadOnlyInvertedLists { + const InvertedLists *il; + idx_t i0, i1; + + SliceInvertedLists(const InvertedLists *il, idx_t i0, idx_t i1); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + void release_codes (size_t list_no, const uint8_t *codes) const override; + void release_ids (size_t list_no, const idx_t *ids) const override; + + idx_t get_single_id (size_t list_no, size_t offset) const override; + + const uint8_t * get_single_code ( + size_t list_no, size_t offset) const override; + + void prefetch_lists (const idx_t *list_nos, int nlist) const override; +}; + + +struct VStackInvertedLists: ReadOnlyInvertedLists { + std::vectorils; + std::vector cumsz; + + /// build InvertedLists by concatenating nil of them + VStackInvertedLists (int nil, const InvertedLists **ils); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + void release_codes (size_t list_no, const uint8_t *codes) const override; + void release_ids (size_t list_no, const idx_t *ids) const override; + + idx_t get_single_id (size_t list_no, size_t offset) const override; + + const uint8_t * get_single_code ( + size_t list_no, size_t offset) const override; + + void prefetch_lists (const idx_t *list_nos, int nlist) const override; + +}; + + +/** use the first inverted lists if they are non-empty otherwise use the second + * + * This is useful if il1 has a few inverted lists that are too long, + * and that il0 has replacement lists for those, with empty lists for + * the others. 
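A tiny sketch of composing the meta-lists (assumed sizes; not from the patch): two ArrayInvertedLists with the same nlist and code_size are presented as a single horizontally stacked view, so each logical list concatenates the entries from both parts.

```cpp
#include <faiss/InvertedLists.h>

int main() {
    const size_t nlist = 16, code_size = 8;
    faiss::ArrayInvertedLists part0(nlist, code_size), part1(nlist, code_size);
    // ... fill part0 / part1 with add_entry() ...

    const faiss::InvertedLists* parts[] = { &part0, &part1 };
    faiss::HStackInvertedLists stacked(2, parts);                // one merged view
    return (int)stacked.list_size(0);    // sum of list 0 sizes from both parts
}
```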
*/ +struct MaskedInvertedLists: ReadOnlyInvertedLists { + + const InvertedLists *il0; + const InvertedLists *il1; + + MaskedInvertedLists (const InvertedLists *il0, + const InvertedLists *il1); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + void release_codes (size_t list_no, const uint8_t *codes) const override; + void release_ids (size_t list_no, const idx_t *ids) const override; + + idx_t get_single_id (size_t list_no, size_t offset) const override; + + const uint8_t * get_single_code ( + size_t list_no, size_t offset) const override; + + void prefetch_lists (const idx_t *list_nos, int nlist) const override; + +}; + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/LICENSE b/core/src/index/thirdparty/faiss/LICENSE new file mode 100644 index 0000000000..b96dcb0480 --- /dev/null +++ b/core/src/index/thirdparty/faiss/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/core/src/index/thirdparty/faiss/Makefile b/core/src/index/thirdparty/faiss/Makefile new file mode 100644 index 0000000000..8a6fb7611a --- /dev/null +++ b/core/src/index/thirdparty/faiss/Makefile @@ -0,0 +1,113 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include makefile.inc + +HEADERS = $(wildcard *.h impl/*.h utils/*.h) +SRC = $(wildcard *.cpp impl/*.cpp utils/*.cpp) +OBJ = $(SRC:.cpp=.o) +INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss + +GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/utils/*.h) +GPU_CPPSRC = $(wildcard gpu/*.cpp gpu/impl/*.cpp gpu/utils/*.cpp) +GPU_CUSRC = $(wildcard gpu/*.cu gpu/impl/*.cu gpu/utils/*.cu \ +gpu/utils/nvidia/*.cu gpu/utils/blockselect/*.cu gpu/utils/warpselect/*.cu) +GPU_SRC = $(GPU_CPPSRC) $(GPU_CUSRC) +GPU_CPPOBJ = $(GPU_CPPSRC:.cpp=.o) +GPU_CUOBJ = $(GPU_CUSRC:.cu=.o) +GPU_OBJ = $(GPU_CPPOBJ) $(GPU_CUOBJ) + +ifneq ($(strip $(NVCC)),) + OBJ += $(GPU_OBJ) + HEADERS += $(GPU_HEADERS) +endif + +CPPFLAGS += -I. +NVCCFLAGS += -I. 
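Returning briefly to InvertedLists.h before the build rules: MaskedInvertedLists, declared above, overlays a small set of replacement lists (il0) on a full set (il1). A sketch of the documented per-list fallback rule, with made-up sizes:

```cpp
#include <faiss/InvertedLists.h>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const size_t nlist = 8, code_size = 16;
    faiss::ArrayInvertedLists overrides(nlist, code_size);  // il0: mostly empty
    faiss::ArrayInvertedLists full(nlist, code_size);       // il1: fallback lists

    // Put one entry into the fallback set so list 3 is non-empty there.
    faiss::InvertedLists::idx_t id = 42;
    std::vector<uint8_t> code(code_size, 0);
    full.add_entries(3, 1, &id, code.data());

    faiss::MaskedInvertedLists masked(&overrides, &full);

    // Documented rule: a list is served from `overrides` when non-empty there,
    // otherwise from `full`.
    for (size_t j = 0; j < nlist; j++) {
        size_t expected = overrides.list_size(j) > 0 ? overrides.list_size(j)
                                                     : full.list_size(j);
        assert(masked.list_size(j) == expected);
    }
    return 0;
}
```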
+ +############################ +# Building + +all: libfaiss.a libfaiss.$(SHAREDEXT) + +libfaiss.a: $(OBJ) + $(AR) r $@ $^ + +libfaiss.$(SHAREDEXT): $(OBJ) + $(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ + +%.o: %.cu + $(NVCC) $(NVCCFLAGS) -c $< -o $@ + +clean: + rm -f libfaiss.a libfaiss.$(SHAREDEXT) + rm -f $(OBJ) + + +############################ +# Installing + +install: libfaiss.a libfaiss.$(SHAREDEXT) installdirs + cp libfaiss.a libfaiss.$(SHAREDEXT) $(DESTDIR)$(libdir) + tar cf - $(HEADERS) | tar xf - -C $(DESTDIR)$(includedir)/faiss/ + +installdirs: + $(MKDIR_P) $(INSTALLDIRS) + +uninstall: + rm -f $(DESTDIR)$(libdir)/libfaiss.a \ + $(DESTDIR)$(libdir)/libfaiss.$(SHAREDEXT) + rm -rf $(DESTDIR)$(includedir)/faiss + + +############################# +# Dependencies + +-include depend + +depend: $(SRC) $(GPU_SRC) + for i in $^; do \ + $(CXXCPP) $(CPPFLAGS) -DCUDA_VERSION=7050 -x c++ -MM $$i; \ + done > depend + + +############################# +# Python + +py: libfaiss.a + $(MAKE) -C python + + +############################# +# Tests + +test: libfaiss.a py + $(MAKE) -C tests run + PYTHONPATH=./python/build/`ls python/build | grep lib` \ + $(PYTHON) -m unittest discover tests/ -v + +test_gpu: libfaiss.a + $(MAKE) -C gpu/test run + PYTHONPATH=./python/build/`ls python/build | grep lib` \ + $(PYTHON) -m unittest discover gpu/test/ -v + +############################# +# Demos + +demos: libfaiss.a + $(MAKE) -C demos + + +############################# +# Misc + +misc/test_blas: misc/test_blas.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) + + +.PHONY: all clean demos install installdirs py test test_gpu uninstall diff --git a/core/src/index/thirdparty/faiss/MatrixStats.cpp b/core/src/index/thirdparty/faiss/MatrixStats.cpp new file mode 100644 index 0000000000..1862d1a52f --- /dev/null +++ b/core/src/index/thirdparty/faiss/MatrixStats.cpp @@ -0,0 +1,252 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + + +#include + + +#include /* va_list, va_start, va_arg, va_end */ + +#include +#include +#include + +namespace faiss { + +/********************************************************************* + * MatrixStats + *********************************************************************/ + +MatrixStats::PerDimStats::PerDimStats(): + n(0), n_nan(0), n_inf(0), n0(0), + min(HUGE_VALF), max(-HUGE_VALF), + sum(0), sum2(0), + mean(NAN), stddev(NAN) +{} + + +void MatrixStats::PerDimStats::add (float x) +{ + n++; + if (std::isnan(x)) { + n_nan++; + return; + } + if (!std::isfinite(x)) { + n_inf++; + return; + } + if (x == 0) n0++; + if (x < min) min = x; + if (x > max) max = x; + sum += x; + sum2 += (double)x * (double)x; +} + +void MatrixStats::PerDimStats::compute_mean_std () +{ + n_valid = n - n_nan - n_inf; + mean = sum / n_valid; + double var = sum2 / n_valid - mean * mean; + if (var < 0) var = 0; + stddev = sqrt(var); +} + + +void MatrixStats::do_comment (const char *fmt, ...) 
+{ + va_list ap; + + /* Determine required size */ + va_start(ap, fmt); + size_t size = vsnprintf(buf, nbuf, fmt, ap); + va_end(ap); + + nbuf -= size; + buf += size; +} + + + +MatrixStats::MatrixStats (size_t n, size_t d, const float *x): + n(n), d(d), + n_collision(0), n_valid(0), n0(0), + min_norm2(HUGE_VAL), max_norm2(0) +{ + std::vector comment_buf (10000); + buf = comment_buf.data (); + nbuf = comment_buf.size(); + + do_comment ("analyzing %ld vectors of size %ld\n", n, d); + + if (d > 1024) { + do_comment ( + "indexing this many dimensions is hard, " + "please consider dimensionality reducution (with PCAMatrix)\n"); + } + + size_t nbytes = sizeof (x[0]) * d; + per_dim_stats.resize (d); + + for (size_t i = 0; i < n; i++) { + const float *xi = x + d * i; + double sum2 = 0; + for (size_t j = 0; j < d; j++) { + per_dim_stats[j].add (xi[j]); + sum2 += xi[j] * (double)xi[j]; + } + + if (std::isfinite (sum2)) { + n_valid++; + if (sum2 == 0) { + n0 ++; + } else { + if (sum2 < min_norm2) min_norm2 = sum2; + if (sum2 > max_norm2) max_norm2 = sum2; + } + } + + { // check hash + uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes); + auto elt = occurrences.find (hash); + if (elt == occurrences.end()) { + Occurrence occ = {i, 1}; + occurrences[hash] = occ; + } else { + if (!memcmp (xi, x + elt->second.first * d, nbytes)) { + elt->second.count ++; + } else { + n_collision ++; + // we should use a list of collisions but overkill + } + } + } + } + + // invalid vecor stats + if (n_valid == n) { + do_comment ("no NaN or Infs in data\n"); + } else { + do_comment ("%ld vectors contain NaN or Inf " + "(or have too large components), " + "expect bad results with indexing!\n", n - n_valid); + } + + // copies in dataset + if (occurrences.size() == n) { + do_comment ("all vectors are distinct\n"); + } else { + do_comment ("%ld vectors are distinct (%.2f%%)\n", + occurrences.size(), + occurrences.size() * 100.0 / n); + + if (n_collision > 0) { + do_comment ("%ld collisions in hash table, " + "counts may be invalid\n", n_collision); + } + + Occurrence max = {0, 0}; + for (auto it = occurrences.begin(); + it != occurrences.end(); ++it) { + if (it->second.count > max.count) { + max = it->second; + } + } + do_comment ("vector %ld has %ld copies\n", max.first, max.count); + } + + { // norm stats + min_norm2 = sqrt (min_norm2); + max_norm2 = sqrt (max_norm2); + do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n", + min_norm2, max_norm2, n0); + + if (max_norm2 < min_norm2 * 1.0001) { + do_comment ("vectors are normalized, inner product and " + "L2 search are equivalent\n"); + } + + if (max_norm2 > min_norm2 * 100) { + do_comment ("vectors have very large differences in norms, " + "is this normal?\n"); + } + } + + { // per dimension stats + + double max_std = 0, min_std = HUGE_VAL; + + size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0; + + for (size_t j = 0; j < d; j++) { + PerDimStats &st = per_dim_stats[j]; + st.compute_mean_std (); + n0 += st.n0; + + if (st.max == st.min) { + n_0_range ++; + } else if (st.max < 1.001 * st.min) { + n_dangerous_range ++; + } + + if (st.stddev > max_std) max_std = st.stddev; + if (st.stddev < min_std) min_std = st.stddev; + } + + + + if (n0 == 0) { + do_comment ("matrix contains no 0s\n"); + } else { + do_comment ("matrix contains %.2f %% 0 entries\n", + n0 * 100.0 / (n * d)); + } + + if (n_0_range == 0) { + do_comment ("no constant dimensions\n"); + } else { + do_comment ("%ld dimensions are constant: they can be removed\n", + n_0_range); + } + + if (n_dangerous_range 
== 0) { + do_comment ("no dimension has a too large mean\n"); + } else { + do_comment ("%ld dimensions are too large " + "wrt. their variance, may lose precision " + "in IndexFlatL2 (use CenteringTransform)\n", + n_dangerous_range); + } + + do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std); + + size_t n_small_var = 0; + + for (size_t j = 0; j < d; j++) { + const PerDimStats &st = per_dim_stats[j]; + if (st.stddev < max_std * 1e-4) { + n_small_var++; + } + } + + if (n_small_var > 0) { + do_comment ("%ld dimensions have negligible stddev wrt. " + "the largest dimension, they could be ignored", + n_small_var); + } + + } + comments = comment_buf.data (); + buf = nullptr; + nbuf = 0; +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/MatrixStats.h b/core/src/index/thirdparty/faiss/MatrixStats.h new file mode 100644 index 0000000000..6418644c6e --- /dev/null +++ b/core/src/index/thirdparty/faiss/MatrixStats.h @@ -0,0 +1,62 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include <stdint.h> +#include <vector> +#include <string> +#include <unordered_map> + + +namespace faiss { + + +/** Reports some statistics on a dataset and comments on them. + * + * It is a class rather than a function so that all stats can also be + * accessed from code */ + +struct MatrixStats { + MatrixStats (size_t n, size_t d, const float *x); + std::string comments; + + // raw statistics + size_t n, d; + size_t n_collision, n_valid, n0; + double min_norm2, max_norm2; + + struct PerDimStats { + size_t n, n_nan, n_inf, n0; + + float min, max; + double sum, sum2; + + size_t n_valid; + double mean, stddev; + + PerDimStats(); + void add (float x); + void compute_mean_std (); + }; + + std::vector<PerDimStats> per_dim_stats; + struct Occurrence { + size_t first; + size_t count; + }; + std::unordered_map<uint64_t, Occurrence> occurrences; + + char *buf; + size_t nbuf; + void do_comment (const char *fmt, ...); + +}; + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/MetaIndexes.cpp b/core/src/index/thirdparty/faiss/MetaIndexes.cpp new file mode 100644 index 0000000000..c48b65d6ea --- /dev/null +++ b/core/src/index/thirdparty/faiss/MetaIndexes.cpp @@ -0,0 +1,351 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree.
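Before moving on to MetaIndexes.cpp, a short usage sketch for the MatrixStats class declared just above. Everything used here (the constructor, `comments`, and the norm fields) is taken from MatrixStats.h; the random data is only for illustration:

```cpp
#include <faiss/MatrixStats.h>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const size_t n = 1000, d = 64;
    std::vector<float> x(n * d);
    std::mt19937 rng(123);
    std::normal_distribution<float> dist;
    for (float& v : x) v = dist(rng);

    // The constructor does all the work; the human-readable report ends up in
    // `comments`, and the raw numbers stay accessible as fields.
    faiss::MatrixStats stats(n, d, x.data());
    std::printf("%s", stats.comments.c_str());
    std::printf("L2 norm range: [%g, %g]\n", stats.min_norm2, stats.max_norm2);
    return 0;
}
```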
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include +#include +#include + + +namespace faiss { + +namespace { + +typedef Index::idx_t idx_t; + +} // namespace + +/***************************************************** + * IndexIDMap implementation + *******************************************************/ + +template +IndexIDMapTemplate::IndexIDMapTemplate (IndexT *index): + index (index), + own_fields (false) +{ + FAISS_THROW_IF_NOT_MSG (index->ntotal == 0, "index must be empty on input"); + this->is_trained = index->is_trained; + this->metric_type = index->metric_type; + this->verbose = index->verbose; + this->d = index->d; +} + +template +void IndexIDMapTemplate::add + (idx_t, const typename IndexT::component_t *) +{ + FAISS_THROW_MSG ("add does not make sense with IndexIDMap, " + "use add_with_ids"); +} + + +template +void IndexIDMapTemplate::train + (idx_t n, const typename IndexT::component_t *x) +{ + index->train (n, x); + this->is_trained = index->is_trained; +} + +template +void IndexIDMapTemplate::reset () +{ + index->reset (); + id_map.clear(); + this->ntotal = 0; +} + + +template +void IndexIDMapTemplate::add_with_ids + (idx_t n, const typename IndexT::component_t * x, + const typename IndexT::idx_t *xids) +{ + index->add (n, x); + for (idx_t i = 0; i < n; i++) + id_map.push_back (xids[i]); + this->ntotal = index->ntotal; +} + + +template +void IndexIDMapTemplate::search + (idx_t n, const typename IndexT::component_t *x, idx_t k, + typename IndexT::distance_t *distances, typename IndexT::idx_t *labels) const +{ + index->search (n, x, k, distances, labels); + idx_t *li = labels; +#pragma omp parallel for + for (idx_t i = 0; i < n * k; i++) { + li[i] = li[i] < 0 ? li[i] : id_map[li[i]]; + } +} + + +template +void IndexIDMapTemplate::range_search + (typename IndexT::idx_t n, const typename IndexT::component_t *x, + typename IndexT::distance_t radius, RangeSearchResult *result) const +{ + index->range_search(n, x, radius, result); +#pragma omp parallel for + for (idx_t i = 0; i < result->lims[result->nq]; i++) { + result->labels[i] = result->labels[i] < 0 ? 
+ result->labels[i] : id_map[result->labels[i]]; + } +} + +namespace { + +struct IDTranslatedSelector: IDSelector { + const std::vector & id_map; + const IDSelector & sel; + IDTranslatedSelector (const std::vector & id_map, + const IDSelector & sel): + id_map (id_map), sel (sel) + {} + bool is_member(idx_t id) const override { + return sel.is_member(id_map[id]); + } +}; + +} + +template +size_t IndexIDMapTemplate::remove_ids (const IDSelector & sel) +{ + // remove in sub-index first + IDTranslatedSelector sel2 (id_map, sel); + size_t nremove = index->remove_ids (sel2); + + int64_t j = 0; + for (idx_t i = 0; i < this->ntotal; i++) { + if (sel.is_member (id_map[i])) { + // remove + } else { + id_map[j] = id_map[i]; + j++; + } + } + FAISS_ASSERT (j == index->ntotal); + this->ntotal = j; + id_map.resize(this->ntotal); + return nremove; +} + +template +IndexIDMapTemplate::~IndexIDMapTemplate () +{ + if (own_fields) delete index; +} + + + +/***************************************************** + * IndexIDMap2 implementation + *******************************************************/ + +template +IndexIDMap2Template::IndexIDMap2Template (IndexT *index): + IndexIDMapTemplate (index) +{} + +template +void IndexIDMap2Template::add_with_ids + (idx_t n, const typename IndexT::component_t* x, + const typename IndexT::idx_t* xids) +{ + size_t prev_ntotal = this->ntotal; + IndexIDMapTemplate::add_with_ids (n, x, xids); + for (size_t i = prev_ntotal; i < this->ntotal; i++) { + rev_map [this->id_map [i]] = i; + } +} + +template +void IndexIDMap2Template::construct_rev_map () +{ + rev_map.clear (); + for (size_t i = 0; i < this->ntotal; i++) { + rev_map [this->id_map [i]] = i; + } +} + + +template +size_t IndexIDMap2Template::remove_ids(const IDSelector& sel) +{ + // This is quite inefficient + size_t nremove = IndexIDMapTemplate::remove_ids (sel); + construct_rev_map (); + return nremove; +} + +template +void IndexIDMap2Template::reconstruct + (idx_t key, typename IndexT::component_t * recons) const +{ + try { + this->index->reconstruct (rev_map.at (key), recons); + } catch (const std::out_of_range& e) { + FAISS_THROW_FMT ("key %ld not found", key); + } +} + + +// explicit template instantiations + +template struct IndexIDMapTemplate; +template struct IndexIDMapTemplate; +template struct IndexIDMap2Template; +template struct IndexIDMap2Template; + + +/***************************************************** + * IndexSplitVectors implementation + *******************************************************/ + + +IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded): + Index (d), own_fields (false), + threaded (threaded), sum_d (0) +{ + +} + +void IndexSplitVectors::add_sub_index (Index *index) +{ + sub_indexes.push_back (index); + sync_with_sub_indexes (); +} + +void IndexSplitVectors::sync_with_sub_indexes () +{ + if (sub_indexes.empty()) return; + Index * index0 = sub_indexes[0]; + sum_d = index0->d; + metric_type = index0->metric_type; + is_trained = index0->is_trained; + ntotal = index0->ntotal; + for (int i = 1; i < sub_indexes.size(); i++) { + Index * index = sub_indexes[i]; + FAISS_THROW_IF_NOT (metric_type == index->metric_type); + FAISS_THROW_IF_NOT (ntotal == index->ntotal); + sum_d += index->d; + } + +} + +void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) { + FAISS_THROW_MSG("not implemented"); +} + + + +void IndexSplitVectors::search ( + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels) const +{ + FAISS_THROW_IF_NOT_MSG (k == 1, + "search implemented only for k=1"); 
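The IndexIDMap implementation above remaps the sequential labels returned by the wrapped index to caller-chosen ids. A hedged sketch of the typical wrapping pattern; IndexFlatL2 is assumed to be available from the same vendored FAISS tree (it is not shown in this part of the patch):

```cpp
#include <faiss/IndexFlat.h>
#include <faiss/MetaIndexes.h>
#include <vector>

int main() {
    const int d = 32;
    const faiss::Index::idx_t nb = 100, nq = 1, k = 5;

    faiss::IndexFlatL2 flat(d);
    faiss::IndexIDMap index(&flat);   // plain add() now throws; use add_with_ids()

    std::vector<float> xb(nb * d, 0.0f), xq(nq * d, 0.0f);
    std::vector<faiss::Index::idx_t> ids(nb);
    for (faiss::Index::idx_t i = 0; i < nb; i++) ids[i] = 1000 + i;  // external ids

    index.add_with_ids(nb, xb.data(), ids.data());

    std::vector<float> distances(nq * k);
    std::vector<faiss::Index::idx_t> labels(nq * k);
    index.search(nq, xq.data(), k, distances.data(), labels.data());
    // labels[] now holds values from ids[] (or -1), thanks to the id_map lookup
    // performed in IndexIDMapTemplate::search above.
    return 0;
}
```

IndexIDMap2, implemented next, additionally maintains rev_map so that reconstruct() can go from an external id back to the stored vector.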
+ FAISS_THROW_IF_NOT_MSG (sum_d == d, + "not enough indexes compared to # dimensions"); + + int64_t nshard = sub_indexes.size(); + float *all_distances = new float [nshard * k * n]; + idx_t *all_labels = new idx_t [nshard * k * n]; + ScopeDeleter del (all_distances); + ScopeDeleter del2 (all_labels); + + auto query_func = [n, x, k, distances, labels, all_distances, all_labels, this] + (int no) { + const IndexSplitVectors *index = this; + float *distances1 = no == 0 ? distances : all_distances + no * k * n; + idx_t *labels1 = no == 0 ? labels : all_labels + no * k * n; + if (index->verbose) + printf ("begin query shard %d on %ld points\n", no, n); + const Index * sub_index = index->sub_indexes[no]; + int64_t sub_d = sub_index->d, d = index->d; + idx_t ofs = 0; + for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d; + float *sub_x = new float [sub_d * n]; + ScopeDeleter del1 (sub_x); + for (idx_t i = 0; i < n; i++) + memcpy (sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof (sub_x)); + sub_index->search (n, sub_x, k, distances1, labels1); + if (index->verbose) + printf ("end query shard %d\n", no); + }; + + if (!threaded) { + for (int i = 0; i < nshard; i++) { + query_func(i); + } + } else { + std::vector > threads; + std::vector> v; + + for (int i = 0; i < nshard; i++) { + threads.emplace_back(new WorkerThread()); + WorkerThread *wt = threads.back().get(); + v.emplace_back(wt->add([i, query_func](){query_func(i); })); + } + + // Blocking wait for completion + for (auto& func : v) { + func.get(); + } + } + + int64_t factor = 1; + for (int i = 0; i < nshard; i++) { + if (i > 0) { // results of 0 are already in the table + const float *distances_i = all_distances + i * k * n; + const idx_t *labels_i = all_labels + i * k * n; + for (int64_t j = 0; j < n; j++) { + if (labels[j] >= 0 && labels_i[j] >= 0) { + labels[j] += labels_i[j] * factor; + distances[j] += distances_i[j]; + } else { + labels[j] = -1; + distances[j] = 0.0 / 0.0; + } + } + } + factor *= sub_indexes[i]->ntotal; + } + +} + +void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) { + FAISS_THROW_MSG("not implemented"); +} + +void IndexSplitVectors::reset () +{ + FAISS_THROW_MSG ("not implemented"); +} + + +IndexSplitVectors::~IndexSplitVectors () +{ + if (own_fields) { + for (int s = 0; s < sub_indexes.size(); s++) + delete sub_indexes [s]; + } +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/MetaIndexes.h b/core/src/index/thirdparty/faiss/MetaIndexes.h new file mode 100644 index 0000000000..aed4c96f2e --- /dev/null +++ b/core/src/index/thirdparty/faiss/MetaIndexes.h @@ -0,0 +1,126 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef META_INDEXES_H +#define META_INDEXES_H + +#include +#include +#include +#include +#include + +namespace faiss { + +/** Index that translates search results to ids */ +template +struct IndexIDMapTemplate : IndexT { + using idx_t = typename IndexT::idx_t; + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + IndexT * index; ///! the sub-index + bool own_fields; ///! 
whether pointers are deleted in destructo + std::vector id_map; + + explicit IndexIDMapTemplate (IndexT *index); + + /// @param xids if non-null, ids to store for the vectors (size n) + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override; + + /// this will fail. Use add_with_ids + void add(idx_t n, const component_t* x) override; + + void search( + idx_t n, const component_t* x, idx_t k, + distance_t* distances, + idx_t* labels) const override; + + void train(idx_t n, const component_t* x) override; + + void reset() override; + + /// remove ids adapted to IndexFlat + size_t remove_ids(const IDSelector& sel) override; + + void range_search (idx_t n, const component_t *x, distance_t radius, + RangeSearchResult *result) const override; + + ~IndexIDMapTemplate () override; + IndexIDMapTemplate () {own_fields=false; index=nullptr; } +}; + +using IndexIDMap = IndexIDMapTemplate; +using IndexBinaryIDMap = IndexIDMapTemplate; + + +/** same as IndexIDMap but also provides an efficient reconstruction + * implementation via a 2-way index */ +template +struct IndexIDMap2Template : IndexIDMapTemplate { + using idx_t = typename IndexT::idx_t; + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + std::unordered_map rev_map; + + explicit IndexIDMap2Template (IndexT *index); + + /// make the rev_map from scratch + void construct_rev_map (); + + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override; + + size_t remove_ids(const IDSelector& sel) override; + + void reconstruct (idx_t key, component_t * recons) const override; + + ~IndexIDMap2Template() override {} + IndexIDMap2Template () {} +}; + +using IndexIDMap2 = IndexIDMap2Template; +using IndexBinaryIDMap2 = IndexIDMap2Template; + + +/** splits input vectors in segments and assigns each segment to a sub-index + * used to distribute a MultiIndexQuantizer + */ +struct IndexSplitVectors: Index { + bool own_fields; + bool threaded; + std::vector sub_indexes; + idx_t sum_d; /// sum of dimensions seen so far + + explicit IndexSplitVectors (idx_t d, bool threaded = false); + + void add_sub_index (Index *); + void sync_with_sub_indexes (); + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override; + + void train(idx_t n, const float* x) override; + + void reset() override; + + ~IndexSplitVectors() override; +}; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/OnDiskInvertedLists.cpp b/core/src/index/thirdparty/faiss/OnDiskInvertedLists.cpp new file mode 100644 index 0000000000..2b798123d8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/OnDiskInvertedLists.cpp @@ -0,0 +1,674 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
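The file that begins here implements inverted lists backed by an mmapped file. Its header, added later in this patch, recommends building ordinary ArrayInvertedLists in memory and merging them in, since incremental add to the on-disk form is slow. A sketch of that workflow with made-up sizes and a made-up path:

```cpp
#include <faiss/InvertedLists.h>
#include <faiss/OnDiskInvertedLists.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t nlist = 1024, code_size = 32;

    // In-memory lists, e.g. produced by separate indexing workers.
    faiss::ArrayInvertedLists part0(nlist, code_size);
    faiss::ArrayInvertedLists part1(nlist, code_size);

    // Give part0 one entry so there is something to merge.
    faiss::InvertedLists::idx_t id = 7;
    std::vector<uint8_t> code(code_size, 0);
    part0.add_entries(5, 1, &id, code.data());

    // Empty on-disk container backed by an mmapped file.
    faiss::OnDiskInvertedLists ondisk(nlist, code_size, "/tmp/ivf_lists.data");

    // merge_from() lays the lists out compactly (size == capacity, no free slots).
    const faiss::InvertedLists* parts[] = { &part0, &part1 };
    size_t ntotal = ondisk.merge_from(parts, 2, /*verbose=*/true);
    std::printf("merged %zu entries\n", ntotal);
    return 0;
}
```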
+ */ + +// -*- c++ -*- + +#include + +#include + +#include + +#include +#include +#include + +#include +#include + + +namespace faiss { + + +/********************************************** + * LockLevels + **********************************************/ + + +struct LockLevels { + /* There n times lock1(n), one lock2 and one lock3 + * Invariants: + * a single thread can hold one lock1(n) for some n + * a single thread can hold lock2, if it holds lock1(n) for some n + * a single thread can hold lock3, if it holds lock1(n) for some n + * AND lock2 AND no other thread holds lock1(m) for m != n + */ + pthread_mutex_t mutex1; + pthread_cond_t level1_cv; + pthread_cond_t level2_cv; + pthread_cond_t level3_cv; + + std::unordered_set level1_holders; // which level1 locks are held + int n_level2; // nb threads that wait on level2 + bool level3_in_use; // a threads waits on level3 + bool level2_in_use; + + LockLevels() { + pthread_mutex_init(&mutex1, nullptr); + pthread_cond_init(&level1_cv, nullptr); + pthread_cond_init(&level2_cv, nullptr); + pthread_cond_init(&level3_cv, nullptr); + n_level2 = 0; + level2_in_use = false; + level3_in_use = false; + } + + ~LockLevels() { + pthread_cond_destroy(&level1_cv); + pthread_cond_destroy(&level2_cv); + pthread_cond_destroy(&level3_cv); + pthread_mutex_destroy(&mutex1); + } + + void lock_1(int no) { + pthread_mutex_lock(&mutex1); + while (level3_in_use || level1_holders.count(no) > 0) { + pthread_cond_wait(&level1_cv, &mutex1); + } + level1_holders.insert(no); + pthread_mutex_unlock(&mutex1); + } + + void unlock_1(int no) { + pthread_mutex_lock(&mutex1); + assert(level1_holders.count(no) == 1); + level1_holders.erase(no); + if (level3_in_use) { // a writer is waiting + pthread_cond_signal(&level3_cv); + } else { + pthread_cond_broadcast(&level1_cv); + } + pthread_mutex_unlock(&mutex1); + } + + void lock_2() { + pthread_mutex_lock(&mutex1); + n_level2 ++; + if (level3_in_use) { // tell waiting level3 that we are blocked + pthread_cond_signal(&level3_cv); + } + while (level2_in_use) { + pthread_cond_wait(&level2_cv, &mutex1); + } + level2_in_use = true; + pthread_mutex_unlock(&mutex1); + } + + void unlock_2() { + pthread_mutex_lock(&mutex1); + level2_in_use = false; + n_level2 --; + pthread_cond_signal(&level2_cv); + pthread_mutex_unlock(&mutex1); + } + + void lock_3() { + pthread_mutex_lock(&mutex1); + level3_in_use = true; + // wait until there are no level1 holders anymore except the + // ones that are waiting on level2 (we are holding lock2) + while (level1_holders.size() > n_level2) { + pthread_cond_wait(&level3_cv, &mutex1); + } + // don't release the lock! 
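+        // Note: lock_3 intentionally returns with mutex1 still held; the
+        // matching unlock_3() below is what releases it after waking the
+        // level-1 waiters.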
+ } + + void unlock_3() { + level3_in_use = false; + // wake up all level1_holders + pthread_cond_broadcast(&level1_cv); + pthread_mutex_unlock(&mutex1); + } + + void print () { + pthread_mutex_lock(&mutex1); + printf("State: level3_in_use=%d n_level2=%d level1_holders: [", level3_in_use, n_level2); + for (int k : level1_holders) { + printf("%d ", k); + } + printf("]\n"); + pthread_mutex_unlock(&mutex1); + } + +}; + +/********************************************** + * OngoingPrefetch + **********************************************/ + +struct OnDiskInvertedLists::OngoingPrefetch { + + struct Thread { + pthread_t pth; + OngoingPrefetch *pf; + + bool one_list () { + idx_t list_no = pf->get_next_list(); + if(list_no == -1) return false; + const OnDiskInvertedLists *od = pf->od; + od->locks->lock_1 (list_no); + size_t n = od->list_size (list_no); + const Index::idx_t *idx = od->get_ids (list_no); + const uint8_t *codes = od->get_codes (list_no); + int cs = 0; + for (size_t i = 0; i < n;i++) { + cs += idx[i]; + } + const idx_t *codes8 = (const idx_t*)codes; + idx_t n8 = n * od->code_size / 8; + + for (size_t i = 0; i < n8;i++) { + cs += codes8[i]; + } + od->locks->unlock_1(list_no); + + global_cs += cs & 1; + return true; + } + + }; + + std::vector threads; + + pthread_mutex_t list_ids_mutex; + std::vector list_ids; + int cur_list; + + // mutex for the list of tasks + pthread_mutex_t mutex; + + // pretext to avoid code below to be optimized out + static int global_cs; + + const OnDiskInvertedLists *od; + + explicit OngoingPrefetch (const OnDiskInvertedLists *od): od (od) + { + pthread_mutex_init (&mutex, nullptr); + pthread_mutex_init (&list_ids_mutex, nullptr); + cur_list = 0; + } + + static void* prefetch_list (void * arg) { + Thread *th = static_cast(arg); + + while (th->one_list()) ; + + return nullptr; + } + + idx_t get_next_list () { + idx_t list_no = -1; + pthread_mutex_lock (&list_ids_mutex); + if (cur_list >= 0 && cur_list < list_ids.size()) { + list_no = list_ids[cur_list++]; + } + pthread_mutex_unlock (&list_ids_mutex); + return list_no; + } + + void prefetch_lists (const idx_t *list_nos, int n) { + pthread_mutex_lock (&mutex); + pthread_mutex_lock (&list_ids_mutex); + list_ids.clear (); + pthread_mutex_unlock (&list_ids_mutex); + for (auto &th: threads) { + pthread_join (th.pth, nullptr); + } + + threads.resize (0); + cur_list = 0; + int nt = std::min (n, od->prefetch_nthread); + + if (nt > 0) { + // prepare tasks + for (int i = 0; i < n; i++) { + idx_t list_no = list_nos[i]; + if (list_no >= 0 && od->list_size(list_no) > 0) { + list_ids.push_back (list_no); + } + } + // prepare threads + threads.resize (nt); + for (Thread &th: threads) { + th.pf = this; + pthread_create (&th.pth, nullptr, prefetch_list, &th); + } + } + pthread_mutex_unlock (&mutex); + } + + ~OngoingPrefetch () { + pthread_mutex_lock (&mutex); + for (auto &th: threads) { + pthread_join (th.pth, nullptr); + } + pthread_mutex_unlock (&mutex); + pthread_mutex_destroy (&mutex); + pthread_mutex_destroy (&list_ids_mutex); + } + +}; + +int OnDiskInvertedLists::OngoingPrefetch::global_cs = 0; + + +void OnDiskInvertedLists::prefetch_lists (const idx_t *list_nos, int n) const +{ + pf->prefetch_lists (list_nos, n); +} + + + +/********************************************** + * OnDiskInvertedLists: mmapping + **********************************************/ + + +void OnDiskInvertedLists::do_mmap () +{ + const char *rw_flags = read_only ? "r" : "r+"; + int prot = read_only ? 
PROT_READ : PROT_WRITE | PROT_READ; + FILE *f = fopen (filename.c_str(), rw_flags); + FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode %s: %s", + filename.c_str(), rw_flags, strerror(errno)); + + uint8_t * ptro = (uint8_t*)mmap (nullptr, totsize, + prot, MAP_SHARED, fileno (f), 0); + + FAISS_THROW_IF_NOT_FMT (ptro != MAP_FAILED, + "could not mmap %s: %s", + filename.c_str(), + strerror(errno)); + ptr = ptro; + fclose (f); + +} + +void OnDiskInvertedLists::update_totsize (size_t new_size) +{ + + // unmap file + if (ptr != nullptr) { + int err = munmap (ptr, totsize); + FAISS_THROW_IF_NOT_FMT (err == 0, "munmap error: %s", + strerror(errno)); + } + if (totsize == 0) { + // must create file before truncating it + FILE *f = fopen (filename.c_str(), "w"); + FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode W: %s", + filename.c_str(), strerror(errno)); + fclose (f); + } + + if (new_size > totsize) { + if (!slots.empty() && + slots.back().offset + slots.back().capacity == totsize) { + slots.back().capacity += new_size - totsize; + } else { + slots.push_back (Slot(totsize, new_size - totsize)); + } + } else { + assert(!"not implemented"); + } + + totsize = new_size; + + // create file + printf ("resizing %s to %ld bytes\n", filename.c_str(), totsize); + + int err = truncate (filename.c_str(), totsize); + + FAISS_THROW_IF_NOT_FMT (err == 0, "truncate %s to %ld: %s", + filename.c_str(), totsize, + strerror(errno)); + do_mmap (); +} + + + + + + +/********************************************** + * OnDiskInvertedLists + **********************************************/ + +#define INVALID_OFFSET (size_t)(-1) + +OnDiskInvertedLists::List::List (): + size (0), capacity (0), offset (INVALID_OFFSET) +{} + +OnDiskInvertedLists::Slot::Slot (size_t offset, size_t capacity): + offset (offset), capacity (capacity) +{} + +OnDiskInvertedLists::Slot::Slot (): + offset (0), capacity (0) +{} + + + +OnDiskInvertedLists::OnDiskInvertedLists ( + size_t nlist, size_t code_size, + const char *filename): + InvertedLists (nlist, code_size), + filename (filename), + totsize (0), + ptr (nullptr), + read_only (false), + locks (new LockLevels ()), + pf (new OngoingPrefetch (this)), + prefetch_nthread (32) +{ + lists.resize (nlist); + + // slots starts empty +} + +OnDiskInvertedLists::OnDiskInvertedLists (): + OnDiskInvertedLists (0, 0, "") +{ +} + +OnDiskInvertedLists::~OnDiskInvertedLists () +{ + delete pf; + + // unmap all lists + if (ptr != nullptr) { + int err = munmap (ptr, totsize); + if (err != 0) { + fprintf(stderr, "mumap error: %s", + strerror(errno)); + } + } + delete locks; +} + + + + +size_t OnDiskInvertedLists::list_size(size_t list_no) const +{ + return lists[list_no].size; +} + + +const uint8_t * OnDiskInvertedLists::get_codes (size_t list_no) const +{ + if (lists[list_no].offset == INVALID_OFFSET) { + return nullptr; + } + + return ptr + lists[list_no].offset; +} + +const Index::idx_t * OnDiskInvertedLists::get_ids (size_t list_no) const +{ + if (lists[list_no].offset == INVALID_OFFSET) { + return nullptr; + } + + return (const idx_t*)(ptr + lists[list_no].offset + + code_size * lists[list_no].capacity); +} + + +void OnDiskInvertedLists::update_entries ( + size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids_in, const uint8_t *codes_in) +{ + FAISS_THROW_IF_NOT (!read_only); + if (n_entry == 0) return; + const List & l = lists[list_no]; + assert (n_entry + offset <= l.size); + idx_t *ids = const_cast(get_ids (list_no)); + memcpy (ids + offset, ids_in, sizeof(ids_in[0]) * n_entry); + uint8_t 
*codes = const_cast(get_codes (list_no)); + memcpy (codes + offset * code_size, codes_in, code_size * n_entry); +} + +size_t OnDiskInvertedLists::add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) +{ + FAISS_THROW_IF_NOT (!read_only); + locks->lock_1 (list_no); + size_t o = list_size (list_no); + resize_locked (list_no, n_entry + o); + update_entries (list_no, o, n_entry, ids, code); + locks->unlock_1 (list_no); + return o; +} + +void OnDiskInvertedLists::resize (size_t list_no, size_t new_size) +{ + FAISS_THROW_IF_NOT (!read_only); + locks->lock_1 (list_no); + resize_locked (list_no, new_size); + locks->unlock_1 (list_no); +} + + + +void OnDiskInvertedLists::resize_locked (size_t list_no, size_t new_size) +{ + List & l = lists[list_no]; + + if (new_size <= l.capacity && + new_size > l.capacity / 2) { + l.size = new_size; + return; + } + + // otherwise we release the current slot, and find a new one + + locks->lock_2 (); + free_slot (l.offset, l.capacity); + + List new_l; + + if (new_size == 0) { + new_l = List(); + } else { + new_l.size = new_size; + new_l.capacity = 1; + while (new_l.capacity < new_size) { + new_l.capacity *= 2; + } + new_l.offset = allocate_slot ( + new_l.capacity * (sizeof(idx_t) + code_size)); + } + + // copy common data + if (l.offset != new_l.offset) { + size_t n = std::min (new_size, l.size); + if (n > 0) { + memcpy (ptr + new_l.offset, get_codes(list_no), n * code_size); + memcpy (ptr + new_l.offset + new_l.capacity * code_size, + get_ids (list_no), n * sizeof(idx_t)); + } + } + + lists[list_no] = new_l; + locks->unlock_2 (); +} + +size_t OnDiskInvertedLists::allocate_slot (size_t capacity) { + // should hold lock2 + + auto it = slots.begin(); + while (it != slots.end() && it->capacity < capacity) { + it++; + } + + if (it == slots.end()) { + // not enough capacity + size_t new_size = totsize == 0 ? 
32 : totsize * 2; + while (new_size - totsize < capacity) + new_size *= 2; + locks->lock_3 (); + update_totsize(new_size); + locks->unlock_3 (); + it = slots.begin(); + while (it != slots.end() && it->capacity < capacity) { + it++; + } + assert (it != slots.end()); + } + + size_t o = it->offset; + if (it->capacity == capacity) { + slots.erase (it); + } else { + // take from beginning of slot + it->capacity -= capacity; + it->offset += capacity; + } + + return o; +} + + + +void OnDiskInvertedLists::free_slot (size_t offset, size_t capacity) { + + // should hold lock2 + if (capacity == 0) return; + + auto it = slots.begin(); + while (it != slots.end() && it->offset <= offset) { + it++; + } + + size_t inf = 1UL << 60; + + size_t end_prev = inf; + if (it != slots.begin()) { + auto prev = it; + prev--; + end_prev = prev->offset + prev->capacity; + } + + size_t begin_next = 1L << 60; + if (it != slots.end()) { + begin_next = it->offset; + } + + assert (end_prev == inf || offset >= end_prev); + assert (offset + capacity <= begin_next); + + if (offset == end_prev) { + auto prev = it; + prev--; + if (offset + capacity == begin_next) { + prev->capacity += capacity + it->capacity; + slots.erase (it); + } else { + prev->capacity += capacity; + } + } else { + if (offset + capacity == begin_next) { + it->offset -= capacity; + it->capacity += capacity; + } else { + slots.insert (it, Slot (offset, capacity)); + } + } + + // TODO shrink global storage if needed +} + + +/***************************************** + * Compact form + *****************************************/ + +size_t OnDiskInvertedLists::merge_from (const InvertedLists **ils, int n_il, + bool verbose) +{ + FAISS_THROW_IF_NOT_MSG (totsize == 0, "works only on an empty InvertedLists"); + + std::vector sizes (nlist); + for (int i = 0; i < n_il; i++) { + const InvertedLists *il = ils[i]; + FAISS_THROW_IF_NOT (il->nlist == nlist && il->code_size == code_size); + + for (size_t j = 0; j < nlist; j++) { + sizes [j] += il->list_size(j); + } + } + + size_t cums = 0; + size_t ntotal = 0; + for (size_t j = 0; j < nlist; j++) { + ntotal += sizes[j]; + lists[j].size = 0; + lists[j].capacity = sizes[j]; + lists[j].offset = cums; + cums += lists[j].capacity * (sizeof(idx_t) + code_size); + } + + update_totsize (cums); + + + size_t nmerged = 0; + double t0 = getmillisecs(), last_t = t0; + +#pragma omp parallel for + for (size_t j = 0; j < nlist; j++) { + List & l = lists[j]; + for (int i = 0; i < n_il; i++) { + const InvertedLists *il = ils[i]; + size_t n_entry = il->list_size(j); + l.size += n_entry; + update_entries (j, l.size - n_entry, n_entry, + ScopedIds(il, j).get(), + ScopedCodes(il, j).get()); + } + assert (l.size == l.capacity); + if (verbose) { +#pragma omp critical + { + nmerged++; + double t1 = getmillisecs(); + if (t1 - last_t > 500) { + printf("merged %ld lists in %.3f s\r", + nmerged, (t1 - t0) / 1000.0); + fflush(stdout); + last_t = t1; + } + } + } + } + if(verbose) { + printf("\n"); + } + + return ntotal; +} + + +void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) +{ + FAISS_THROW_IF_NOT(0 <= l0 && l0 <= l1 && l1 <= nlist); + + std::vector new_lists (l1 - l0); + memcpy (new_lists.data(), &lists[l0], (l1 - l0) * sizeof(List)); + + lists.swap(new_lists); + + nlist = l1 - l0; +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/OnDiskInvertedLists.h b/core/src/index/thirdparty/faiss/OnDiskInvertedLists.h new file mode 100644 index 0000000000..3476b48ca9 --- /dev/null +++ 
b/core/src/index/thirdparty/faiss/OnDiskInvertedLists.h @@ -0,0 +1,127 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_ON_DISK_INVERTED_LISTS_H +#define FAISS_ON_DISK_INVERTED_LISTS_H + +#include +#include + +#include + +namespace faiss { + + +struct LockLevels; + +/** On-disk storage of inverted lists. + * + * The data is stored in a mmapped chunk of memory (base ptointer ptr, + * size totsize). Each list is a range of memory that contains (object + * List) that contains: + * + * - uint8_t codes[capacity * code_size] + * - followed by idx_t ids[capacity] + * + * in each of the arrays, the size <= capacity first elements are + * used, the rest is not initialized. + * + * Addition and resize are supported by: + * - roundind up the capacity of the lists to a power of two + * - maintaining a list of empty slots, sorted by size. + * - resizing the mmapped block is adjusted as needed. + * + * An OnDiskInvertedLists is compact if the size == capacity for all + * lists and there are no available slots. + * + * Addition to the invlists is slow. For incremental add it is better + * to use a default ArrayInvertedLists object and convert it to an + * OnDisk with merge_from. + * + * When it is known that a set of lists will be accessed, it is useful + * to call prefetch_lists, that launches a set of threads to read the + * lists in parallel. + */ +struct OnDiskInvertedLists: InvertedLists { + + struct List { + size_t size; // size of inverted list (entries) + size_t capacity; // allocated size (entries) + size_t offset; // offset in buffer (bytes) + List (); + }; + + // size nlist + std::vector lists; + + struct Slot { + size_t offset; // bytes + size_t capacity; // bytes + Slot (size_t offset, size_t capacity); + Slot (); + }; + + // size whatever space remains + std::list slots; + + std::string filename; + size_t totsize; + uint8_t *ptr; // mmap base pointer + bool read_only; /// are inverted lists mapped read-only + + OnDiskInvertedLists (size_t nlist, size_t code_size, + const char *filename); + + size_t list_size(size_t list_no) const override; + const uint8_t * get_codes (size_t list_no) const override; + const idx_t * get_ids (size_t list_no) const override; + + size_t add_entries ( + size_t list_no, size_t n_entry, + const idx_t* ids, const uint8_t *code) override; + + void update_entries (size_t list_no, size_t offset, size_t n_entry, + const idx_t *ids, const uint8_t *code) override; + + void resize (size_t list_no, size_t new_size) override; + + // copy all inverted lists into *this, in compact form (without + // allocating slots) + size_t merge_from (const InvertedLists **ils, int n_il, bool verbose=false); + + /// restrict the inverted lists to l0:l1 without touching the mmapped region + void crop_invlists(size_t l0, size_t l1); + + void prefetch_lists (const idx_t *list_nos, int nlist) const override; + + virtual ~OnDiskInvertedLists (); + + // private + + LockLevels * locks; + + // encapsulates the threads that are busy prefeteching + struct OngoingPrefetch; + OngoingPrefetch *pf; + int prefetch_nthread; + + void do_mmap (); + void update_totsize (size_t new_totsize); + void resize_locked (size_t list_no, size_t new_size); + size_t allocate_slot (size_t capacity); + void free_slot (size_t offset, size_t capacity); + + // empty constructor for the I/O functions + OnDiskInvertedLists (); +}; + + +} // 
namespace faiss + +#endif diff --git a/core/src/index/thirdparty/faiss/README.md b/core/src/index/thirdparty/faiss/README.md new file mode 100644 index 0000000000..039005aa28 --- /dev/null +++ b/core/src/index/thirdparty/faiss/README.md @@ -0,0 +1,87 @@ +# Faiss + +Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed by [Facebook AI Research](https://research.fb.com/category/facebook-ai-research-fair/). + +## NEWS + +*NEW: version 1.5.3 (2019-06-24) fix performance regression in IndexIVF.* + +*NEW: version 1.5.2 (2019-05-27) the license was relaxed to MIT from BSD+Patents. Read LICENSE for details.* + +*NEW: version 1.5.0 (2018-12-19) GPU binary flat index and binary HNSW index* + +*NEW: version 1.4.0 (2018-08-30) no more crashes in pure Python code* + +*NEW: version 1.3.0 (2018-07-12) support for binary indexes* + +*NEW: latest commit (2018-02-22) supports on-disk storage of inverted indexes, see demos/demo_ondisk_ivf.py* + +*NEW: latest commit (2018-01-09) includes an implementation of the HNSW indexing method, see benchs/bench_hnsw.py* + +*NEW: there is now a Facebook public discussion group for Faiss users at https://www.facebook.com/groups/faissusers/* + +*NEW: on 2017-07-30, the license on Faiss was relaxed to BSD from CC-BY-NC. Read LICENSE for details.* + +## Introduction + +Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors. + +Most of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require to keep the original vectors. This generally comes at the cost of a less precise search but these methods can scale to billions of vectors in main memory on a single server. + +The GPU implementation can accept input from either CPU or GPU memory. On a server with GPUs, the GPU indexes can be used a drop-in replacement for the CPU indexes (e.g., replace `IndexFlatL2` with `GpuIndexFlatL2`) and copies to/from GPU memory are handled automatically. Results will be faster however if both input and output remain resident on the GPU. Both single and multi-GPU usage is supported. + +## Building + +The library is mostly implemented in C++, with optional GPU support provided via CUDA, and an optional Python interface. The CPU version requires a BLAS library. It compiles with a Makefile and can be packaged in a docker image. See [INSTALL.md](INSTALL.md) for details. + +## How Faiss works + +Faiss is built around an index type that stores a set of vectors, and provides a function to search in them with L2 and/or dot product vector comparison. Some index types are simple baselines, such as exact search. 
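For orientation, the exact-search baseline just mentioned looks like this in C++ (a minimal sketch in the spirit of the upstream getting-started example; header paths and data are assumptions):

```cpp
#include <faiss/IndexFlat.h>
#include <vector>

int main() {
    const int d = 64;                       // vector dimension
    const faiss::Index::idx_t nb = 10000;   // database size
    const faiss::Index::idx_t k = 4;        // neighbours to return

    std::vector<float> xb(nb * d, 0.0f);    // database vectors (fill with real data)
    faiss::IndexFlatL2 index(d);            // exact L2 search, no training needed
    index.add(nb, xb.data());

    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());  // query = first vector
    return 0;
}
```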
Most of the available indexing structures correspond to various trade-offs with respect to + +- search time +- search quality +- memory used per index vector +- training time +- need for external data for unsupervised training + +The optional GPU implementation provides what is likely (as of March 2017) the fastest exact and approximate (compressed-domain) nearest neighbor search implementation for high-dimensional vectors, fastest Lloyd's k-means, and fastest small k-selection algorithm known. [The implementation is detailed here](https://arxiv.org/abs/1702.08734). + +## Full documentation of Faiss + +The following are entry points for documentation: + +- the full documentation, including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki) +- the [doxygen documentation](http://rawgithub.com/facebookresearch/faiss/master/docs/html/annotated.html) gives per-class information +- to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the [benchmarks README](benchs/README.md). For [ +Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code) + +## Authors + +The main authors of Faiss are: +- [Hervé Jégou](https://github.com/jegou) initiated the Faiss project and wrote its first implementation +- [Matthijs Douze](https://github.com/mdouze) implemented most of the CPU Faiss +- [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss +- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes + +## Reference + +Reference to cite when you use Faiss in a research paper: + +``` +@article{JDH17, + title={Billion-scale similarity search with GPUs}, + author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}}, + journal={arXiv preprint arXiv:1702.08734}, + year={2017} +} +``` + +## Join the Faiss community + +For public discussion of Faiss or for questions, there is a Facebook public discussion group at https://www.facebook.com/groups/faissusers/ + +We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc. + +## License + +Faiss is MIT-licensed. diff --git a/core/src/index/thirdparty/faiss/VectorTransform.cpp b/core/src/index/thirdparty/faiss/VectorTransform.cpp new file mode 100644 index 0000000000..7e339cd939 --- /dev/null +++ b/core/src/index/thirdparty/faiss/VectorTransform.cpp @@ -0,0 +1,1157 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace faiss; + + +extern "C" { + +// this is to keep the clang syntax checker happy +#ifndef FINTEGER +#define FINTEGER int +#endif + + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ ( + const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, + FINTEGER *ldb, float *beta, + float *c, FINTEGER *ldc); + +int dgemm_ ( + const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const double *alpha, const double *a, + FINTEGER *lda, const double *b, + FINTEGER *ldb, double *beta, + double *c, FINTEGER *ldc); + +int ssyrk_ ( + const char *uplo, const char *trans, FINTEGER *n, FINTEGER *k, + float *alpha, float *a, FINTEGER *lda, + float *beta, float *c, FINTEGER *ldc); + +/* Lapack functions from http://www.netlib.org/clapack/old/single/ */ + +int ssyev_ ( + const char *jobz, const char *uplo, FINTEGER *n, float *a, + FINTEGER *lda, float *w, float *work, FINTEGER *lwork, + FINTEGER *info); + +int dsyev_ ( + const char *jobz, const char *uplo, FINTEGER *n, double *a, + FINTEGER *lda, double *w, double *work, FINTEGER *lwork, + FINTEGER *info); + +int sgesvd_( + const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n, + float *a, FINTEGER *lda, float *s, float *u, FINTEGER *ldu, float *vt, + FINTEGER *ldvt, float *work, FINTEGER *lwork, FINTEGER *info); + + +int dgesvd_( + const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n, + double *a, FINTEGER *lda, double *s, double *u, FINTEGER *ldu, double *vt, + FINTEGER *ldvt, double *work, FINTEGER *lwork, FINTEGER *info); + +} + +/********************************************* + * VectorTransform + *********************************************/ + + + +float * VectorTransform::apply (Index::idx_t n, const float * x) const +{ + float * xt = new float[n * d_out]; + apply_noalloc (n, x, xt); + return xt; +} + + +void VectorTransform::train (idx_t, const float *) { + // does nothing by default +} + + +void VectorTransform::reverse_transform ( + idx_t , const float *, + float *) const +{ + FAISS_THROW_MSG ("reverse transform not implemented"); +} + + + + +/********************************************* + * LinearTransform + *********************************************/ + +/// both d_in > d_out and d_out < d_in are supported +LinearTransform::LinearTransform (int d_in, int d_out, + bool have_bias): + VectorTransform (d_in, d_out), have_bias (have_bias), + is_orthonormal (false), verbose (false) +{ + is_trained = false; // will be trained when A and b are initialized +} + +void LinearTransform::apply_noalloc (Index::idx_t n, const float * x, + float * xt) const +{ + FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); + + float c_factor; + if (have_bias) { + FAISS_THROW_IF_NOT_MSG (b.size() == d_out, "Bias not initialized"); + float * xi = xt; + for (int i = 0; i < n; i++) + for(int j = 0; j < d_out; j++) + *xi++ = b[j]; + c_factor = 1.0; + } else { + c_factor = 0.0; + } + + FAISS_THROW_IF_NOT_MSG (A.size() == d_out * d_in, + "Transformation matrix not initialized"); + + float one = 1; + FINTEGER nbiti = d_out, ni = n, di = d_in; + sgemm_ ("Transposed", "Not transposed", + &nbiti, &ni, &di, + &one, A.data(), &di, x, &di, &c_factor, xt, &nbiti); + +} + + +void LinearTransform::transform_transpose (idx_t n, const float * y, + float *x) 
const +{ + if (have_bias) { // allocate buffer to store bias-corrected data + float *y_new = new float [n * d_out]; + const float *yr = y; + float *yw = y_new; + for (idx_t i = 0; i < n; i++) { + for (int j = 0; j < d_out; j++) { + *yw++ = *yr++ - b [j]; + } + } + y = y_new; + } + + { + FINTEGER dii = d_in, doi = d_out, ni = n; + float one = 1.0, zero = 0.0; + sgemm_ ("Not", "Not", &dii, &ni, &doi, + &one, A.data (), &dii, y, &doi, &zero, x, &dii); + } + + if (have_bias) delete [] y; +} + +void LinearTransform::set_is_orthonormal () +{ + if (d_out > d_in) { + // not clear what we should do in this case + is_orthonormal = false; + return; + } + if (d_out == 0) { // borderline case, unnormalized matrix + is_orthonormal = true; + return; + } + + double eps = 4e-5; + FAISS_ASSERT(A.size() >= d_out * d_in); + { + std::vector ATA(d_out * d_out); + FINTEGER dii = d_in, doi = d_out; + float one = 1.0, zero = 0.0; + + sgemm_ ("Transposed", "Not", &doi, &doi, &dii, + &one, A.data (), &dii, + A.data(), &dii, + &zero, ATA.data(), &doi); + + is_orthonormal = true; + for (long i = 0; i < d_out; i++) { + for (long j = 0; j < d_out; j++) { + float v = ATA[i + j * d_out]; + if (i == j) v-= 1; + if (fabs(v) > eps) { + is_orthonormal = false; + } + } + } + } + +} + + +void LinearTransform::reverse_transform (idx_t n, const float * xt, + float *x) const +{ + if (is_orthonormal) { + transform_transpose (n, xt, x); + } else { + FAISS_THROW_MSG ("reverse transform not implemented for non-orthonormal matrices"); + } +} + + +void LinearTransform::print_if_verbose ( + const char*name, const std::vector &mat, + int n, int d) const +{ + if (!verbose) return; + printf("matrix %s: %d*%d [\n", name, n, d); + FAISS_THROW_IF_NOT (mat.size() >= n * d); + for (int i = 0; i < n; i++) { + for (int j = 0; j < d; j++) { + printf("%10.5g ", mat[i * d + j]); + } + printf("\n"); + } + printf("]\n"); +} + +/********************************************* + * RandomRotationMatrix + *********************************************/ + +void RandomRotationMatrix::init (int seed) +{ + + if(d_out <= d_in) { + A.resize (d_out * d_in); + float *q = A.data(); + float_randn(q, d_out * d_in, seed); + matrix_qr(d_in, d_out, q); + } else { + // use tight-frame transformation + A.resize (d_out * d_out); + float *q = A.data(); + float_randn(q, d_out * d_out, seed); + matrix_qr(d_out, d_out, q); + // remove columns + int i, j; + for (i = 0; i < d_out; i++) { + for(j = 0; j < d_in; j++) { + q[i * d_in + j] = q[i * d_out + j]; + } + } + A.resize(d_in * d_out); + } + is_orthonormal = true; + is_trained = true; +} + +void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/) +{ + // initialize with some arbitrary seed + init (12345); +} + + +/********************************************* + * PCAMatrix + *********************************************/ + +PCAMatrix::PCAMatrix (int d_in, int d_out, + float eigen_power, bool random_rotation): + LinearTransform(d_in, d_out, true), + eigen_power(eigen_power), random_rotation(random_rotation) +{ + is_trained = false; + max_points_per_d = 1000; + balanced_bins = 0; +} + + +namespace { + +/// Compute the eigenvalue decomposition of symmetric matrix cov, +/// dimensions d_in-by-d_in. Output eigenvectors in cov. 
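Before the eig() helper defined below, a round-trip sketch of the LinearTransform machinery above: RandomRotationMatrix marks itself orthonormal, so reverse_transform() can undo apply_noalloc() via the transpose path. The RandomRotationMatrix constructor signature is assumed from VectorTransform.h, which is not shown in this excerpt:

```cpp
#include <faiss/VectorTransform.h>
#include <vector>

int main() {
    const int d = 16;
    const faiss::Index::idx_t n = 4;

    faiss::RandomRotationMatrix rot(d, d);
    rot.init(1234);  // fills A with a random orthonormal matrix, sets is_trained

    std::vector<float> x(n * d, 1.0f);
    std::vector<float> xt(n * d), x2(n * d);

    rot.apply_noalloc(n, x.data(), xt.data());       // xt = A * x
    rot.reverse_transform(n, xt.data(), x2.data());  // x2 ~= x since A is orthonormal
    return 0;
}
```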
+ +void eig(size_t d_in, double *cov, double *eigenvalues, int verbose) +{ + { // compute eigenvalues and vectors + FINTEGER info = 0, lwork = -1, di = d_in; + double workq; + + dsyev_ ("Vectors as well", "Upper", + &di, cov, &di, eigenvalues, &workq, &lwork, &info); + lwork = FINTEGER(workq); + double *work = new double[lwork]; + + dsyev_ ("Vectors as well", "Upper", + &di, cov, &di, eigenvalues, work, &lwork, &info); + + delete [] work; + + if (info != 0) { + fprintf (stderr, "WARN ssyev info returns %d, " + "a very bad PCA matrix is learnt\n", + int(info)); + // do not throw exception, as the matrix could still be useful + } + + + if(verbose && d_in <= 10) { + printf("info=%ld new eigvals=[", long(info)); + for(int j = 0; j < d_in; j++) printf("%g ", eigenvalues[j]); + printf("]\n"); + + double *ci = cov; + printf("eigenvecs=\n"); + for(int i = 0; i < d_in; i++) { + for(int j = 0; j < d_in; j++) + printf("%10.4g ", *ci++); + printf("\n"); + } + } + + } + + // revert order of eigenvectors & values + + for(int i = 0; i < d_in / 2; i++) { + + std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]); + double *v1 = cov + i * d_in; + double *v2 = cov + (d_in - 1 - i) * d_in; + for(int j = 0; j < d_in; j++) + std::swap(v1[j], v2[j]); + } + +} + + +} + +void PCAMatrix::train (Index::idx_t n, const float *x) +{ + const float * x_in = x; + + x = fvecs_maybe_subsample (d_in, (size_t*)&n, + max_points_per_d * d_in, x, verbose); + + ScopeDeleter del_x (x != x_in ? x : nullptr); + + // compute mean + mean.clear(); mean.resize(d_in, 0.0); + if (have_bias) { // we may want to skip the bias + const float *xi = x; + for (int i = 0; i < n; i++) { + for(int j = 0; j < d_in; j++) + mean[j] += *xi++; + } + for(int j = 0; j < d_in; j++) + mean[j] /= n; + } + if(verbose) { + printf("mean=["); + for(int j = 0; j < d_in; j++) printf("%g ", mean[j]); + printf("]\n"); + } + + if(n >= d_in) { + // compute covariance matrix, store it in PCA matrix + PCAMat.resize(d_in * d_in); + float * cov = PCAMat.data(); + { // initialize with mean * mean^T term + float *ci = cov; + for(int i = 0; i < d_in; i++) { + for(int j = 0; j < d_in; j++) + *ci++ = - n * mean[i] * mean[j]; + } + } + { + FINTEGER di = d_in, ni = n; + float one = 1.0; + ssyrk_ ("Up", "Non transposed", + &di, &ni, &one, (float*)x, &di, &one, cov, &di); + + } + if(verbose && d_in <= 10) { + float *ci = cov; + printf("cov=\n"); + for(int i = 0; i < d_in; i++) { + for(int j = 0; j < d_in; j++) + printf("%10g ", *ci++); + printf("\n"); + } + } + + std::vector covd (d_in * d_in); + for (size_t i = 0; i < d_in * d_in; i++) covd [i] = cov [i]; + + std::vector eigenvaluesd (d_in); + + eig (d_in, covd.data (), eigenvaluesd.data (), verbose); + + for (size_t i = 0; i < d_in * d_in; i++) PCAMat [i] = covd [i]; + eigenvalues.resize (d_in); + + for (size_t i = 0; i < d_in; i++) + eigenvalues [i] = eigenvaluesd [i]; + + + } else { + + std::vector xc (n * d_in); + + for (size_t i = 0; i < n; i++) + for(size_t j = 0; j < d_in; j++) + xc [i * d_in + j] = x [i * d_in + j] - mean[j]; + + // compute Gram matrix + std::vector gram (n * n); + { + FINTEGER di = d_in, ni = n; + float one = 1.0, zero = 0.0; + ssyrk_ ("Up", "Transposed", + &ni, &di, &one, xc.data(), &di, &zero, gram.data(), &ni); + } + + if(verbose && d_in <= 10) { + float *ci = gram.data(); + printf("gram=\n"); + for(int i = 0; i < n; i++) { + for(int j = 0; j < n; j++) + printf("%10g ", *ci++); + printf("\n"); + } + } + + std::vector gramd (n * n); + for (size_t i = 0; i < n * n; i++) + gramd [i] = gram [i]; + + 
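Stepping out of the training internals for a moment, this is how the PCAMatrix being trained here is used end to end (a sketch with made-up sizes and random data; a non-zero eigen_power would additionally rescale the components, see prepare_Ab() below):

```cpp
#include <faiss/VectorTransform.h>
#include <random>
#include <vector>

int main() {
    const int d_in = 256, d_out = 64;
    const faiss::Index::idx_t n = 5000;

    std::vector<float> x(n * d_in);
    std::mt19937 rng(42);
    std::normal_distribution<float> dist;
    for (float& v : x) v = dist(rng);

    faiss::PCAMatrix pca(d_in, d_out, /*eigen_power=*/0.0f, /*random_rotation=*/false);
    pca.train(n, x.data());              // estimates mean, eigenvectors, then A and b

    float* xt = pca.apply(n, x.data());  // n * d_out projected vectors, caller owns buffer
    delete[] xt;
    return 0;
}
```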
std::vector eigenvaluesd (n); + + // eig will fill in only the n first eigenvals + + eig (n, gramd.data (), eigenvaluesd.data (), verbose); + + PCAMat.resize(d_in * n); + + for (size_t i = 0; i < n * n; i++) + gram [i] = gramd [i]; + + eigenvalues.resize (d_in); + // fill in only the n first ones + for (size_t i = 0; i < n; i++) + eigenvalues [i] = eigenvaluesd [i]; + + { // compute PCAMat = x' * v + FINTEGER di = d_in, ni = n; + float one = 1.0; + + sgemm_ ("Non", "Non Trans", + &di, &ni, &ni, + &one, xc.data(), &di, gram.data(), &ni, + &one, PCAMat.data(), &di); + } + + if(verbose && d_in <= 10) { + float *ci = PCAMat.data(); + printf("PCAMat=\n"); + for(int i = 0; i < n; i++) { + for(int j = 0; j < d_in; j++) + printf("%10g ", *ci++); + printf("\n"); + } + } + fvec_renorm_L2 (d_in, n, PCAMat.data()); + + } + + prepare_Ab(); + is_trained = true; +} + +void PCAMatrix::copy_from (const PCAMatrix & other) +{ + FAISS_THROW_IF_NOT (other.is_trained); + mean = other.mean; + eigenvalues = other.eigenvalues; + PCAMat = other.PCAMat; + prepare_Ab (); + is_trained = true; +} + +void PCAMatrix::prepare_Ab () +{ + FAISS_THROW_IF_NOT_FMT ( + d_out * d_in <= PCAMat.size(), + "PCA matrix cannot output %d dimensions from %d ", + d_out, d_in); + + if (!random_rotation) { + A = PCAMat; + A.resize(d_out * d_in); // strip off useless dimensions + + // first scale the components + if (eigen_power != 0) { + float *ai = A.data(); + for (int i = 0; i < d_out; i++) { + float factor = pow(eigenvalues[i], eigen_power); + for(int j = 0; j < d_in; j++) + *ai++ *= factor; + } + } + + if (balanced_bins != 0) { + FAISS_THROW_IF_NOT (d_out % balanced_bins == 0); + int dsub = d_out / balanced_bins; + std::vector Ain; + std::swap(A, Ain); + A.resize(d_out * d_in); + + std::vector accu(balanced_bins); + std::vector counter(balanced_bins); + + // greedy assignment + for (int i = 0; i < d_out; i++) { + // find best bin + int best_j = -1; + float min_w = 1e30; + for (int j = 0; j < balanced_bins; j++) { + if (counter[j] < dsub && accu[j] < min_w) { + min_w = accu[j]; + best_j = j; + } + } + int row_dst = best_j * dsub + counter[best_j]; + accu[best_j] += eigenvalues[i]; + counter[best_j] ++; + memcpy (&A[row_dst * d_in], &Ain[i * d_in], + d_in * sizeof (A[0])); + } + + if (verbose) { + printf(" bin accu=["); + for (int i = 0; i < balanced_bins; i++) + printf("%g ", accu[i]); + printf("]\n"); + } + } + + + } else { + FAISS_THROW_IF_NOT_MSG (balanced_bins == 0, + "both balancing bins and applying a random rotation " + "does not make sense"); + RandomRotationMatrix rr(d_out, d_out); + + rr.init(5); + + // apply scaling on the rotation matrix (right multiplication) + if (eigen_power != 0) { + for (int i = 0; i < d_out; i++) { + float factor = pow(eigenvalues[i], eigen_power); + for(int j = 0; j < d_out; j++) + rr.A[j * d_out + i] *= factor; + } + } + + A.resize(d_in * d_out); + { + FINTEGER dii = d_in, doo = d_out; + float one = 1.0, zero = 0.0; + + sgemm_ ("Not", "Not", &dii, &doo, &doo, + &one, PCAMat.data(), &dii, rr.A.data(), &doo, &zero, + A.data(), &dii); + + } + + } + + b.clear(); b.resize(d_out); + + for (int i = 0; i < d_out; i++) { + float accu = 0; + for (int j = 0; j < d_in; j++) + accu -= mean[j] * A[j + i * d_in]; + b[i] = accu; + } + + is_orthonormal = eigen_power == 0; + +} + +/********************************************* + * ITQMatrix + *********************************************/ + +ITQMatrix::ITQMatrix (int d): + LinearTransform(d, d, false), + max_iter (50), + seed (123) +{ +} + + +/** translated from 
fbcode/deeplearning/catalyzer/catalyzer/quantizers.py */ +void ITQMatrix::train (Index::idx_t n, const float* xf) +{ + size_t d = d_in; + std::vector rotation (d * d); + + if (init_rotation.size() == d * d) { + memcpy (rotation.data(), init_rotation.data(), + d * d * sizeof(rotation[0])); + } else { + RandomRotationMatrix rrot (d, d); + rrot.init (seed); + for (size_t i = 0; i < d * d; i++) { + rotation[i] = rrot.A[i]; + } + } + + std::vector x (n * d); + + for (size_t i = 0; i < n * d; i++) { + x[i] = xf[i]; + } + + std::vector rotated_x (n * d), cov_mat (d * d); + std::vector u (d * d), vt (d * d), singvals (d); + + for (int i = 0; i < max_iter; i++) { + print_if_verbose ("rotation", rotation, d, d); + { // rotated_data = np.dot(training_data, rotation) + FINTEGER di = d, ni = n; + double one = 1, zero = 0; + dgemm_ ("N", "N", &di, &ni, &di, + &one, rotation.data(), &di, x.data(), &di, + &zero, rotated_x.data(), &di); + } + print_if_verbose ("rotated_x", rotated_x, n, d); + // binarize + for (size_t j = 0; j < n * d; j++) { + rotated_x[j] = rotated_x[j] < 0 ? -1 : 1; + } + // covariance matrix + { // rotated_data = np.dot(training_data, rotation) + FINTEGER di = d, ni = n; + double one = 1, zero = 0; + dgemm_ ("N", "T", &di, &di, &ni, + &one, rotated_x.data(), &di, x.data(), &di, + &zero, cov_mat.data(), &di); + } + print_if_verbose ("cov_mat", cov_mat, d, d); + // SVD + { + + FINTEGER di = d; + FINTEGER lwork = -1, info; + double lwork1; + + // workspace query + dgesvd_ ("A", "A", &di, &di, cov_mat.data(), &di, + singvals.data(), u.data(), &di, + vt.data(), &di, + &lwork1, &lwork, &info); + + FAISS_THROW_IF_NOT (info == 0); + lwork = size_t (lwork1); + std::vector work (lwork); + dgesvd_ ("A", "A", &di, &di, cov_mat.data(), &di, + singvals.data(), u.data(), &di, + vt.data(), &di, + work.data(), &lwork, &info); + FAISS_THROW_IF_NOT_FMT (info == 0, "sgesvd returned info=%d", info); + + } + print_if_verbose ("u", u, d, d); + print_if_verbose ("vt", vt, d, d); + // update rotation + { + FINTEGER di = d; + double one = 1, zero = 0; + dgemm_ ("N", "T", &di, &di, &di, + &one, u.data(), &di, vt.data(), &di, + &zero, rotation.data(), &di); + } + print_if_verbose ("final rot", rotation, d, d); + + } + A.resize (d * d); + for (size_t i = 0; i < d; i++) { + for (size_t j = 0; j < d; j++) { + A[i + d * j] = rotation[j + d * i]; + } + } + is_trained = true; + +} + +ITQTransform::ITQTransform (int d_in, int d_out, bool do_pca): + VectorTransform (d_in, d_out), + do_pca (do_pca), + itq (d_out), + pca_then_itq (d_in, d_out, false) +{ + if (!do_pca) { + FAISS_THROW_IF_NOT (d_in == d_out); + } + max_train_per_dim = 10; + is_trained = false; +} + + + + +void ITQTransform::train (idx_t n, const float *x) +{ + FAISS_THROW_IF_NOT (!is_trained); + + const float * x_in = x; + size_t max_train_points = std::max(d_in * max_train_per_dim, 32768); + x = fvecs_maybe_subsample (d_in, (size_t*)&n, max_train_points, x); + + ScopeDeleter del_x (x != x_in ? 
x : nullptr); + + std::unique_ptr x_norm(new float[n * d_in]); + { // normalize + int d = d_in; + + mean.resize (d, 0); + for (idx_t i = 0; i < n; i++) { + for (idx_t j = 0; j < d; j++) { + mean[j] += x[i * d + j]; + } + } + for (idx_t j = 0; j < d; j++) { + mean[j] /= n; + } + for (idx_t i = 0; i < n; i++) { + for (idx_t j = 0; j < d; j++) { + x_norm[i * d + j] = x[i * d + j] - mean[j]; + } + } + fvec_renorm_L2 (d_in, n, x_norm.get()); + } + + // train PCA + + PCAMatrix pca (d_in, d_out); + float *x_pca; + std::unique_ptr x_pca_del; + if (do_pca) { + pca.have_bias = false; // for consistency with reference implem + pca.train (n, x_norm.get()); + x_pca = pca.apply (n, x_norm.get()); + x_pca_del.reset(x_pca); + } else { + x_pca = x_norm.get(); + } + + // train ITQ + itq.train (n, x_pca); + + // merge PCA and ITQ + if (do_pca) { + FINTEGER di = d_out, dini = d_in; + float one = 1, zero = 0; + pca_then_itq.A.resize(d_in * d_out); + sgemm_ ("N", "N", &dini, &di, &di, + &one, pca.A.data(), &dini, + itq.A.data(), &di, + &zero, pca_then_itq.A.data(), &dini); + } else { + pca_then_itq.A = itq.A; + } + pca_then_itq.is_trained = true; + is_trained = true; +} + +void ITQTransform::apply_noalloc (Index::idx_t n, const float * x, + float * xt) const +{ + FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); + + std::unique_ptr x_norm(new float[n * d_in]); + { // normalize + int d = d_in; + for (idx_t i = 0; i < n; i++) { + for (idx_t j = 0; j < d; j++) { + x_norm[i * d + j] = x[i * d + j] - mean[j]; + } + } + // this is not really useful if we are going to binarize right + // afterwards but OK + fvec_renorm_L2 (d_in, n, x_norm.get()); + } + + pca_then_itq.apply_noalloc (n, x_norm.get(), xt); +} + +/********************************************* + * OPQMatrix + *********************************************/ + + +OPQMatrix::OPQMatrix (int d, int M, int d2): + LinearTransform (d, d2 == -1 ? d : d2, false), M(M), + niter (50), + niter_pq (4), niter_pq_0 (40), + verbose(false), + pq(nullptr) +{ + is_trained = false; + // OPQ is quite expensive to train, so set this right. + max_train_points = 256 * 256; + pq = nullptr; +} + + + +void OPQMatrix::train (Index::idx_t n, const float *x) +{ + + const float * x_in = x; + + x = fvecs_maybe_subsample (d_in, (size_t*)&n, + max_train_points, x, verbose); + + ScopeDeleter del_x (x != x_in ? x : nullptr); + + // To support d_out > d_in, we pad input vectors with 0s to d_out + size_t d = d_out <= d_in ? d_in : d_out; + size_t d2 = d_out; + +#if 0 + // what this test shows: the only way of getting bit-exact + // reproducible results with sgeqrf and sgesvd seems to be forcing + // single-threading. 
+ { // test repro + std::vector r (d * d); + float * rotation = r.data(); + float_randn (rotation, d * d, 1234); + printf("CS0: %016lx\n", + ivec_checksum (128*128, (int*)rotation)); + matrix_qr (d, d, rotation); + printf("CS1: %016lx\n", + ivec_checksum (128*128, (int*)rotation)); + return; + } +#endif + + if (verbose) { + printf ("OPQMatrix::train: training an OPQ rotation matrix " + "for M=%d from %ld vectors in %dD -> %dD\n", + M, n, d_in, d_out); + } + + std::vector xtrain (n * d); + // center x + { + std::vector sum (d); + const float *xi = x; + for (size_t i = 0; i < n; i++) { + for (int j = 0; j < d_in; j++) + sum [j] += *xi++; + } + for (int i = 0; i < d; i++) sum[i] /= n; + float *yi = xtrain.data(); + xi = x; + for (size_t i = 0; i < n; i++) { + for (int j = 0; j < d_in; j++) + *yi++ = *xi++ - sum[j]; + yi += d - d_in; + } + } + float *rotation; + + if (A.size () == 0) { + A.resize (d * d); + rotation = A.data(); + if (verbose) + printf(" OPQMatrix::train: making random %ld*%ld rotation\n", + d, d); + float_randn (rotation, d * d, 1234); + matrix_qr (d, d, rotation); + // we use only the d * d2 upper part of the matrix + A.resize (d * d2); + } else { + FAISS_THROW_IF_NOT (A.size() == d * d2); + rotation = A.data(); + } + + std::vector + xproj (d2 * n), pq_recons (d2 * n), xxr (d * n), + tmp(d * d * 4); + + + ProductQuantizer pq_default (d2, M, 8); + ProductQuantizer &pq_regular = pq ? *pq : pq_default; + std::vector codes (pq_regular.code_size * n); + + double t0 = getmillisecs(); + for (int iter = 0; iter < niter; iter++) { + + { // torch.mm(xtrain, rotation:t()) + FINTEGER di = d, d2i = d2, ni = n; + float zero = 0, one = 1; + sgemm_ ("Transposed", "Not transposed", + &d2i, &ni, &di, + &one, rotation, &di, + xtrain.data(), &di, + &zero, xproj.data(), &d2i); + } + + pq_regular.cp.max_points_per_centroid = 1000; + pq_regular.cp.niter = iter == 0 ? 
niter_pq_0 : niter_pq; + pq_regular.verbose = verbose; + pq_regular.train (n, xproj.data()); + + if (verbose) { + printf(" encode / decode\n"); + } + if (pq_regular.assign_index) { + pq_regular.compute_codes_with_assign_index + (xproj.data(), codes.data(), n); + } else { + pq_regular.compute_codes (xproj.data(), codes.data(), n); + } + pq_regular.decode (codes.data(), pq_recons.data(), n); + + float pq_err = fvec_L2sqr (pq_recons.data(), xproj.data(), n * d2) / n; + + if (verbose) + printf (" Iteration %d (%d PQ iterations):" + "%.3f s, obj=%g\n", iter, pq_regular.cp.niter, + (getmillisecs () - t0) / 1000.0, pq_err); + + { + float *u = tmp.data(), *vt = &tmp [d * d]; + float *sing_val = &tmp [2 * d * d]; + FINTEGER di = d, d2i = d2, ni = n; + float one = 1, zero = 0; + + if (verbose) { + printf(" X * recons\n"); + } + // torch.mm(xtrain:t(), pq_recons) + sgemm_ ("Not", "Transposed", + &d2i, &di, &ni, + &one, pq_recons.data(), &d2i, + xtrain.data(), &di, + &zero, xxr.data(), &d2i); + + + FINTEGER lwork = -1, info = -1; + float worksz; + // workspace query + sgesvd_ ("All", "All", + &d2i, &di, xxr.data(), &d2i, + sing_val, + vt, &d2i, u, &di, + &worksz, &lwork, &info); + + lwork = int(worksz); + std::vector work (lwork); + // u and vt swapped + sgesvd_ ("All", "All", + &d2i, &di, xxr.data(), &d2i, + sing_val, + vt, &d2i, u, &di, + work.data(), &lwork, &info); + + sgemm_ ("Transposed", "Transposed", + &di, &d2i, &d2i, + &one, u, &di, vt, &d2i, + &zero, rotation, &di); + + } + pq_regular.train_type = ProductQuantizer::Train_hot_start; + } + + // revert A matrix + if (d > d_in) { + for (long i = 0; i < d_out; i++) + memmove (&A[i * d_in], &A[i * d], sizeof(A[0]) * d_in); + A.resize (d_in * d_out); + } + + is_trained = true; + is_orthonormal = true; +} + + +/********************************************* + * NormalizationTransform + *********************************************/ + +NormalizationTransform::NormalizationTransform (int d, float norm): + VectorTransform (d, d), norm (norm) +{ +} + +NormalizationTransform::NormalizationTransform (): + VectorTransform (-1, -1), norm (-1) +{ +} + +void NormalizationTransform::apply_noalloc + (idx_t n, const float* x, float* xt) const +{ + if (norm == 2.0) { + memcpy (xt, x, sizeof (x[0]) * n * d_in); + fvec_renorm_L2 (d_in, n, xt); + } else { + FAISS_THROW_MSG ("not implemented"); + } +} + +void NormalizationTransform::reverse_transform (idx_t n, const float* xt, + float* x) const +{ + memcpy (x, xt, sizeof (xt[0]) * n * d_in); +} + +/********************************************* + * CenteringTransform + *********************************************/ + +CenteringTransform::CenteringTransform (int d): + VectorTransform (d, d) +{ + is_trained = false; +} + +void CenteringTransform::train(Index::idx_t n, const float *x) { + FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector"); + mean.resize (d_in, 0); + for (idx_t i = 0; i < n; i++) { + for (size_t j = 0; j < d_in; j++) { + mean[j] += *x++; + } + } + + for (size_t j = 0; j < d_in; j++) { + mean[j] /= n; + } + is_trained = true; +} + + +void CenteringTransform::apply_noalloc + (idx_t n, const float* x, float* xt) const +{ + FAISS_THROW_IF_NOT (is_trained); + + for (idx_t i = 0; i < n; i++) { + for (size_t j = 0; j < d_in; j++) { + *xt++ = *x++ - mean[j]; + } + } +} + +void CenteringTransform::reverse_transform (idx_t n, const float* xt, + float* x) const +{ + FAISS_THROW_IF_NOT (is_trained); + + for (idx_t i = 0; i < n; i++) { + for (size_t j = 0; j < d_in; j++) { + *x++ = *xt++ + mean[j]; + } 
+    }
+
+}
+
+
+
+
+
+/*********************************************
+ * RemapDimensionsTransform
+ *********************************************/
+
+
+RemapDimensionsTransform::RemapDimensionsTransform (
+        int d_in, int d_out, const int *map_in):
+    VectorTransform (d_in, d_out)
+{
+    map.resize (d_out);
+    for (int i = 0; i < d_out; i++) {
+        map[i] = map_in[i];
+        FAISS_THROW_IF_NOT (map[i] == -1 || (map[i] >= 0 && map[i] < d_in));
+    }
+}
+
+RemapDimensionsTransform::RemapDimensionsTransform (
+        int d_in, int d_out, bool uniform): VectorTransform (d_in, d_out)
+{
+    map.resize (d_out, -1);
+
+    if (uniform) {
+        if (d_in < d_out) {
+            for (int i = 0; i < d_in; i++) {
+                map [i * d_out / d_in] = i;
+            }
+        } else {
+            for (int i = 0; i < d_out; i++) {
+                map [i] = i * d_in / d_out;
+            }
+        }
+    } else {
+        for (int i = 0; i < d_in && i < d_out; i++)
+            map [i] = i;
+    }
+}
+
+
+void RemapDimensionsTransform::apply_noalloc (idx_t n, const float * x,
+                                              float *xt) const
+{
+    for (idx_t i = 0; i < n; i++) {
+        for (int j = 0; j < d_out; j++) {
+            xt[j] = map[j] < 0 ? 0 : x[map[j]];
+        }
+        x += d_in;
+        xt += d_out;
+    }
+}
+
+void RemapDimensionsTransform::reverse_transform (idx_t n, const float * xt,
+                                                  float *x) const
+{
+    memset (x, 0, sizeof (*x) * n * d_in);
+    for (idx_t i = 0; i < n; i++) {
+        for (int j = 0; j < d_out; j++) {
+            if (map[j] >= 0) x[map[j]] = xt[j];
+        }
+        x += d_in;
+        xt += d_out;
+    }
+}
diff --git a/core/src/index/thirdparty/faiss/VectorTransform.h b/core/src/index/thirdparty/faiss/VectorTransform.h
new file mode 100644
index 0000000000..4b55245b07
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/VectorTransform.h
@@ -0,0 +1,322 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_VECTOR_TRANSFORM_H
+#define FAISS_VECTOR_TRANSFORM_H
+
+/** Defines a few objects that apply transformations to a set of
+ * vectors. Often these are pre-processing steps.
+ */
+
+#include <vector>
+#include <stdint.h>
+
+#include <faiss/Index.h>
+
+
+namespace faiss {
+
+
+/** Any transformation applied on a set of vectors */
+struct VectorTransform {
+
+    typedef Index::idx_t idx_t;
+
+    int d_in;      ///! input dimension
+    int d_out;     ///! output dimension
+
+    explicit VectorTransform (int d_in = 0, int d_out = 0):
+        d_in(d_in), d_out(d_out), is_trained(true)
+    {}
+
+
+    /// set if the VectorTransform does not require training, or if
+    /// training is done already
+    bool is_trained;
+
+
+    /** Perform training on a representative set of vectors. Does
+     * nothing by default.
+     *
+     * @param n      nb of training vectors
+     * @param x      training vectors, size n * d
+     */
+    virtual void train (idx_t n, const float *x);
+
+    /** Apply the transform and return the result in a newly allocated matrix
+     * @param x size n * d_in
+     * @return size n * d_out
+     */
+    float *apply (idx_t n, const float * x) const;
+
+    /// same as apply, but result is pre-allocated
+    virtual void apply_noalloc (idx_t n, const float * x,
+                                float *xt) const = 0;
+
+    /// reverse transformation. May not be implemented or may return
+    /// approximate result
+    virtual void reverse_transform (idx_t n, const float * xt,
+                                    float *x) const;
+
+    virtual ~VectorTransform () {}
+
+};
+
+
+
+/** Generic linear transformation, with bias term applied on output
+ * y = A * x + b
+ */
+struct LinearTransform: VectorTransform {
+
+    bool have_bias; ///! whether to use the bias term
+
+    /// check if matrix A is orthonormal (enables reverse_transform)
+    bool is_orthonormal;
+
+    /// Transformation matrix, size d_out * d_in
+    std::vector<float> A;
+
+    /// bias vector, size d_out
+    std::vector<float> b;
+
+    /// both d_in > d_out and d_in < d_out are supported
+    explicit LinearTransform (int d_in = 0, int d_out = 0,
+                              bool have_bias = false);
+
+    /// same as apply, but result is pre-allocated
+    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+
+    /// compute x = A^T * (x - b)
+    /// is reverse transform if A has orthonormal lines
+    void transform_transpose (idx_t n, const float * y,
+                              float *x) const;
+
+    /// works only if is_orthonormal
+    void reverse_transform (idx_t n, const float * xt,
+                            float *x) const override;
+
+    /// compute A^T * A to set the is_orthonormal flag
+    void set_is_orthonormal ();
+
+    bool verbose;
+    void print_if_verbose (const char*name, const std::vector<float> &mat,
+                           int n, int d) const;
+
+    ~LinearTransform() override {}
+};
+
+
+
+/// Randomly rotate a set of vectors
+struct RandomRotationMatrix: LinearTransform {
+
+    /// both d_in > d_out and d_in < d_out are supported
+    RandomRotationMatrix (int d_in, int d_out):
+        LinearTransform(d_in, d_out, false) {}
+
+    /// must be called before the transform is used
+    void init(int seed);
+
+    // initializes with an arbitrary seed
+    void train(idx_t n, const float* x) override;
+
+    RandomRotationMatrix () {}
+};
+
+
+/** Applies a principal component analysis on a set of vectors,
+ * with optional whitening and random rotation. */
+struct PCAMatrix: LinearTransform {
+
+    /** after transformation the components are multiplied by
+     * eigenvalues^eigen_power
+     *
+     * =0: no whitening
+     * =-0.5: full whitening
+     */
+    float eigen_power;
+
+    /// random rotation after PCA
+    bool random_rotation;
+
+    /// ratio between # training vectors and dimension
+    size_t max_points_per_d;
+
+    /// try to distribute output eigenvectors in this many bins
+    int balanced_bins;
+
+    /// Mean, size d_in
+    std::vector<float> mean;
+
+    /// eigenvalues of covariance matrix (= squared singular values)
+    std::vector<float> eigenvalues;
+
+    /// PCA matrix, size d_in * d_in
+    std::vector<float> PCAMat;
+
+    // the final matrix is computed after random rotation and/or whitening
+    explicit PCAMatrix (int d_in = 0, int d_out = 0,
+                        float eigen_power = 0, bool random_rotation = false);
+
+    /// train on n vectors. If n < d_in then the eigenvector matrix
+    /// will be completed with 0s
+    void train(idx_t n, const float* x) override;
+
+    /// copy pre-trained PCA matrix
+    void copy_from (const PCAMatrix & other);
+
+    /// called after mean, PCAMat and eigenvalues are computed
+    void prepare_Ab();
+
+};
+
+
+/** ITQ implementation from
+ *
+ *     Iterative quantization: A procrustean approach to learning binary codes
+ *     for large-scale image retrieval,
+ *
+ *     Yunchao Gong, Svetlana Lazebnik, Albert Gordo, Florent Perronnin,
+ *     PAMI'12.
+ */
+
+struct ITQMatrix: LinearTransform {
+
+    int max_iter;
+    int seed;
+
+    // force initialization of the rotation (for debugging)
+    std::vector<double> init_rotation;
+
+    explicit ITQMatrix (int d = 0);
+
+    void train (idx_t n, const float* x) override;
+};
+
+
+
+/** The full ITQ transform, including normalizations and PCA transformation
+ */
+struct ITQTransform: VectorTransform {
+
+    std::vector<float> mean;
+    bool do_pca;
+    ITQMatrix itq;
+
+    /// max training points per dimension
+    int max_train_per_dim;
+
+    // concatenation of PCA + ITQ transformation
+    LinearTransform pca_then_itq;
+
+    explicit ITQTransform (int d_in = 0, int d_out = 0, bool do_pca = false);
+
+    void train (idx_t n, const float *x) override;
+
+    void apply_noalloc (idx_t n, const float* x, float* xt) const override;
+
+};
+
+
+struct ProductQuantizer;
+
+/** Applies a rotation to align the dimensions with a PQ to minimize
+ *  the reconstruction error. Can be used before an IndexPQ or an
+ *  IndexIVFPQ. The method is the non-parametric version described in:
+ *
+ * "Optimized Product Quantization for Approximate Nearest Neighbor Search"
+ * Tiezheng Ge, Kaiming He, Qifa Ke, Jian Sun, CVPR'13
+ *
+ */
+struct OPQMatrix: LinearTransform {
+
+    int M;          ///< nb of subquantizers
+    int niter;      ///< Number of outer training iterations
+    int niter_pq;   ///< Number of training iterations for the PQ
+    int niter_pq_0; ///< same, for the first outer iteration
+
+    /// if there are too many training points, resample
+    size_t max_train_points;
+    bool verbose;
+
+    /// if non-NULL, use this product quantizer for training
+    /// should be constructed with (d_out, M, _)
+    ProductQuantizer * pq;
+
+    /// if d2 != -1, output vectors of this dimension
+    explicit OPQMatrix (int d = 0, int M = 1, int d2 = -1);
+
+    void train(idx_t n, const float* x) override;
+};
+
+
+/** remap dimensions for input vectors, possibly inserting 0s
+ * strictly speaking this is also a linear transform but we don't want
+ * to compute it with matrix multiplies */
+struct RemapDimensionsTransform: VectorTransform {
+
+    /// map from output dimension to input, size d_out
+    /// -1 -> set output to 0
+    std::vector<int> map;
+
+    RemapDimensionsTransform (int d_in, int d_out, const int *map);
+
+    /// remap input to output, skipping or inserting dimensions as needed
+    /// if uniform: distribute dimensions uniformly
+    /// otherwise just take the first d_out dimensions
+    RemapDimensionsTransform (int d_in, int d_out, bool uniform = true);
+
+    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+
+    /// reverse transform correct only when the mapping is a permutation
+    void reverse_transform(idx_t n, const float* xt, float* x) const override;
+
+    RemapDimensionsTransform () {}
+};
+
+
+/** per-vector normalization */
+struct NormalizationTransform: VectorTransform {
+    float norm;
+
+    explicit NormalizationTransform (int d, float norm = 2.0);
+    NormalizationTransform ();
+
+    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+
+    /// Identity transform since the norm is not invertible
+    void reverse_transform(idx_t n, const float* xt, float* x) const override;
+};
+
+/** Subtract the mean of each component from the vectors. */
+struct CenteringTransform: VectorTransform {
+
+    /// Mean, size d_in = d_out
+    std::vector<float> mean;
+
+    explicit CenteringTransform (int d = 0);
+
+    /// train on n vectors.
+ void train(idx_t n, const float* x) override; + + /// subtract the mean + void apply_noalloc(idx_t n, const float* x, float* xt) const override; + + /// add the mean + void reverse_transform (idx_t n, const float * xt, + float *x) const override; + +}; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/acinclude/ax_blas.m4 b/core/src/index/thirdparty/faiss/acinclude/ax_blas.m4 new file mode 100644 index 0000000000..ada1b17fee --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/ax_blas.m4 @@ -0,0 +1,234 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_blas.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_BLAS([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) +# +# DESCRIPTION +# +# This macro looks for a library that implements the BLAS linear-algebra +# interface (see http://www.netlib.org/blas/). On success, it sets the +# BLAS_LIBS output variable to hold the requisite library linkages. +# +# To link with BLAS, you should link with: +# +# $BLAS_LIBS $LIBS $FLIBS +# +# in that order. FLIBS is the output variable of the +# AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is +# sometimes necessary in order to link with F77 libraries. Users will also +# need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same +# reason. +# +# Many libraries are searched for, from ATLAS to CXML to ESSL. The user +# may also use --with-blas= in order to use some specific BLAS +# library . In order to link successfully, however, be aware that you +# will probably need to use the same Fortran compiler (which can be set +# via the F77 env. var.) as was used to compile the BLAS library. +# +# ACTION-IF-FOUND is a list of shell commands to run if a BLAS library is +# found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it is +# not found. If ACTION-IF-FOUND is not specified, the default action will +# define HAVE_BLAS. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 15 + +AU_ALIAS([ACX_BLAS], [AX_BLAS]) +AC_DEFUN([AX_BLAS], [ +AC_PREREQ(2.50) +# AC_REQUIRE([AC_F77_LIBRARY_LDFLAGS]) +AC_REQUIRE([AC_CANONICAL_HOST]) +ax_blas_ok=no + +AC_ARG_WITH(blas, + [AS_HELP_STRING([--with-blas=], [use BLAS library ])]) +case $with_blas in + yes | "") ;; + no) ax_blas_ok=disable ;; + -* | */* | *.a | *.so | *.so.* | *.o) BLAS_LIBS="$with_blas" ;; + *) BLAS_LIBS="-l$with_blas" ;; +esac + +OPENMP_LDFLAGS="$OPENMP_CXXFLAGS" + +# Get fortran linker names of BLAS functions to check for. +# AC_F77_FUNC(sgemm) +# AC_F77_FUNC(dgemm) +sgemm=sgemm_ +dgemm=dgemm_ + +ax_blas_save_LIBS="$LIBS" +LIBS="$LIBS $FLIBS" + +# First, check BLAS_LIBS environment variable +if test $ax_blas_ok = no; then +if test "x$BLAS_LIBS" != x; then + save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS" + AC_MSG_CHECKING([for $sgemm in $BLAS_LIBS]) + AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes], [BLAS_LIBS=""]) + AC_MSG_RESULT($ax_blas_ok) + LIBS="$save_LIBS" +fi +fi + +# BLAS linked to by default? (happens on some supercomputers) +if test $ax_blas_ok = no; then + save_LIBS="$LIBS"; LIBS="$LIBS" + AC_MSG_CHECKING([if $sgemm is being linked in already]) + AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes]) + AC_MSG_RESULT($ax_blas_ok) + LIBS="$save_LIBS" +fi + +# BLAS in Intel MKL library? +if test $ax_blas_ok = no; then + case $host_os in + darwin*) + AC_CHECK_LIB(mkl_intel_lp64, $sgemm, + [ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread"; OPENMP_LDFLAGS=""],, + [-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread]) + ;; + *) + if test $host_cpu = x86_64; then + AC_CHECK_LIB(mkl_intel_lp64, $sgemm, + [ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl"],, + [-lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl]) + elif test $host_cpu = i686; then + AC_CHECK_LIB(mkl_intel, $sgemm, + [ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl"],, + [-lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl]) + fi + ;; + esac +fi +# Old versions of MKL +if test $ax_blas_ok = no; then + AC_CHECK_LIB(mkl, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lmkl -lguide -lpthread"],,[-lguide -lpthread]) +fi + +# BLAS in OpenBLAS library? (http://xianyi.github.com/OpenBLAS/) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(openblas, $sgemm, [ax_blas_ok=yes + BLAS_LIBS="-lopenblas"]) +fi + +# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(atlas, ATL_xerbla, + [AC_CHECK_LIB(f77blas, $sgemm, + [AC_CHECK_LIB(cblas, cblas_dgemm, + [ax_blas_ok=yes + BLAS_LIBS="-lcblas -lf77blas -latlas"], + [], [-lf77blas -latlas])], + [], [-latlas])]) +fi + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(blas, $sgemm, + [AC_CHECK_LIB(dgemm, $dgemm, + [AC_CHECK_LIB(sgemm, $sgemm, + [ax_blas_ok=yes; BLAS_LIBS="-lsgemm -ldgemm -lblas"], + [], [-lblas])], + [], [-lblas])]) +fi + +# BLAS in Apple vecLib library? +if test $ax_blas_ok = no; then + save_LIBS="$LIBS"; LIBS="-framework vecLib $LIBS" + AC_MSG_CHECKING([for $sgemm in -framework vecLib]) + AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes;BLAS_LIBS="-framework vecLib"]) + AC_MSG_RESULT($ax_blas_ok) + LIBS="$save_LIBS" +fi + +# BLAS in Alpha CXML library? +if test $ax_blas_ok = no; then + AC_CHECK_LIB(cxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lcxml"]) +fi + +# BLAS in Alpha DXML library? 
(now called CXML, see above) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(dxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-ldxml"]) +fi + +# BLAS in Sun Performance library? +if test $ax_blas_ok = no; then + if test "x$GCC" != xyes; then # only works with Sun CC + AC_CHECK_LIB(sunmath, acosp, + [AC_CHECK_LIB(sunperf, $sgemm, + [BLAS_LIBS="-xlic_lib=sunperf -lsunmath" + ax_blas_ok=yes],[],[-lsunmath])]) + fi +fi + +# BLAS in SCSL library? (SGI/Cray Scientific Library) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(scs, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lscs"]) +fi + +# BLAS in SGIMATH library? +if test $ax_blas_ok = no; then + AC_CHECK_LIB(complib.sgimath, $sgemm, + [ax_blas_ok=yes; BLAS_LIBS="-lcomplib.sgimath"]) +fi + +# BLAS in IBM ESSL library? (requires generic BLAS lib, too) +if test $ax_blas_ok = no; then + AC_CHECK_LIB(blas, $sgemm, + [AC_CHECK_LIB(essl, $sgemm, + [ax_blas_ok=yes; BLAS_LIBS="-lessl -lblas"], + [], [-lblas $FLIBS])]) +fi + +# Generic BLAS library? +if test $ax_blas_ok = no; then + AC_CHECK_LIB(blas, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lblas"]) +fi + +AC_SUBST(BLAS_LIBS) +AC_SUBST(OPENMP_LDFLAGS) + +LIBS="$ax_blas_save_LIBS" + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$ax_blas_ok" = xyes; then + ifelse([$1],,AC_DEFINE(HAVE_BLAS,1,[Define if you have a BLAS library.]),[$1]) + : +else + ax_blas_ok=no + $2 +fi +])dnl AX_BLAS diff --git a/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 b/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 new file mode 100644 index 0000000000..fb1e080f19 --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 @@ -0,0 +1,26 @@ +# serial 1 + +AC_DEFUN([AX_CPU_ARCH], [ + +AC_MSG_CHECKING([for cpu arch]) + + AC_CANONICAL_TARGET + + case $target in + amd64-* | x86_64-*) + ARCH_CPUFLAGS="-mavx2 -mf16c -msse4 -mpopcnt" + ARCH_CXXFLAGS="-m64" + ;; + aarch64*-*) +dnl This is an arch for Nvidia Xavier a proper detection would be nice. + ARCH_CPUFLAGS="-march=armv8.2-a" + ;; + *) ;; + esac + +AC_MSG_RESULT([$target CPUFLAGS+="$ARCH_CPUFLAGS" CXXFLAGS+="$ARCH_CXXFLAGS"]) + +AC_SUBST(ARCH_CPUFLAGS) +AC_SUBST(ARCH_CXXFLAGS) + +])dnl diff --git a/core/src/index/thirdparty/faiss/acinclude/ax_cxx_compile_stdcxx.m4 b/core/src/index/thirdparty/faiss/acinclude/ax_cxx_compile_stdcxx.m4 new file mode 100644 index 0000000000..0b6cb3a7d7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/ax_cxx_compile_stdcxx.m4 @@ -0,0 +1,972 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the specified +# version of the C++ standard. If necessary, add switches to CXX and +# CXXCPP to enable support. VERSION may be '11' (for the C++11 standard) +# or '14' (for the C++14 standard). +# +# The second argument, if specified, indicates whether you insist on an +# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. +# -std=c++11). If neither is specified, you get whatever works, with +# preference for an extended mode. 
+# +# The third argument, if specified 'mandatory' or if left unspecified, +# indicates that baseline support for the specified C++ standard is +# required and that the macro should error out if no mode with that +# support is found. If specified 'optional', then configuration proceeds +# regardless, after defining HAVE_CXX${VERSION} if and only if a +# supporting mode is found. +# +# LICENSE +# +# Copyright (c) 2008 Benjamin Kosnik +# Copyright (c) 2012 Zack Weinberg +# Copyright (c) 2013 Roy Stogner +# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov +# Copyright (c) 2015 Paul Norman +# Copyright (c) 2015 Moritz Klammler +# Copyright (c) 2016, 2018 Krzesimir Nowak +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 9 + +dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro +dnl (serial version number 13). + +AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl + m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], + [$1], [14], [ax_cxx_compile_alternatives="14 1y"], + [$1], [17], [ax_cxx_compile_alternatives="17 1z"], + [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl + m4_if([$2], [], [], + [$2], [ext], [], + [$2], [noext], [], + [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl + m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], + [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], + [$3], [optional], [ax_cxx_compile_cxx$1_required=false], + [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) + AC_LANG_PUSH([C++])dnl + ac_success=no + + m4_if([$2], [noext], [], [dnl + if test x$ac_success = xno; then + for alternative in ${ax_cxx_compile_alternatives}; do + switch="-std=gnu++${alternative}" + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, + $cachevar, + [ac_save_CXX="$CXX" + CXX="$CXX $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXX="$ac_save_CXX"]) + if eval test x\$$cachevar = xyes; then + CXX="$CXX $switch" + if test -n "$CXXCPP" ; then + CXXCPP="$CXXCPP $switch" + fi + ac_success=yes + break + fi + done + fi]) + + m4_if([$2], [ext], [], [dnl + if test x$ac_success = xno; then + dnl HP's aCC needs +std=c++11 according to: + dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf + dnl Cray's crayCC needs "-h std=c++11" + for alternative in ${ax_cxx_compile_alternatives}; do + for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, + $cachevar, + [ac_save_CXX="$CXX" + CXX="$CXX $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXX="$ac_save_CXX"]) + if eval test x\$$cachevar = xyes; then + CXX="$CXX $switch" + if test -n "$CXXCPP" ; then + CXXCPP="$CXXCPP $switch" + fi + ac_success=yes + break + fi + done + if test x$ac_success = xyes; then + break + fi + done + fi]) + AC_LANG_POP([C++]) + if test x$ax_cxx_compile_cxx$1_required = xtrue; then + if test x$ac_success = xno; then + AC_MSG_ERROR([*** A compiler with support for C++$1 language features is 
required.]) + fi + fi + if test x$ac_success = xno; then + HAVE_CXX$1=0 + AC_MSG_NOTICE([No compiler with C++$1 support was found]) + else + HAVE_CXX$1=1 + AC_DEFINE(HAVE_CXX$1,1, + [define if the compiler supports basic C++$1 syntax]) + fi + AC_SUBST(HAVE_CXX$1) +]) + + +dnl Test body for checking C++11 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 +) + + +dnl Test body for checking C++14 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 +) + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 +) + +dnl Tests for new features in C++11 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ + +// If the compiler admits that it is not ready for C++11, why torture it? +// Hopefully, this will speed up the test. + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 201103L + +#error "This is not a C++11 compiler" + +#else + +namespace cxx11 +{ + + namespace test_static_assert + { + + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + } + + namespace test_final_override + { + + struct Base + { + virtual void f() {} + }; + + struct Derived : public Base + { + virtual void f() override {} + }; + + } + + namespace test_double_right_angle_brackets + { + + template < typename T > + struct check {}; + + typedef check single_type; + typedef check> double_type; + typedef check>> triple_type; + typedef check>>> quadruple_type; + + } + + namespace test_decltype + { + + int + f() + { + int a = 1; + decltype(a) b = 2; + return a + b; + } + + } + + namespace test_type_deduction + { + + template < typename T1, typename T2 > + struct is_same + { + static const bool value = false; + }; + + template < typename T > + struct is_same + { + static const bool value = true; + }; + + template < typename T1, typename T2 > + auto + add(T1 a1, T2 a2) -> decltype(a1 + a2) + { + return a1 + a2; + } + + int + test(const int c, volatile int v) + { + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == false, ""); + auto ac = c; + auto av = v; + auto sumi = ac + av + 'x'; + auto sumf = ac + av + 1.0; + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == true, ""); + return (sumf > 0.0) ? sumi : add(c, v); + } + + } + + namespace test_noexcept + { + + int f() { return 0; } + int g() noexcept { return 0; } + + static_assert(noexcept(f()) == false, ""); + static_assert(noexcept(g()) == true, ""); + + } + + namespace test_constexpr + { + + template < typename CharT > + unsigned long constexpr + strlen_c_r(const CharT *const s, const unsigned long acc) noexcept + { + return *s ? 
strlen_c_r(s + 1, acc + 1) : acc; + } + + template < typename CharT > + unsigned long constexpr + strlen_c(const CharT *const s) noexcept + { + return strlen_c_r(s, 0UL); + } + + static_assert(strlen_c("") == 0UL, ""); + static_assert(strlen_c("1") == 1UL, ""); + static_assert(strlen_c("example") == 7UL, ""); + static_assert(strlen_c("another\0example") == 7UL, ""); + + } + + namespace test_rvalue_references + { + + template < int N > + struct answer + { + static constexpr int value = N; + }; + + answer<1> f(int&) { return answer<1>(); } + answer<2> f(const int&) { return answer<2>(); } + answer<3> f(int&&) { return answer<3>(); } + + void + test() + { + int i = 0; + const int c = 0; + static_assert(decltype(f(i))::value == 1, ""); + static_assert(decltype(f(c))::value == 2, ""); + static_assert(decltype(f(0))::value == 3, ""); + } + + } + + namespace test_uniform_initialization + { + + struct test + { + static const int zero {}; + static const int one {1}; + }; + + static_assert(test::zero == 0, ""); + static_assert(test::one == 1, ""); + + } + + namespace test_lambdas + { + + void + test1() + { + auto lambda1 = [](){}; + auto lambda2 = lambda1; + lambda1(); + lambda2(); + } + + int + test2() + { + auto a = [](int i, int j){ return i + j; }(1, 2); + auto b = []() -> int { return '0'; }(); + auto c = [=](){ return a + b; }(); + auto d = [&](){ return c; }(); + auto e = [a, &b](int x) mutable { + const auto identity = [](int y){ return y; }; + for (auto i = 0; i < a; ++i) + a += b--; + return x + identity(a + b); + }(0); + return a + b + c + d + e; + } + + int + test3() + { + const auto nullary = [](){ return 0; }; + const auto unary = [](int x){ return x; }; + using nullary_t = decltype(nullary); + using unary_t = decltype(unary); + const auto higher1st = [](nullary_t f){ return f(); }; + const auto higher2nd = [unary](nullary_t f1){ + return [unary, f1](unary_t f2){ return f2(unary(f1())); }; + }; + return higher1st(nullary) + higher2nd(nullary)(unary); + } + + } + + namespace test_variadic_templates + { + + template + struct sum; + + template + struct sum + { + static constexpr auto value = N0 + sum::value; + }; + + template <> + struct sum<> + { + static constexpr auto value = 0; + }; + + static_assert(sum<>::value == 0, ""); + static_assert(sum<1>::value == 1, ""); + static_assert(sum<23>::value == 23, ""); + static_assert(sum<1, 2>::value == 3, ""); + static_assert(sum<5, 5, 11>::value == 21, ""); + static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); + + } + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function + // because of this. + namespace test_template_alias_sfinae + { + + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) {} + + template + void func(member*) {} + + void test(); + + void test() { func(0); } + + } + +} // namespace cxx11 + +#endif // __cplusplus >= 201103L + +]]) + + +dnl Tests for new features in C++14 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ + +// If the compiler admits that it is not ready for C++14, why torture it? +// Hopefully, this will speed up the test. + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 201402L + +#error "This is not a C++14 compiler" + +#else + +namespace cxx14 +{ + + namespace test_polymorphic_lambdas + { + + int + test() + { + const auto lambda = [](auto&&... args){ + const auto istiny = [](auto x){ + return (sizeof(x) == 1UL) ? 
1 : 0; + }; + const int aretiny[] = { istiny(args)... }; + return aretiny[0]; + }; + return lambda(1, 1L, 1.0f, '1'); + } + + } + + namespace test_binary_literals + { + + constexpr auto ivii = 0b0000000000101010; + static_assert(ivii == 42, "wrong value"); + + } + + namespace test_generalized_constexpr + { + + template < typename CharT > + constexpr unsigned long + strlen_c(const CharT *const s) noexcept + { + auto length = 0UL; + for (auto p = s; *p; ++p) + ++length; + return length; + } + + static_assert(strlen_c("") == 0UL, ""); + static_assert(strlen_c("x") == 1UL, ""); + static_assert(strlen_c("test") == 4UL, ""); + static_assert(strlen_c("another\0test") == 7UL, ""); + + } + + namespace test_lambda_init_capture + { + + int + test() + { + auto x = 0; + const auto lambda1 = [a = x](int b){ return a + b; }; + const auto lambda2 = [a = lambda1(x)](){ return a; }; + return lambda2(); + } + + } + + namespace test_digit_separators + { + + constexpr auto ten_million = 100'000'000; + static_assert(ten_million == 100000000, ""); + + } + + namespace test_return_type_deduction + { + + auto f(int& x) { return x; } + decltype(auto) g(int& x) { return x; } + + template < typename T1, typename T2 > + struct is_same + { + static constexpr auto value = false; + }; + + template < typename T > + struct is_same + { + static constexpr auto value = true; + }; + + int + test() + { + auto x = 0; + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + return x; + } + + } + +} // namespace cxx14 + +#endif // __cplusplus >= 201402L + +]]) + + +dnl Tests for new features in C++17 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ + +// If the compiler admits that it is not ready for C++17, why torture it? +// Hopefully, this will speed up the test. + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus <= 201402L + +#error "This is not a C++17 compiler" + +#else + +#if defined(__clang__) + #define REALLY_CLANG +#else + #if defined(__GNUC__) + #define REALLY_GCC + #endif +#endif + +#include +#include +#include + +namespace cxx17 +{ + +#if !defined(REALLY_CLANG) + namespace test_constexpr_lambdas + { + + // TODO: test it with clang++ from git + + constexpr int foo = [](){return 42;}(); + + } +#endif // !defined(REALLY_CLANG) + + namespace test::nested_namespace::definitions + { + + } + + namespace test_fold_expression + { + + template + int multiply(Args... args) + { + return (args * ... * 1); + } + + template + bool all(Args... 
args) + { + return (args && ...); + } + + } + + namespace test_extended_static_assert + { + + static_assert (true); + + } + + namespace test_auto_brace_init_list + { + + auto foo = {5}; + auto bar {5}; + + static_assert(std::is_same, decltype(foo)>::value); + static_assert(std::is_same::value); + } + + namespace test_typename_in_template_template_parameter + { + + template typename X> struct D; + + } + + namespace test_fallthrough_nodiscard_maybe_unused_attributes + { + + int f1() + { + return 42; + } + + [[nodiscard]] int f2() + { + [[maybe_unused]] auto unused = f1(); + + switch (f1()) + { + case 17: + f1(); + [[fallthrough]]; + case 42: + f1(); + } + return f1(); + } + + } + + namespace test_extended_aggregate_initialization + { + + struct base1 + { + int b1, b2 = 42; + }; + + struct base2 + { + base2() { + b3 = 42; + } + int b3; + }; + + struct derived : base1, base2 + { + int d; + }; + + derived d1 {{1, 2}, {}, 4}; // full initialization + derived d2 {{}, {}, 4}; // value-initialized bases + + } + + namespace test_general_range_based_for_loop + { + + struct iter + { + int i; + + int& operator* () + { + return i; + } + + const int& operator* () const + { + return i; + } + + iter& operator++() + { + ++i; + return *this; + } + }; + + struct sentinel + { + int i; + }; + + bool operator== (const iter& i, const sentinel& s) + { + return i.i == s.i; + } + + bool operator!= (const iter& i, const sentinel& s) + { + return !(i == s); + } + + struct range + { + iter begin() const + { + return {0}; + } + + sentinel end() const + { + return {5}; + } + }; + + void f() + { + range r {}; + + for (auto i : r) + { + [[maybe_unused]] auto v = i; + } + } + + } + + namespace test_lambda_capture_asterisk_this_by_value + { + + struct t + { + int i; + int foo() + { + return [*this]() + { + return i; + }(); + } + }; + + } + + namespace test_enum_class_construction + { + + enum class byte : unsigned char + {}; + + byte foo {42}; + + } + + namespace test_constexpr_if + { + + template + int f () + { + if constexpr(cond) + { + return 13; + } + else + { + return 42; + } + } + + } + + namespace test_selection_statement_with_initializer + { + + int f() + { + return 13; + } + + int f2() + { + if (auto i = f(); i > 0) + { + return 3; + } + + switch (auto i = f(); i + 4) + { + case 17: + return 2; + + default: + return 1; + } + } + + } + +#if !defined(REALLY_CLANG) + namespace test_template_argument_deduction_for_class_templates + { + + // TODO: test it with clang++ from git + + template + struct pair + { + pair (T1 p1, T2 p2) + : m1 {p1}, + m2 {p2} + {} + + T1 m1; + T2 m2; + }; + + void f() + { + [[maybe_unused]] auto p = pair{13, 42u}; + } + + } +#endif // !defined(REALLY_CLANG) + + namespace test_non_type_auto_template_parameters + { + + template + struct B + {}; + + B<5> b1; + B<'a'> b2; + + } + +#if !defined(REALLY_CLANG) + namespace test_structured_bindings + { + + // TODO: test it with clang++ from git + + int arr[2] = { 1, 2 }; + std::pair pr = { 1, 2 }; + + auto f1() -> int(&)[2] + { + return arr; + } + + auto f2() -> std::pair& + { + return pr; + } + + struct S + { + int x1 : 2; + volatile double y1; + }; + + S f3() + { + return {}; + } + + auto [ x1, y1 ] = f1(); + auto& [ xr1, yr1 ] = f1(); + auto [ x2, y2 ] = f2(); + auto& [ xr2, yr2 ] = f2(); + const auto [ x3, y3 ] = f3(); + + } +#endif // !defined(REALLY_CLANG) + +#if !defined(REALLY_CLANG) + namespace test_exception_spec_type_system + { + + // TODO: test it with clang++ from git + + struct Good {}; + struct Bad {}; + + void g1() noexcept; + void g2(); 
+ + template + Bad + f(T*, T*); + + template + Good + f(T1*, T2*); + + static_assert (std::is_same_v); + + } +#endif // !defined(REALLY_CLANG) + + namespace test_inline_variables + { + + template void f(T) + {} + + template inline T g(T) + { + return T{}; + } + + template<> inline void f<>(int) + {} + + template<> int g<>(int) + { + return 5; + } + + } + +} // namespace cxx17 + +#endif // __cplusplus <= 201402L + +]]) diff --git a/core/src/index/thirdparty/faiss/acinclude/ax_lapack.m4 b/core/src/index/thirdparty/faiss/acinclude/ax_lapack.m4 new file mode 100644 index 0000000000..4993f29b9c --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/ax_lapack.m4 @@ -0,0 +1,132 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_lapack.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_LAPACK([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) +# +# DESCRIPTION +# +# This macro looks for a library that implements the LAPACK linear-algebra +# interface (see http://www.netlib.org/lapack/). On success, it sets the +# LAPACK_LIBS output variable to hold the requisite library linkages. +# +# To link with LAPACK, you should link with: +# +# $LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS +# +# in that order. BLAS_LIBS is the output variable of the AX_BLAS macro, +# called automatically. FLIBS is the output variable of the +# AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is +# sometimes necessary in order to link with F77 libraries. Users will also +# need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same +# reason. +# +# The user may also use --with-lapack= in order to use some specific +# LAPACK library . In order to link successfully, however, be aware +# that you will probably need to use the same Fortran compiler (which can +# be set via the F77 env. var.) as was used to compile the LAPACK and BLAS +# libraries. +# +# ACTION-IF-FOUND is a list of shell commands to run if a LAPACK library +# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it +# is not found. If ACTION-IF-FOUND is not specified, the default action +# will define HAVE_LAPACK. +# +# LICENSE +# +# Copyright (c) 2009 Steven G. Johnson +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. 
+# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 8 + +AU_ALIAS([ACX_LAPACK], [AX_LAPACK]) +AC_DEFUN([AX_LAPACK], [ +AC_REQUIRE([AX_BLAS]) +ax_lapack_ok=no + +AC_ARG_WITH(lapack, + [AS_HELP_STRING([--with-lapack=], [use LAPACK library ])]) +case $with_lapack in + yes | "") ;; + no) ax_lapack_ok=disable ;; + -* | */* | *.a | *.so | *.so.* | *.o) LAPACK_LIBS="$with_lapack" ;; + *) LAPACK_LIBS="-l$with_lapack" ;; +esac + +# Get fortran linker name of LAPACK function to check for. +# AC_F77_FUNC(cheev) +cheev=cheev_ + +# We cannot use LAPACK if BLAS is not found +if test "x$ax_blas_ok" != xyes; then + ax_lapack_ok=noblas + LAPACK_LIBS="" +fi + +# First, check LAPACK_LIBS environment variable +if test "x$LAPACK_LIBS" != x; then + save_LIBS="$LIBS"; LIBS="$LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS" + AC_MSG_CHECKING([for $cheev in $LAPACK_LIBS]) + AC_TRY_LINK_FUNC($cheev, [ax_lapack_ok=yes], [LAPACK_LIBS=""]) + AC_MSG_RESULT($ax_lapack_ok) + LIBS="$save_LIBS" + if test $ax_lapack_ok = no; then + LAPACK_LIBS="" + fi +fi + +# LAPACK linked to by default? (is sometimes included in BLAS lib) +if test $ax_lapack_ok = no; then + save_LIBS="$LIBS"; LIBS="$LIBS $BLAS_LIBS $FLIBS" + AC_CHECK_FUNC($cheev, [ax_lapack_ok=yes]) + LIBS="$save_LIBS" +fi + +# Generic LAPACK library? +for lapack in lapack lapack_rs6k; do + if test $ax_lapack_ok = no; then + save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS" + AC_CHECK_LIB($lapack, $cheev, + [ax_lapack_ok=yes; LAPACK_LIBS="-l$lapack"], [], [$FLIBS]) + LIBS="$save_LIBS" + fi +done + +AC_SUBST(LAPACK_LIBS) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$ax_lapack_ok" = xyes; then + ifelse([$1],,AC_DEFINE(HAVE_LAPACK,1,[Define if you have LAPACK library.]),[$1]) + : +else + ax_lapack_ok=no + $2 +fi +])dnl AX_LAPACK diff --git a/core/src/index/thirdparty/faiss/acinclude/fa_check_cuda.m4 b/core/src/index/thirdparty/faiss/acinclude/fa_check_cuda.m4 new file mode 100644 index 0000000000..f730bc23e2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/fa_check_cuda.m4 @@ -0,0 +1,67 @@ +AC_DEFUN([FA_CHECK_CUDA], [ + +AC_ARG_WITH(cuda, + [AS_HELP_STRING([--with-cuda=], [prefix of the CUDA installation])]) +AC_ARG_WITH(cuda-arch, + [AS_HELP_STRING([--with-cuda-arch=], [device specific -gencode flags])], + [], + [with_cuda_arch=default]) + +if test x$with_cuda != xno; then + if test x$with_cuda != x; then + cuda_prefix=$with_cuda + AC_CHECK_PROG(NVCC, [nvcc], [$cuda_prefix/bin/nvcc], [], [$cuda_prefix/bin]) + NVCC_CPPFLAGS="-I$cuda_prefix/include" + NVCC_LDFLAGS="-L$cuda_prefix/lib64" + else + AC_CHECK_PROGS(NVCC, [nvcc /usr/local/cuda/bin/nvcc], []) + if test "x$NVCC" == "x/usr/local/cuda/bin/nvcc"; then + cuda_prefix="/usr/local/cuda" + NVCC_CPPFLAGS="-I$cuda_prefix/include" + NVCC_LDFLAGS="-L$cuda_prefix/lib64" + else + cuda_prefix="" + NVCC_CPPFLAGS="" + NVCC_LDFLAGS="" + fi + fi + + if test "x$NVCC" == x; then + AC_MSG_ERROR([Couldn't find nvcc]) + fi + + if test "x$with_cuda_arch" == xdefault; then + with_cuda_arch="-gencode=arch=compute_35,code=compute_35 \\ +-gencode=arch=compute_52,code=compute_52 \\ +-gencode=arch=compute_60,code=compute_60 \\ +-gencode=arch=compute_61,code=compute_61 \\ +-gencode=arch=compute_70,code=compute_70 \\ +-gencode=arch=compute_75,code=compute_75" + fi + + 
fa_save_CPPFLAGS="$CPPFLAGS" + fa_save_LDFLAGS="$LDFLAGS" + fa_save_LIBS="$LIBS" + + CPPFLAGS="$NVCC_CPPFLAGS $CPPFLAGS" + LDFLAGS="$NVCC_LDFLAGS $LDFLAGS" + + AC_CHECK_HEADER([cuda.h], [], AC_MSG_FAILURE([Couldn't find cuda.h])) + AC_CHECK_LIB([cublas], [cublasAlloc], [], AC_MSG_FAILURE([Couldn't find libcublas])) + AC_CHECK_LIB([cudart], [cudaSetDevice], [], AC_MSG_FAILURE([Couldn't find libcudart])) + + NVCC_LIBS="$LIBS" + NVCC_CPPFLAGS="$CPPFLAGS" + NVCC_LDFLAGS="$LDFLAGS" + CPPFLAGS="$fa_save_CPPFLAGS" + LDFLAGS="$fa_save_LDFLAGS" + LIBS="$fa_save_LIBS" +fi + +AC_SUBST(NVCC) +AC_SUBST(NVCC_CPPFLAGS) +AC_SUBST(NVCC_LDFLAGS) +AC_SUBST(NVCC_LIBS) +AC_SUBST(CUDA_PREFIX, $cuda_prefix) +AC_SUBST(CUDA_ARCH, $with_cuda_arch) +]) diff --git a/core/src/index/thirdparty/faiss/acinclude/fa_numpy.m4 b/core/src/index/thirdparty/faiss/acinclude/fa_numpy.m4 new file mode 100644 index 0000000000..6e3dcde531 --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/fa_numpy.m4 @@ -0,0 +1,20 @@ +AC_DEFUN([FA_NUMPY], [ +AC_REQUIRE([FA_PYTHON]) + +AC_MSG_CHECKING([for numpy headers path]) + +fa_numpy_headers=`$PYTHON -c "import numpy; print(numpy.get_include())"` + +if test $? == 0; then + if test x$fa_numpy_headers != x; then + AC_MSG_RESULT($fa_numpy_headers) + AC_SUBST(NUMPY_INCLUDE, $fa_numpy_headers) + else + AC_MSG_RESULT([not found]) + AC_MSG_WARN([You won't be able to build the python interface.]) + fi +else + AC_MSG_RESULT([not found]) + AC_MSG_WARN([You won't be able to build the python interface.]) +fi +])dnl diff --git a/core/src/index/thirdparty/faiss/acinclude/fa_prog_nm.m4 b/core/src/index/thirdparty/faiss/acinclude/fa_prog_nm.m4 new file mode 100644 index 0000000000..f450ba7645 --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/fa_prog_nm.m4 @@ -0,0 +1,16 @@ +dnl +dnl Check for an nm(1) utility. 
+dnl +AC_DEFUN([FA_PROG_NM], +[ + case "${NM-unset}" in + unset) AC_CHECK_PROGS(NM, nm, nm) ;; + *) AC_CHECK_PROGS(NM, $NM nm, nm) ;; + esac + AC_MSG_CHECKING(nm flags) + case "${NMFLAGS-unset}" in + unset) NMFLAGS= ;; + esac + AC_MSG_RESULT($NMFLAGS) + AC_SUBST(NMFLAGS) +]) diff --git a/core/src/index/thirdparty/faiss/acinclude/fa_prog_swig.m4 b/core/src/index/thirdparty/faiss/acinclude/fa_prog_swig.m4 new file mode 100644 index 0000000000..1e6ab8e49d --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/fa_prog_swig.m4 @@ -0,0 +1,11 @@ +AC_DEFUN([FA_PROG_SWIG], [ + +AC_ARG_WITH(swig, +[AS_HELP_STRING([--with-swig=], [use SWIG binary ])]) +case $with_swig in + "") AC_CHECK_PROG(SWIG, swig, swig);; + *) SWIG="$with_swig" +esac + +AC_SUBST(SWIG) +]) diff --git a/core/src/index/thirdparty/faiss/acinclude/fa_python.m4 b/core/src/index/thirdparty/faiss/acinclude/fa_python.m4 new file mode 100644 index 0000000000..a58a9d15ec --- /dev/null +++ b/core/src/index/thirdparty/faiss/acinclude/fa_python.m4 @@ -0,0 +1,21 @@ +AC_DEFUN([FA_PYTHON], [ + +AC_ARG_WITH(python, + [AS_HELP_STRING([--with-python=], [use Python binary ])]) +case $with_python in + "") PYTHON_BIN=python ;; + *) PYTHON_BIN="$with_python" +esac + +AC_CHECK_PROG(PYTHON, $PYTHON_BIN, $PYTHON_BIN) +fa_python_bin=$PYTHON + +AC_MSG_CHECKING([for Python C flags]) +fa_python_cflags=`$PYTHON -c " +import sysconfig +paths = [['-I' + sysconfig.get_path(p) for p in ['include', 'platinclude']]] +print(' '.join(paths))"` +AC_MSG_RESULT($fa_python_cflags) +AC_SUBST(PYTHON_CFLAGS, "$PYTHON_CFLAGS $fa_python_cflags") + +])dnl FA_PYTHON diff --git a/core/src/index/thirdparty/faiss/benchs/README.md b/core/src/index/thirdparty/faiss/benchs/README.md new file mode 100644 index 0000000000..7e95a7673d --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/README.md @@ -0,0 +1,338 @@ + +# Benchmarking scripts + +This directory contains benchmarking scripts that can reproduce the +numbers reported in the two papers + +``` +@inproceedings{DJP16, + Author = {Douze, Matthijs and J{\'e}gou, Herv{\'e} and Perronnin, Florent}, + Booktitle = "ECCV", + Organization = {Springer}, + Title = {Polysemous codes}, + Year = {2016} +} +``` +and + +``` +@inproceedings{JDJ17, + Author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou}, + journal= {arXiv:1702.08734},, + Title = {Billion-scale similarity search with GPUs}, + Year = {2017}, +} +``` + +Note that the numbers (especially timings) change slightly due to changes in the implementation, different machines, etc. + +The scripts are self-contained. They depend only on Faiss and external training data that should be stored in sub-directories. + +## SIFT1M experiments + +The script [`bench_polysemous_sift1m.py`](bench_polysemous_sift1m.py) reproduces the numbers in +Figure 3 from the "Polysemous" paper. + +### Getting SIFT1M + +To run it, please download the ANN_SIFT1M dataset from + +http://corpus-texmex.irisa.fr/ + +and unzip it to the subdirectory sift1M. 
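+
+The `.fvecs`/`.ivecs` files use the TEXMEX layout: each record is a little-endian
+int32 dimension followed by the vector components. As a minimal, self-contained
+sketch (the benchmark scripts ship their own equivalent `fvecs_read`/`ivecs_read`
+helpers), the files unpacked into `sift1M/` can be loaded with numpy:
+
+```
+import numpy as np
+
+def ivecs_read(fname):
+    # one record = int32 dimension d, followed by d int32 values
+    a = np.fromfile(fname, dtype='int32')
+    d = a[0]
+    return a.reshape(-1, d + 1)[:, 1:].copy()
+
+def fvecs_read(fname):
+    # same record layout, but the payload is float32
+    return ivecs_read(fname).view('float32')
+
+xt = fvecs_read("sift1M/sift_learn.fvecs")         # training vectors
+xb = fvecs_read("sift1M/sift_base.fvecs")          # database vectors
+xq = fvecs_read("sift1M/sift_query.fvecs")         # query vectors
+gt = ivecs_read("sift1M/sift_groundtruth.ivecs")   # ground-truth neighbor ids
+```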
+ +### Result + +The output looks like: + +``` +PQ training on 100000 points, remains 0 points: training polysemous on centroids +add vectors to index +PQ baseline 7.517 ms per query, R@1 0.4474 +Polysemous 64 9.875 ms per query, R@1 0.4474 +Polysemous 62 8.358 ms per query, R@1 0.4474 +Polysemous 58 5.531 ms per query, R@1 0.4474 +Polysemous 54 3.420 ms per query, R@1 0.4478 +Polysemous 50 2.182 ms per query, R@1 0.4475 +Polysemous 46 1.621 ms per query, R@1 0.4408 +Polysemous 42 1.448 ms per query, R@1 0.4174 +Polysemous 38 1.331 ms per query, R@1 0.3563 +Polysemous 34 1.334 ms per query, R@1 0.2661 +Polysemous 30 1.272 ms per query, R@1 0.1794 +``` + + +## Experiments on 1B elements dataset + +The script [`bench_polysemous_1bn.py`](bench_polysemous_1bn.py) reproduces a few experiments on +two datasets of size 1B from the Polysemous codes" paper. + + +### Getting BIGANN + +Download the four files of ANN_SIFT1B from +http://corpus-texmex.irisa.fr/ to subdirectory bigann/ + +### Getting Deep1B + +The ground-truth and queries are available here + +https://yadi.sk/d/11eDCm7Dsn9GA + +For the learning and database vectors, use the script + +https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py + +to download the data to subdirectory deep1b/, then concatenate the +database files to base.fvecs and the training files to learn.fvecs + +### Running the experiments + +These experiments are quite long. To support resuming, the script +stores the result of training to a temporary directory, `/tmp/bench_polysemous`. + +The script `bench_polysemous_1bn.py` takes at least two arguments: + +- the dataset name: SIFT1000M (aka SIFT1B, aka BIGANN) or Deep1B. SIFT1M, SIFT2M,... are also supported to make subsets of for small experiments (note that SIFT1M as a subset of SIFT1B is not the same as the SIFT1M above) + +- the type of index to build, which should be a valid [index_factory key](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning#index-factory) (see below for examples) + +- the remaining arguments are parsed as search-time parameters. + +### Experiments of Table 2 + +The `IMI*+PolyD+ADC` results in Table 2 can be reproduced with (for 16 bytes): + +``` +python bench_polysemous_1bn.par SIFT1000M IMI2x12,PQ16 nprobe=16,max_codes={10000,30000},ht={44..54} +``` + +Training takes about 2 minutes and adding vectors to the dataset +takes 3.1 h. These operations are multithreaded. Note that in the command +above, we use bash's [brace expansion](https://www.gnu.org/software/bash/manual/html_node/Brace-Expansion.html) to set a grid of parameters. 
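+
+Each expanded combination such as `nprobe=16,max_codes=10000,ht=48` is applied to the
+index as a search-time parameter string before searching. A scaled-down sketch of that
+mechanism, using a hypothetical `IVF256,PQ16` index on random data instead of the real
+SIFT1B setup and assuming only standard `faiss.ParameterSpace` behaviour:
+
+```
+import numpy as np
+import faiss
+
+d = 128
+index = faiss.index_factory(d, "IVF256,PQ16")   # small stand-in for IMI2x12,PQ16
+
+rs = np.random.RandomState(123)
+xt = rs.rand(20000, d).astype('float32')
+index.train(xt)
+index.add(xt)
+
+# apply one grid point of the expanded parameter string, then search
+faiss.ParameterSpace().set_index_parameters(index, "nprobe=16,max_codes=10000,ht=48")
+D, I = index.search(xt[:5], 10)
+```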
+ +The search is *not* multithreaded, and the output looks like: + +``` + R@1 R@10 R@100 time %pass +nprobe=16,max_codes=10000,ht=44 0.1779 0.2994 0.3139 0.194 12.45 +nprobe=16,max_codes=10000,ht=45 0.1859 0.3183 0.3339 0.197 14.24 +nprobe=16,max_codes=10000,ht=46 0.1930 0.3366 0.3543 0.202 16.22 +nprobe=16,max_codes=10000,ht=47 0.1993 0.3550 0.3745 0.209 18.39 +nprobe=16,max_codes=10000,ht=48 0.2033 0.3694 0.3917 0.640 20.77 +nprobe=16,max_codes=10000,ht=49 0.2070 0.3839 0.4077 0.229 23.36 +nprobe=16,max_codes=10000,ht=50 0.2101 0.3949 0.4205 0.232 26.17 +nprobe=16,max_codes=10000,ht=51 0.2120 0.4042 0.4310 0.239 29.21 +nprobe=16,max_codes=10000,ht=52 0.2134 0.4113 0.4402 0.245 32.47 +nprobe=16,max_codes=10000,ht=53 0.2157 0.4184 0.4482 0.250 35.96 +nprobe=16,max_codes=10000,ht=54 0.2170 0.4240 0.4546 0.256 39.66 +nprobe=16,max_codes=30000,ht=44 0.1882 0.3327 0.3555 0.226 11.29 +nprobe=16,max_codes=30000,ht=45 0.1964 0.3525 0.3771 0.231 13.05 +nprobe=16,max_codes=30000,ht=46 0.2039 0.3713 0.3987 0.236 15.01 +nprobe=16,max_codes=30000,ht=47 0.2103 0.3907 0.4202 0.245 17.19 +nprobe=16,max_codes=30000,ht=48 0.2145 0.4055 0.4384 0.251 19.60 +nprobe=16,max_codes=30000,ht=49 0.2179 0.4198 0.4550 0.257 22.25 +nprobe=16,max_codes=30000,ht=50 0.2208 0.4305 0.4681 0.268 25.15 +nprobe=16,max_codes=30000,ht=51 0.2227 0.4402 0.4791 0.275 28.30 +nprobe=16,max_codes=30000,ht=52 0.2241 0.4473 0.4884 0.284 31.70 +nprobe=16,max_codes=30000,ht=53 0.2265 0.4544 0.4965 0.294 35.34 +nprobe=16,max_codes=30000,ht=54 0.2278 0.4601 0.5031 0.303 39.20 +``` + +The result reported in table 2 is the one for which the %pass (percentage of code comparisons that pass the Hamming check) is around 20%, which occurs for Hamming threshold `ht=48`. + +The 8-byte results can be reproduced with the factory key `IMI2x12,PQ8` + +### Experiments of the appendix + +The experiments in the appendix are only in the ArXiv version of the paper (table 3). + +``` +python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30} + + R@1 R@10 R@100 time %pass +nprobe=1,ht=20 0.0351 0.0616 0.0751 0.158 19.01 +... +nprobe=32,ht=28 0.1256 0.3563 0.5026 0.561 52.61 +... +``` +Here again the runs are not exactly the same but the original result was obtained from nprobe=32,ht=28. + +For Deep1B, we used a simple version of [auto-tuning](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning/_edit#auto-tuning-the-runtime-parameters) to sweep through the set of operating points: + +``` +python bench_polysemous_1bn.py Deep1B OPQ20_80,IMI2x14,PQ20 autotune +... +Done in 4067.555 s, available OPs: +Parameters 1-R@1 time + 0.0000 0.000 +nprobe=1,ht=22,max_codes=256 0.0215 3.115 +nprobe=1,ht=30,max_codes=256 0.0381 3.120 +... +nprobe=512,ht=68,max_codes=524288 0.4478 36.903 +nprobe=1024,ht=80,max_codes=131072 0.4557 46.363 +nprobe=1024,ht=78,max_codes=262144 0.4616 61.939 +... +``` +The original results were obtained with `nprobe=1024,ht=66,max_codes=262144`. + + +## GPU experiments + +The benchmarks below run 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss. + +### Search on SIFT1M + +See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers. 
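+
+The exact-search phase boils down to brute-force L2 search on the GPUs. A minimal
+sketch of that idea on random stand-in data (not the script itself, which loads
+`sift1M/`, warms up, and reports timings and R@1 against the ground truth):
+
+```
+import numpy as np
+import faiss
+
+d = 128
+rs = np.random.RandomState(0)
+xb = rs.rand(100000, d).astype('float32')   # stand-in for the SIFT1M base vectors
+xq = rs.rand(1000, d).astype('float32')     # stand-in for the queries
+
+# brute-force L2 index, copied to all visible GPUs
+index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
+index.add(xb)
+
+for k in 1, 10, 100:
+    D, I = index.search(xq, k)   # exact k-NN; the real benchmark times this
+```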
+ +The output is: +``` +============ Exact search +add vectors to index +warmup +benchmark +k=1 0.715 s, R@1 0.9914 +k=2 0.729 s, R@1 0.9935 +k=4 0.731 s, R@1 0.9935 +k=8 0.732 s, R@1 0.9935 +k=16 0.742 s, R@1 0.9935 +k=32 0.737 s, R@1 0.9935 +k=64 0.753 s, R@1 0.9935 +k=128 0.761 s, R@1 0.9935 +k=256 0.799 s, R@1 0.9935 +k=512 0.975 s, R@1 0.9935 +k=1024 1.424 s, R@1 0.9935 +============ Approximate search +train +WARNING clustering 100000 points to 4096 centroids: please provide at least 159744 training points +add vectors to index +WARN: increase temp memory to avoid cudaMalloc, or decrease query/add size (alloc 256000000 B, highwater 256000000 B) +warmup +benchmark +nprobe= 1 0.043 s recalls= 0.3909 0.4312 0.4312 +nprobe= 2 0.040 s recalls= 0.5041 0.5636 0.5636 +nprobe= 4 0.048 s recalls= 0.6048 0.6897 0.6897 +nprobe= 8 0.064 s recalls= 0.6879 0.8028 0.8028 +nprobe= 16 0.088 s recalls= 0.7534 0.8940 0.8940 +nprobe= 32 0.134 s recalls= 0.7957 0.9549 0.9550 +nprobe= 64 0.224 s recalls= 0.8125 0.9833 0.9834 +nprobe= 128 0.395 s recalls= 0.8205 0.9953 0.9954 +nprobe= 256 0.717 s recalls= 0.8227 0.9993 0.9994 +nprobe= 512 1.348 s recalls= 0.8228 0.9999 1.0000 +``` +The run produces two warnings: + +- the clustering complains that it does not have enough training data, there is not much we can do about this. + +- the add() function complains that there is an inefficient memory allocation, but this is a concern only when it happens often, and we are not benchmarking the add time anyways. + +To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives: + +``` +nprobe= 1 0.025 s recalls= 0.4084 0.4105 0.4105 +nprobe= 2 0.033 s recalls= 0.5235 0.5264 0.5264 +nprobe= 4 0.033 s recalls= 0.6332 0.6367 0.6367 +nprobe= 8 0.040 s recalls= 0.7358 0.7403 0.7403 +nprobe= 16 0.049 s recalls= 0.8273 0.8324 0.8324 +nprobe= 32 0.068 s recalls= 0.8957 0.9024 0.9024 +nprobe= 64 0.104 s recalls= 0.9477 0.9549 0.9549 +nprobe= 128 0.174 s recalls= 0.9760 0.9837 0.9837 +nprobe= 256 0.299 s recalls= 0.9866 0.9944 0.9944 +nprobe= 512 0.527 s recalls= 0.9907 0.9987 0.9987 +``` + +### Clustering on MNIST8m + +To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m` + +The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output: + +``` +python kmeans_mnist.py 1 256 +... +Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations + Preprocessing in 7.94526 s + Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0 +final objective: 1.449e+13 +total runtime: 140.615 s +``` + +### search on SIFT1B + +The script [`bench_gpu_1bn.py`](bench_gpu_1bn.py) runs multi-gpu searches on the two 1-billion vector datasets we considered. It is more complex than the previous scripts, because it supports many search options and decomposes the dataset build process in Python to exploit the best possible CPU/GPU parallelism and GPU distribution. + +Even on multiple GPUs, building the 1B datasets can last several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc. 
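+
+As a rough illustration of the sharded multi-GPU layout that the script automates,
+here is a sketch on random stand-in data with a scaled-down index key, assuming the
+standard faiss GPU cloner options (the real script additionally batches the add,
+supports replicas with `-R`, and can keep the index on the CPU during add with
+`-altadd`):
+
+```
+import numpy as np
+import faiss
+
+d = 128
+rs = np.random.RandomState(0)
+xt = rs.rand(100000, d).astype('float32')
+xb = rs.rand(500000, d).astype('float32')   # small stand-in for the 1B base set
+
+index = faiss.index_factory(d, "IVF4096,PQ8")   # scaled-down index key
+index.train(xt)
+
+# shard the inverted lists across all visible GPUs instead of replicating them
+co = faiss.GpuMultipleClonerOptions()
+co.shard = True
+gpu_index = faiss.index_cpu_to_all_gpus(index, co)
+gpu_index.add(xb)
+
+D, I = gpu_index.search(xb[:100], 10)
+```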
+ +The search results on SIFT1B in the "GPU paper" can be obtained with + + + +``` +python bench_gpu_1bn.py SIFT1000M OPQ8_32,IVF262144,PQ8 -nnn 10 -ngpu 1 -tempmem $[1536*1024*1024] +... +0/10000 (0.024 s) probe=1 : 0.161 s 1-R@1: 0.0752 1-R@10: 0.1924 +0/10000 (0.005 s) probe=2 : 0.150 s 1-R@1: 0.0964 1-R@10: 0.2693 +0/10000 (0.005 s) probe=4 : 0.153 s 1-R@1: 0.1102 1-R@10: 0.3328 +0/10000 (0.005 s) probe=8 : 0.170 s 1-R@1: 0.1220 1-R@10: 0.3827 +0/10000 (0.005 s) probe=16 : 0.196 s 1-R@1: 0.1290 1-R@10: 0.4151 +0/10000 (0.006 s) probe=32 : 0.244 s 1-R@1: 0.1314 1-R@10: 0.4345 +0/10000 (0.006 s) probe=64 : 0.353 s 1-R@1: 0.1332 1-R@10: 0.4461 +0/10000 (0.005 s) probe=128: 0.587 s 1-R@1: 0.1341 1-R@10: 0.4502 +0/10000 (0.006 s) probe=256: 1.160 s 1-R@1: 0.1342 1-R@10: 0.4511 +``` + +We use the `-tempmem` option to reduce the temporary memory allocation to 1.5G, otherwise the dataset does not fit in GPU memory + +### search on Deep1B + +The same script generates the GPU search results on Deep1B. + +``` +python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024] +... + +0/10000 (0.115 s) probe=1 : 0.239 s 1-R@1: 0.2387 1-R@10: 0.3420 +0/10000 (0.006 s) probe=2 : 0.103 s 1-R@1: 0.3110 1-R@10: 0.4623 +0/10000 (0.005 s) probe=4 : 0.105 s 1-R@1: 0.3772 1-R@10: 0.5862 +0/10000 (0.005 s) probe=8 : 0.116 s 1-R@1: 0.4235 1-R@10: 0.6889 +0/10000 (0.005 s) probe=16 : 0.133 s 1-R@1: 0.4517 1-R@10: 0.7693 +0/10000 (0.005 s) probe=32 : 0.168 s 1-R@1: 0.4713 1-R@10: 0.8281 +0/10000 (0.005 s) probe=64 : 0.238 s 1-R@1: 0.4841 1-R@10: 0.8649 +0/10000 (0.007 s) probe=128: 0.384 s 1-R@1: 0.4900 1-R@10: 0.8816 +0/10000 (0.005 s) probe=256: 0.736 s 1-R@1: 0.4933 1-R@10: 0.8912 +``` + +Here we are a bit tight on memory so we disable precomputed tables (`-noptables`) and restrict the amount of temporary memory. The `-altadd` option avoids GPU memory overflows during add. + + +### knn-graph on Deep1B + +The same script generates the KNN-graph on Deep1B. Note that the inverted file from above will not be re-used because the training sets are different. For the knngraph, the script will first do a pass over the whole dataset to compute the ground-truth knn for a subset of 10k nodes, for evaluation. + +``` +python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -altadd -knngraph -R 2 -noptables -tempmem $[1<<30] -ngpu 4 +... +CPU index contains 1000000000 vectors, move to GPU +Copy CPU index to 2 sharded GPU indexes + dispatch to GPUs 0:2 +IndexShards shard 0 indices 0:500000000 + IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0 +IndexShards shard 1 indices 500000000:1000000000 + IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0 + dispatch to GPUs 2:4 +IndexShards shard 0 indices 0:500000000 + IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0 +IndexShards shard 1 indices 500000000:1000000000 + IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0 +move to GPU done in 151.535 s +search... 
+999997440/1000000000 (8389.961 s, 0.3379) probe=1 : 8389.990 s rank-10 intersection results: 0.3379 +999997440/1000000000 (9205.934 s, 0.4079) probe=2 : 9205.966 s rank-10 intersection results: 0.4079 +999997440/1000000000 (9741.095 s, 0.4722) probe=4 : 9741.128 s rank-10 intersection results: 0.4722 +999997440/1000000000 (10830.420 s, 0.5256) probe=8 : 10830.455 s rank-10 intersection results: 0.5256 +999997440/1000000000 (12531.716 s, 0.5603) probe=16 : 12531.758 s rank-10 intersection results: 0.5603 +999997440/1000000000 (15922.519 s, 0.5825) probe=32 : 15922.571 s rank-10 intersection results: 0.5825 +999997440/1000000000 (22774.153 s, 0.5950) probe=64 : 22774.220 s rank-10 intersection results: 0.5950 +999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015 +999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047 +``` diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/README.md b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/README.md new file mode 100644 index 0000000000..2f7c76b5ac --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/README.md @@ -0,0 +1,20 @@ +# Benchmark of IVF variants + +This is a benchmark of IVF index variants, looking at compression vs. speed vs. accuracy. +The results are in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors) + + +The code is organized as: + +- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies + +- `bench_all_ivf.py`: evaluate one type of inverted file + +- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices. +Since the number of experiments is quite large the script is structued so that the benchmark can be run on a cluster. + +- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results. + +The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies. + +It was run in October 2018 for the results in the wiki. diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py new file mode 100644 index 0000000000..ee53018828 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py @@ -0,0 +1,308 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python2 + +import os +import sys +import time +import numpy as np +import faiss +import argparse +import datasets +from datasets import sanitize + +###################################################### +# Command-line parsing +###################################################### + + +parser = argparse.ArgumentParser() + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + +group = parser.add_argument_group('dataset options') + +aa('--db', default='deep1M', help='dataset') +aa('--compute_gt', default=False, action='store_true', + help='compute and store the groundtruth') + +group = parser.add_argument_group('index consturction') + +aa('--indexkey', default='HNSW32', help='index_factory type') +aa('--efConstruction', default=200, type=int, + help='HNSW construction factor') +aa('--M0', default=-1, type=int, help='size of base level') +aa('--maxtrain', default=256 * 256, type=int, + help='maximum number of training points (0 to set automatically)') +aa('--indexfile', default='', help='file to read or write index from') +aa('--add_bs', default=-1, type=int, + help='add elements index by batches of this size') +aa('--no_precomputed_tables', action='store_true', default=False, + help='disable precomputed tables (uses less memory)') +aa('--clustering_niter', default=-1, type=int, + help='number of clustering iterations (-1 = leave default)') +aa('--train_on_gpu', default=False, action='store_true', + help='do training on GPU') +aa('--get_centroids_from', default='', + help='get the centroids from this index (to speed up training)') + +group = parser.add_argument_group('searching') + +aa('--k', default=100, type=int, help='nb of nearest neighbors') +aa('--searchthreads', default=-1, type=int, + help='nb of threads to use at search time') +aa('--searchparams', nargs='+', default=['autotune'], + help="search parameters to use (can be autotune or a list of params)") +aa('--n_autotune', default=500, type=int, + help="max nb of autotune experiments") +aa('--autotune_max', default=[], nargs='*', + help='set max value for autotune variables format "var:val" (exclusive)') +aa('--autotune_range', default=[], nargs='*', + help='set complete autotune range, format "var:val1,val2,..."') +aa('--min_test_duration', default=0, type=float, + help='run test at least for so long to avoid jitter') + +args = parser.parse_args() + +print("args:", args) + +os.system('echo -n "nb processors "; ' + 'cat /proc/cpuinfo | grep ^processor | wc -l; ' + 'cat /proc/cpuinfo | grep ^"model name" | tail -1') + +###################################################### +# Load dataset +###################################################### + +xt, xb, xq, gt = datasets.load_data( + dataset=args.db, compute_gt=args.compute_gt) + + +print("dataset sizes: train %s base %s query %s GT %s" % ( + xt.shape, xb.shape, xq.shape, gt.shape)) + +nq, d = xq.shape +nb, d = xb.shape + + +###################################################### +# Make index +###################################################### + +if args.indexfile and os.path.exists(args.indexfile): + + print("reading", args.indexfile) + index = faiss.read_index(args.indexfile) + + if isinstance(index, faiss.IndexPreTransform): + index_ivf = faiss.downcast_index(index.index) + else: + index_ivf = index + assert isinstance(index_ivf, faiss.IndexIVF) + vec_transform = lambda x: x + assert isinstance(index_ivf, faiss.IndexIVF) + +else: + + print("build index, key=", args.indexkey) + + index = faiss.index_factory(d, args.indexkey) + + if 
isinstance(index, faiss.IndexPreTransform): + index_ivf = faiss.downcast_index(index.index) + vec_transform = index.chain.at(0).apply_py + else: + index_ivf = index + vec_transform = lambda x:x + assert isinstance(index_ivf, faiss.IndexIVF) + index_ivf.verbose = True + index_ivf.quantizer.verbose = True + index_ivf.cp.verbose = True + + maxtrain = args.maxtrain + if maxtrain == 0: + if 'IMI' in args.indexkey: + maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2)) + else: + maxtrain = 50 * index_ivf.nlist + print("setting maxtrain to %d" % maxtrain) + args.maxtrain = maxtrain + + xt2 = sanitize(xt[:args.maxtrain]) + assert np.all(np.isfinite(xt2)) + + print("train, size", xt2.shape) + + if args.get_centroids_from == '': + + if args.clustering_niter >= 0: + print(("setting nb of clustering iterations to %d" % + args.clustering_niter)) + index_ivf.cp.niter = args.clustering_niter + + if args.train_on_gpu: + print("add a training index on GPU") + train_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + index_ivf.clustering_index = train_index + + else: + print("Getting centroids from", args.get_centroids_from) + src_index = faiss.read_index(args.get_centroids_from) + src_quant = faiss.downcast_index(src_index.quantizer) + centroids = faiss.vector_to_array(src_quant.xb) + centroids = centroids.reshape(-1, d) + print(" centroid table shape", centroids.shape) + + if isinstance(index, faiss.IndexPreTransform): + print(" training vector transform") + assert index.chain.size() == 1 + vt = index.chain.at(0) + vt.train(xt2) + print(" transform centroids") + centroids = vt.apply_py(centroids) + + print(" add centroids to quantizer") + index_ivf.quantizer.add(centroids) + del src_index + + t0 = time.time() + index.train(xt2) + print(" train in %.3f s" % (time.time() - t0)) + + print("adding") + t0 = time.time() + if args.add_bs == -1: + index.add(sanitize(xb)) + else: + for i0 in range(0, nb, args.add_bs): + i1 = min(nb, i0 + args.add_bs) + print(" adding %d:%d / %d" % (i0, i1, nb)) + index.add(sanitize(xb[i0:i1])) + + print(" add in %.3f s" % (time.time() - t0)) + if args.indexfile: + print("storing", args.indexfile) + faiss.write_index(index, args.indexfile) + +if args.no_precomputed_tables: + if isinstance(index_ivf, faiss.IndexIVFPQ): + print("disabling precomputed table") + index_ivf.use_precomputed_table = -1 + index_ivf.precomputed_table.clear() + +if args.indexfile: + print("index size on disk: ", os.stat(args.indexfile).st_size) + +print("current RSS:", faiss.get_mem_usage_kb() * 1024) + +precomputed_table_size = 0 +if hasattr(index_ivf, 'precomputed_table'): + precomputed_table_size = index_ivf.precomputed_table.size() * 4 + +print("precomputed tables size:", precomputed_table_size) + + +############################################################# +# Index is ready +############################################################# + +xq = sanitize(xq) + +if args.searchthreads != -1: + print("Setting nb of threads to", args.searchthreads) + faiss.omp_set_num_threads(args.searchthreads) + + +ps = faiss.ParameterSpace() +ps.initialize(index) + + +parametersets = args.searchparams + +header = '%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' % "parameters" + + +def eval_setting(index, xq, gt, min_time): + nq = xq.shape[0] + ivf_stats = faiss.cvar.indexIVF_stats + ivf_stats.reset() + nrun = 0 + t0 = time.time() + while True: + D, I = index.search(xq, 100) + nrun += 1 + t1 = time.time() + if t1 - t0 > min_time: + break + ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun) + for rank in 1, 
10, 100: + n_ok = (I[:, :rank] == gt[:, :1]).sum() + print("%.4f" % (n_ok / float(nq)), end=' ') + print(" %8.3f " % ms_per_query, end=' ') + print("%12d " % (ivf_stats.ndis / nrun), end=' ') + print(nrun) + + +if parametersets == ['autotune']: + + ps.n_experiments = args.n_autotune + ps.min_test_duration = args.min_test_duration + + for kv in args.autotune_max: + k, vmax = kv.split(':') + vmax = float(vmax) + print("limiting %s to %g" % (k, vmax)) + pr = ps.add_range(k) + values = faiss.vector_to_array(pr.values) + values = np.array([v for v in values if v < vmax]) + faiss.copy_array_to_vector(values, pr.values) + + for kv in args.autotune_range: + k, vals = kv.split(':') + vals = np.fromstring(vals, sep=',') + print("setting %s to %s" % (k, vals)) + pr = ps.add_range(k) + faiss.copy_array_to_vector(vals, pr.values) + + # setup the Criterion object: optimize for 1-R@1 + crit = faiss.OneRecallAtRCriterion(nq, 1) + + # by default, the criterion will request only 1 NN + crit.nnn = 100 + crit.set_groundtruth(None, gt.astype('int64')) + + # then we let Faiss find the optimal parameters by itself + print("exploring operating points") + ps.display() + + t0 = time.time() + op = ps.explore(index, xq, crit) + print("Done in %.3f s, available OPs:" % (time.time() - t0)) + + op.display() + + print(header) + opv = op.optimal_pts + for i in range(opv.size()): + opt = opv.at(i) + + ps.set_index_parameters(index, opt.key) + + print("%-40s " % opt.key, end=' ') + sys.stdout.flush() + + eval_setting(index, xq, gt, args.min_test_duration) + +else: + print(header) + for param in parametersets: + print("%-40s " % param, end=' ') + sys.stdout.flush() + ps.set_index_parameters(index, param) + + eval_setting(index, xq, gt, args.min_test_duration) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_kmeans.py b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_kmeans.py new file mode 100644 index 0000000000..90cb4e83d9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/bench_kmeans.py @@ -0,0 +1,118 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python2 + +from __future__ import print_function +import os +import numpy as np +import faiss +import argparse +import datasets +from datasets import sanitize + +###################################################### +# Command-line parsing +###################################################### + +parser = argparse.ArgumentParser() + + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + +group = parser.add_argument_group('dataset options') + +aa('--db', default='deep1M', help='dataset') +aa('--nt', default=65536, type=int) +aa('--nb', default=100000, type=int) +aa('--nt_sample', default=0, type=int) + +group = parser.add_argument_group('kmeans options') +aa('--k', default=256, type=int) +aa('--seed', default=12345, type=int) +aa('--pcadim', default=-1, type=int, help='PCA to this dimension') +aa('--niter', default=25, type=int) +aa('--eval_freq', default=100, type=int) + + +args = parser.parse_args() + +print("args:", args) + +os.system('echo -n "nb processors "; ' + 'cat /proc/cpuinfo | grep ^processor | wc -l; ' + 'cat /proc/cpuinfo | grep ^"model name" | tail -1') + +ngpu = faiss.get_num_gpus() +print("nb GPUs:", ngpu) + +###################################################### +# Load dataset +###################################################### + +xt, xb, xq, gt = datasets.load_data(dataset=args.db) + + +if args.nt_sample == 0: + xt_pca = xt[args.nt:args.nt + 10000] + xt = xt[:args.nt] +else: + xt_pca = xt[args.nt_sample:args.nt_sample + 10000] + rs = np.random.RandomState(args.seed) + idx = rs.choice(args.nt_sample, size=args.nt, replace=False) + xt = xt[idx] + +xb = xb[:args.nb] + +d = xb.shape[1] + +if args.pcadim != -1: + print("training PCA: %d -> %d" % (d, args.pcadim)) + pca = faiss.PCAMatrix(d, args.pcadim) + pca.train(sanitize(xt_pca)) + xt = pca.apply_py(sanitize(xt)) + xb = pca.apply_py(sanitize(xb)) + d = xb.shape[1] + + +###################################################### +# Run clustering +###################################################### + + +index = faiss.IndexFlatL2(d) + +if ngpu > 0: + print("moving index to GPU") + index = faiss.index_cpu_to_all_gpus(index) + + +clustering = faiss.Clustering(d, args.k) + +clustering.verbose = True +clustering.seed = args.seed +clustering.max_points_per_centroid = 10**6 +clustering.min_points_per_centroid = 1 + + +for iter0 in range(0, args.niter, args.eval_freq): + iter1 = min(args.niter, iter0 + args.eval_freq) + clustering.niter = iter1 - iter0 + + if iter0 > 0: + faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids) + + clustering.train(sanitize(xt), index) + index.reset() + centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d) + index.add(centroids) + + _, I = index.search(sanitize(xb), 1) + + error = ((xb - centroids[I.ravel()]) ** 2).sum() + + print("iter1=%d quantization error on test: %.4f" % (iter1, error)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py new file mode 100644 index 0000000000..63377bc9a8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py @@ -0,0 +1,234 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +""" +Common functions to load datasets and compute their ground-truth +""" + +from __future__ import print_function +import time +import numpy as np +import faiss +import sys + +# set this to the directory that contains the datafiles. +# deep1b data should be at simdir + 'deep1b' +# bigann data should be at simdir + 'bigann' +simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/' + +################################################################# +# Small I/O functions +################################################################# + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +def ivecs_mmap(fname): + a = np.memmap(fname, dtype='int32', mode='r') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:] + + +def fvecs_mmap(fname): + return ivecs_mmap(fname).view('float32') + + +def bvecs_mmap(fname): + x = np.memmap(fname, dtype='uint8', mode='r') + d = x[:4].view('int32')[0] + return x.reshape(-1, d + 4)[:, 4:] + + +def ivecs_write(fname, m): + n, d = m.shape + m1 = np.empty((n, d + 1), dtype='int32') + m1[:, 0] = d + m1[:, 1:] = m + m1.tofile(fname) + + +def fvecs_write(fname, m): + m = m.astype('float32') + ivecs_write(fname, m.view('int32')) + + +################################################################# +# Dataset +################################################################# + +def sanitize(x): + return np.ascontiguousarray(x, dtype='float32') + + +class ResultHeap: + """ Combine query results from a sliced dataset """ + + def __init__(self, nq, k): + " nq: number of query vectors, k: number of results per query " + self.I = np.zeros((nq, k), dtype='int64') + self.D = np.zeros((nq, k), dtype='float32') + self.nq, self.k = nq, k + heaps = faiss.float_maxheap_array_t() + heaps.k = k + heaps.nh = nq + heaps.val = faiss.swig_ptr(self.D) + heaps.ids = faiss.swig_ptr(self.I) + heaps.heapify() + self.heaps = heaps + + def add_batch_result(self, D, I, i0): + assert D.shape == (self.nq, self.k) + assert I.shape == (self.nq, self.k) + I += i0 + self.heaps.addn_with_ids( + self.k, faiss.swig_ptr(D), + faiss.swig_ptr(I), self.k) + + def finalize(self): + self.heaps.reorder() + + +def compute_GT_sliced(xb, xq, k): + print("compute GT") + t0 = time.time() + nb, d = xb.shape + nq, d = xq.shape + rh = ResultHeap(nq, k) + bs = 10 ** 5 + + xqs = sanitize(xq) + + db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + + # compute ground-truth by blocks of bs, and add to heaps + for i0 in range(0, nb, bs): + i1 = min(nb, i0 + bs) + xsl = sanitize(xb[i0:i1]) + db_gt.add(xsl) + D, I = db_gt.search(xqs, k) + rh.add_batch_result(D, I, i0) + db_gt.reset() + print("\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), end=' ') + sys.stdout.flush() + print() + rh.finalize() + gt_I = rh.I + + print("GT time: %.3f s" % (time.time() - t0)) + return gt_I + + +def do_compute_gt(xb, xq, k): + print("computing GT") + nb, d = xb.shape + index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + if nb < 100 * 1000: + print(" add") + index.add(np.ascontiguousarray(xb, dtype='float32')) + print(" search") + D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k) + else: + I = compute_GT_sliced(xb, xq, k) + + return I.astype('int32') + + +def load_data(dataset='deep1M', compute_gt=False): + + print("load data", dataset) + + if dataset == 'sift1M': + basedir = simdir + 'sift1M/' + + xt = fvecs_read(basedir + "sift_learn.fvecs") + 
xb = fvecs_read(basedir + "sift_base.fvecs") + xq = fvecs_read(basedir + "sift_query.fvecs") + gt = ivecs_read(basedir + "sift_groundtruth.ivecs") + + elif dataset.startswith('bigann'): + basedir = simdir + 'bigann/' + + dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1]) + xb = bvecs_mmap(basedir + 'bigann_base.bvecs') + xq = bvecs_mmap(basedir + 'bigann_query.bvecs') + xt = bvecs_mmap(basedir + 'bigann_learn.bvecs') + # trim xb to correct size + xb = xb[:dbsize * 1000 * 1000] + gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize) + + elif dataset.startswith("deep"): + basedir = simdir + 'deep1b/' + szsuf = dataset[4:] + if szsuf[-1] == 'M': + dbsize = 10 ** 6 * int(szsuf[:-1]) + elif szsuf == '1B': + dbsize = 10 ** 9 + elif szsuf[-1] == 'k': + dbsize = 1000 * int(szsuf[:-1]) + else: + assert False, "did not recognize suffix " + szsuf + + xt = fvecs_mmap(basedir + "learn.fvecs") + xb = fvecs_mmap(basedir + "base.fvecs") + xq = fvecs_read(basedir + "deep1B_queries.fvecs") + + xb = xb[:dbsize] + + gt_fname = basedir + "%s_groundtruth.ivecs" % dataset + if compute_gt: + gt = do_compute_gt(xb, xq, 100) + print("store", gt_fname) + ivecs_write(gt_fname, gt) + + gt = ivecs_read(gt_fname) + + else: + assert False + + print("dataset %s sizes: B %s Q %s T %s" % ( + dataset, xb.shape, xq.shape, xt.shape)) + + return xt, xb, xq, gt + +################################################################# +# Evaluation +################################################################# + + +def evaluate_DI(D, I, gt): + nq = gt.shape[0] + k = I.shape[1] + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print("R@%d: %.4f" % (rank, recall), end=' ') + rank *= 10 + + +def evaluate(xq, gt, index, k=100, endl=True): + t0 = time.time() + D, I = index.search(xq, k) + t1 = time.time() + nq = xq.shape[0] + print("\t %8.4f ms per query, " % ( + (t1 - t0) * 1000.0 / nq), end=' ') + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print("R@%d: %.4f" % (rank, recall), end=' ') + rank *= 10 + if endl: + print() + return D, I diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/parse_bench_all_ivf.py b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/parse_bench_all_ivf.py new file mode 100644 index 0000000000..1a4d260ea5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/parse_bench_all_ivf.py @@ -0,0 +1,268 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +import os +import numpy as np +from matplotlib import pyplot + +import re + +from argparse import Namespace + + +# the directory used in run_on_cluster.bash +basedir = '/mnt/vol/gfsai-east/ai-group/users/matthijs/bench_all_ivf/' +logdir = basedir + 'logs/' + + +# which plot to output +db = 'bigann1B' +code_size = 8 + + + +def unitsize(indexkey): + """ size of one vector in the index """ + mo = re.match('.*,PQ(\\d+)', indexkey) + if mo: + return int(mo.group(1)) + if indexkey.endswith('SQ8'): + bits_per_d = 8 + elif indexkey.endswith('SQ4'): + bits_per_d = 4 + elif indexkey.endswith('SQfp16'): + bits_per_d = 16 + else: + assert False + mo = re.match('PCAR(\\d+),.*', indexkey) + if mo: + return bits_per_d * int(mo.group(1)) / 8 + mo = re.match('OPQ\\d+_(\\d+),.*', indexkey) + if mo: + return bits_per_d * int(mo.group(1)) / 8 + mo = re.match('RR(\\d+),.*', indexkey) + if mo: + return bits_per_d * int(mo.group(1)) / 8 + assert False + + +def dbsize_from_name(dbname): + sufs = { + '1B': 10**9, + '100M': 10**8, + '10M': 10**7, + '1M': 10**6, + } + for s in sufs: + if dbname.endswith(s): + return sufs[s] + else: + assert False + + +def keep_latest_stdout(fnames): + fnames = [fname for fname in fnames if fname.endswith('.stdout')] + fnames.sort() + n = len(fnames) + fnames2 = [] + for i, fname in enumerate(fnames): + if i + 1 < n and fnames[i + 1][:-8] == fname[:-8]: + continue + fnames2.append(fname) + return fnames2 + + +def parse_result_file(fname): + # print fname + st = 0 + res = [] + keys = [] + stats = {} + stats['run_version'] = fname[-8] + for l in open(fname): + if st == 0: + if l.startswith('CHRONOS_JOB_INSTANCE_ID'): + stats['CHRONOS_JOB_INSTANCE_ID'] = l.split()[-1] + if l.startswith('index size on disk:'): + stats['index_size'] = int(l.split()[-1]) + if l.startswith('current RSS:'): + stats['RSS'] = int(l.split()[-1]) + if l.startswith('precomputed tables size:'): + stats['tables_size'] = int(l.split()[-1]) + if l.startswith('Setting nb of threads to'): + stats['n_threads'] = int(l.split()[-1]) + if l.startswith(' add in'): + stats['add_time'] = float(l.split()[-2]) + if l.startswith('args:'): + args = eval(l[l.find(' '):]) + indexkey = args.indexkey + elif 'R@1 R@10 R@100' in l: + st = 1 + elif 'index size on disk:' in l: + index_size = int(l.split()[-1]) + elif st == 1: + st = 2 + elif st == 2: + fi = l.split() + keys.append(fi[0]) + res.append([float(x) for x in fi[1:]]) + return indexkey, np.array(res), keys, stats + +# run parsing +allres = {} +allstats = {} +nts = [] +missing = [] +versions = {} + +fnames = keep_latest_stdout(os.listdir(logdir)) +# print fnames +# filenames are in the form .x.stdout +# where x is a version number (from a to z) +# keep only latest version of each name + +for fname in fnames: + if not ('db' + db in fname and fname.endswith('.stdout')): + continue + indexkey, res, _, stats = parse_result_file(logdir + fname) + if res.size == 0: + missing.append(fname) + errorline = open( + logdir + fname.replace('.stdout', '.stderr')).readlines() + if len(errorline) > 0: + errorline = errorline[-1] + else: + errorline = 'NO STDERR' + print fname, stats['CHRONOS_JOB_INSTANCE_ID'], errorline + + else: + if indexkey in allres: + if allstats[indexkey]['run_version'] > stats['run_version']: + # don't use this run + continue + n_threads = stats.get('n_threads', 1) + nts.append(n_threads) + allres[indexkey] = res + allstats[indexkey] = stats + +assert len(set(nts)) == 1 +n_threads = nts[0] + + +def plot_tradeoffs(allres, code_size, recall_rank): + 
dbsize = dbsize_from_name(db) + recall_idx = int(np.log10(recall_rank)) + + bigtab = [] + names = [] + + for k,v in sorted(allres.items()): + if v.ndim != 2: continue + us = unitsize(k) + if us != code_size: continue + perf = v[:, recall_idx] + times = v[:, 3] + bigtab.append( + np.vstack(( + np.ones(times.size, dtype=int) * len(names), + perf, times + )) + ) + names.append(k) + + bigtab = np.hstack(bigtab) + + perm = np.argsort(bigtab[1, :]) + bigtab = bigtab[:, perm] + + times = np.minimum.accumulate(bigtab[2, ::-1])[::-1] + selection = np.where(bigtab[2, :] == times) + + selected_methods = [names[i] for i in + np.unique(bigtab[0, selection].astype(int))] + not_selected = list(set(names) - set(selected_methods)) + + print "methods without an optimal OP: ", not_selected + + nq = 10000 + pyplot.title('database ' + db + ' code_size=%d' % code_size) + + # grayed out lines + + for k in not_selected: + v = allres[k] + if v.ndim != 2: continue + us = unitsize(k) + if us != code_size: continue + + linestyle = (':' if 'PQ' in k else + '-.' if 'SQ4' in k else + '--' if 'SQ8' in k else '-') + + pyplot.semilogy(v[:, recall_idx], v[:, 3], label=None, + linestyle=linestyle, + marker='o' if 'HNSW' in k else '+', + color='#cccccc', linewidth=0.2) + + # important methods + for k in selected_methods: + v = allres[k] + if v.ndim != 2: continue + us = unitsize(k) + if us != code_size: continue + + stats = allstats[k] + tot_size = stats['index_size'] + stats['tables_size'] + id_size = 8 # 64 bit + + addt = '' + if 'add_time' in stats: + add_time = stats['add_time'] + if add_time > 7200: + add_min = add_time / 60 + addt = ', %dh%02d' % (add_min / 60, add_min % 60) + else: + add_sec = int(add_time) + addt = ', %dm%02d' % (add_sec / 60, add_sec % 60) + + + label = k + ' (size+%.1f%%%s)' % ( + tot_size / float((code_size + id_size) * dbsize) * 100 - 100, + addt) + + linestyle = (':' if 'PQ' in k else + '-.' if 'SQ4' in k else + '--' if 'SQ8' in k else '-') + + pyplot.semilogy(v[:, recall_idx], v[:, 3], label=label, + linestyle=linestyle, + marker='o' if 'HNSW' in k else '+') + + if len(not_selected) == 0: + om = '' + else: + om = '\nomitted:' + nc = len(om) + for m in not_selected: + if nc > 80: + om += '\n' + nc = 0 + om += ' ' + m + nc += len(m) + 1 + + pyplot.xlabel('1-recall at %d %s' % (recall_rank, om) ) + pyplot.ylabel('search time per query (ms, %d threads)' % n_threads) + pyplot.legend() + pyplot.grid() + pyplot.savefig('figs/tradeoffs_%s_cs%d_r%d.png' % ( + db, code_size, recall_rank)) + return selected_methods, not_selected + + +pyplot.gcf().set_size_inches(15, 10) + +plot_tradeoffs(allres, code_size=code_size, recall_rank=1) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/run_on_cluster_generic.bash b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/run_on_cluster_generic.bash new file mode 100644 index 0000000000..6d88f43d9a --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/run_on_cluster_generic.bash @@ -0,0 +1,249 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# @nolint + +# This script launches the experiments on a cluster +# It assumes two shell functions are defined: +# +# run_on_1machine: runs a command on one (full) machine on a cluster +# +# run_on_8gpu: runs a command on one machine with 8 GPUs +# +# the two functions are called as: +# +# run_on_1machine +# +# the stdout of the command should be stored in $logdir/.stdout + +function run_on_1machine () { + # To be implemented +} + +function run_on_8gpu () { + # To be implemented +} + + +# prepare output directories +# set to some directory where all indexes, can be written. +basedir=XXXXX + +logdir=$basedir/logs +indexdir=$basedir/indexes + +mkdir -p $lars $logdir $indexdir + + +############################### 1M experiments + +for db in sift1M deep1M bigann1M; do + + for coarse in IMI2x9 IMI2x10 IVF1024_HNSW32 IVF4096_HNSW32 IVF16384_HNSW32 + do + + for indexkey in \ + OPQ8_64,$coarse,PQ8 \ + PCAR16,$coarse,SQ4 \ + OPQ16_64,$coarse,PQ16 \ + PCAR32,$coarse,SQ4 \ + PCAR16,$coarse,SQ8 \ + OPQ32_128,$coarse,PQ32 \ + PCAR64,$coarse,SQ4 \ + PCAR32,$coarse,SQ8 \ + PCAR16,$coarse,SQfp16 \ + PCAR64,$coarse,SQ8 \ + PCAR32,$coarse,SQfp16 \ + PCAR128,$coarse,SQ4 + do + key=autotune.db$db.${indexkey//,/_} + run_on_1machine $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey $indexkey \ + --maxtrain 0 \ + --indexfile $indexdir/$key.faissindex + + done + done +done + + + +############################### 10M experiments + + +for db in deep10M bigann10M; do + + for coarse in \ + IMI2x10 IMI2x11 IMI2x12 IMI2x13 IVF4096_HNSW32 \ + IVF16384_HNSW32 IVF65536_HNSW32 IVF262144_HNSW32 + do + + for indexkey in \ + OPQ8_64,$coarse,PQ8 \ + PCAR16,$coarse,SQ4 \ + OPQ16_64,$coarse,PQ16 \ + PCAR32,$coarse,SQ4 \ + PCAR16,$coarse,SQ8 \ + OPQ32_128,$coarse,PQ32 \ + PCAR64,$coarse,SQ4 \ + PCAR32,$coarse,SQ8 \ + PCAR16,$coarse,SQfp16 \ + PCAR64,$coarse,SQ8 \ + PCAR32,$coarse,SQfp16 \ + PCAR128,$coarse,SQ4 \ + OPQ64_128,$coarse,PQ64 + do + key=autotune.db$db.${indexkey//,/_} + run_on_1machine $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey $indexkey \ + --maxtrain 0 \ + --indexfile $indexdir/$key.faissindex \ + --searchthreads 16 \ + --min_test_duration 3 \ + + done + done +done + + +############################### 100M experiments + +for db in deep100M bigann100M; do + + for coarse in IMI2x11 IMI2x12 IVF65536_HNSW32 IVF262144_HNSW32 + do + + for indexkey in \ + OPQ8_64,$coarse,PQ8 \ + OPQ16_64,$coarse,PQ16 \ + PCAR32,$coarse,SQ4 \ + OPQ32_128,$coarse,PQ32 \ + PCAR64,$coarse,SQ4 \ + PCAR32,$coarse,SQ8 \ + PCAR64,$coarse,SQ8 \ + PCAR32,$coarse,SQfp16 \ + PCAR128,$coarse,SQ4 \ + OPQ64_128,$coarse,PQ64 + do + key=autotune.db$db.${indexkey//,/_} + run_on_1machine $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey $indexkey \ + --maxtrain 0 \ + --indexfile $indexdir/$key.faissindex \ + --searchthreads 16 \ + --min_test_duration 3 \ + --add_bs 1000000 + + done + done +done + + +############################### 1B experiments + +for db in deep1B bigann1B; do + + for coarse in IMI2x12 IMI2x13 IVF262144_HNSW32 + do + + for indexkey in \ + OPQ8_64,$coarse,PQ8 \ + OPQ16_64,$coarse,PQ16 \ + PCAR32,$coarse,SQ4 \ + OPQ32_128,$coarse,PQ32 \ + PCAR64,$coarse,SQ4 \ + PCAR32,$coarse,SQ8 \ + PCAR64,$coarse,SQ8 \ + PCAR32,$coarse,SQfp16 \ + PCAR128,$coarse,SQ4 \ + PQ64_128,$coarse,PQ64 \ + RR128,$coarse,SQ4 + do + key=autotune.db$db.${indexkey//,/_} + run_on_1machine $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey $indexkey \ + --maxtrain 0 \ + --indexfile $indexdir/$key.faissindex \ + 
--searchthreads 16 \ + --min_test_duration 3 \ + --add_bs 1000000 + + done + done + +done + +############################################ +# precompute centroids on GPU for large vocabularies + + +for db in deep1M bigann1M; do + + for ncent in 1048576 4194304; do + + key=clustering.db$db.IVF$ncent + run_on_8gpu $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey IVF$ncent,SQ8 \ + --maxtrain 100000000 \ + --indexfile $indexdir/$key.faissindex \ + --searchthreads 16 \ + --min_test_duration 3 \ + --add_bs 1000000 \ + --train_on_gpu + + done +done + + +################################# +# Run actual experiment + +for db in deep1B bigann1B; do + + for ncent in 1048576 4194304; do + coarse=IVF${ncent}_HNSW32 + centroidsname=clustering.db${db/1B/1M}.IVF${ncent}.faissindex + + for indexkey in \ + OPQ8_64,$coarse,PQ8 \ + OPQ16_64,$coarse,PQ16 \ + PCAR32,$coarse,SQ4 \ + OPQ32_128,$coarse,PQ32 \ + PCAR64,$coarse,SQ4 \ + PCAR32,$coarse,SQ8 \ + PCAR64,$coarse,SQ8 \ + PCAR32,$coarse,SQfp16 \ + OPQ64_128,$coarse,PQ64 \ + RR128,$coarse,SQ4 \ + OPQ64_128,$coarse,PQ64 \ + RR128,$coarse,SQ4 + do + key=autotune.db$db.${indexkey//,/_} + + run_on_1machine $key.c $key \ + python -u bench_all_ivf.py \ + --db $db \ + --indexkey $indexkey \ + --maxtrain 256000 \ + --indexfile $indexdir/$key.faissindex \ + --get_centroids_from $indexdir/$centroidsname \ + --searchthreads 16 \ + --min_test_duration 3 \ + --add_bs 1000000 + + done + done + +done diff --git a/core/src/index/thirdparty/faiss/benchs/bench_for_interrupt.py b/core/src/index/thirdparty/faiss/benchs/bench_for_interrupt.py new file mode 100644 index 0000000000..b72d825ef9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_for_interrupt.py @@ -0,0 +1,155 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python3 + +from __future__ import print_function +import numpy as np +import faiss +import time +import os +import argparse + + +parser = argparse.ArgumentParser() + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + +group = parser.add_argument_group('dataset options') +aa('--dim', type=int, default=64) +aa('--nb', type=int, default=int(1e6)) +aa('--subset_len', type=int, default=int(1e5)) +aa('--key', default='IVF1000,Flat') +aa('--nprobe', type=int, default=640) +aa('--no_intcallback', default=False, action='store_true') +aa('--twostage', default=False, action='store_true') +aa('--nt', type=int, default=-1) + + +args = parser.parse_args() +print("args:", args) + + +d = args.dim # dimension +nb = args.nb # database size +nq = 1000 # nb of queries +nt = 100000 +subset_len = args.subset_len + + +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xq = np.random.random((nq, d)).astype('float32') +xt = np.random.random((nt, d)).astype('float32') +k = 100 + +if args.no_intcallback: + faiss.InterruptCallback.clear_instance() + +if args.nt != -1: + faiss.omp_set_num_threads(args.nt) + +nprobe = args.nprobe +key = args.key +#key = 'IVF1000,Flat' +# key = 'IVF1000,PQ64' +# key = 'IVF100_HNSW32,PQ64' + +# faiss.omp_set_num_threads(1) + +pf = 'dim%d_' % d +if d == 64: + pf = '' + +basename = '/tmp/base%s%s.index' % (pf, key) + +if os.path.exists(basename): + print('load', basename) + index_1 = faiss.read_index(basename) +else: + print('train + write', basename) + index_1 = faiss.index_factory(d, key) + index_1.train(xt) + faiss.write_index(index_1, basename) + +print('add') +index_1.add(xb) + +print('set nprobe=', nprobe) +faiss.ParameterSpace().set_index_parameter(index_1, 'nprobe', nprobe) + +class ResultHeap: + """ Combine query results from a sliced dataset """ + + def __init__(self, nq, k): + " nq: number of query vectors, k: number of results per query " + self.I = np.zeros((nq, k), dtype='int64') + self.D = np.zeros((nq, k), dtype='float32') + self.nq, self.k = nq, k + heaps = faiss.float_maxheap_array_t() + heaps.k = k + heaps.nh = nq + heaps.val = faiss.swig_ptr(self.D) + heaps.ids = faiss.swig_ptr(self.I) + heaps.heapify() + self.heaps = heaps + + def add_batch_result(self, D, I, i0): + assert D.shape == (self.nq, self.k) + assert I.shape == (self.nq, self.k) + I += i0 + self.heaps.addn_with_ids( + self.k, faiss.swig_ptr(D), + faiss.swig_ptr(I), self.k) + + def finalize(self): + self.heaps.reorder() + +stats = faiss.cvar.indexIVF_stats +stats.reset() + +print('index size', index_1.ntotal, + 'imbalance', index_1.invlists.imbalance_factor()) +start = time.time() +Dref, Iref = index_1.search(xq, k) +print('time of searching: %.3f s = %.3f + %.3f ms' % ( + time.time() - start, stats.quantization_time, stats.search_time)) + +indexes = {} +if args.twostage: + + for i in range(0, nb, subset_len): + index = faiss.read_index(basename) + faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe) + print("add %d:%d" %(i, i+subset_len)) + index.add(xb[i:i + subset_len]) + indexes[i] = index + +rh = ResultHeap(nq, k) +sum_time = tq = ts = 0 +for i in range(0, nb, subset_len): + if not args.twostage: + index = faiss.read_index(basename) + faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe) + print("add %d:%d" %(i, i+subset_len)) + index.add(xb[i:i + subset_len]) + else: + index = indexes[i] + + stats.reset() + start = time.time() + Di, Ii = index.search(xq, k) + sum_time = sum_time + time.time() - start + tq += 
stats.quantization_time + ts += stats.search_time + rh.add_batch_result(Di, Ii, i) + +print('time of searching separately: %.3f s = %.3f + %.3f ms' % + (sum_time, tq, ts)) + +rh.finalize() + +print('diffs: %d / %d' % ((Iref != rh.I).sum(), Iref.size)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py b/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py new file mode 100644 index 0000000000..f404605a22 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py @@ -0,0 +1,745 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +from __future__ import print_function +import numpy as np +import time +import os +import sys +import faiss +import re + +from multiprocessing.dummy import Pool as ThreadPool +from datasets import ivecs_read + +#################################################################### +# Parse command line +#################################################################### + + +def usage(): + print(""" + +Usage: bench_gpu_1bn.py dataset indextype [options] + +dataset: set of vectors to operate on. + Supported: SIFT1M, SIFT2M, ..., SIFT1000M or Deep1B + +indextype: any index type supported by index_factory that runs on GPU. + + General options + +-ngpu ngpu nb of GPUs to use (default = all) +-tempmem N use N bytes of temporary GPU memory +-nocache do not read or write intermediate files +-float16 use 16-bit floats on the GPU side + + Add options + +-abs N split adds in blocks of no more than N vectors +-max_add N copy sharded dataset to CPU each max_add additions + (to avoid memory overflows with geometric reallocations) +-altadd Alternative add function, where the index is not stored + on GPU during add. Slightly faster for big datasets on + slow GPUs + + Search options + +-R R: nb of replicas of the same dataset (the dataset + will be copied across ngpu/R, default R=1) +-noptables do not use precomputed tables in IVFPQ. 
+-qbs N split queries in blocks of no more than N vectors +-nnn N search N neighbors for each query +-nprobe 4,16,64 try this number of probes +-knngraph instead of the standard setup for the dataset, + compute a k-nn graph with nnn neighbors per element +-oI xx%d.npy output the search result indices to this numpy file, + %d will be replaced with the nprobe +-oD xx%d.npy output the search result distances to this file + +""", file=sys.stderr) + sys.exit(1) + + +# default values + +dbname = None +index_key = None + +ngpu = faiss.get_num_gpus() + +replicas = 1 # nb of replicas of sharded dataset +add_batch_size = 32768 +query_batch_size = 16384 +nprobes = [1 << l for l in range(9)] +knngraph = False +use_precomputed_tables = True +tempmem = -1 # if -1, use system default +max_add = -1 +use_float16 = False +use_cache = True +nnn = 10 +altadd = False +I_fname = None +D_fname = None + +args = sys.argv[1:] + +while args: + a = args.pop(0) + if a == '-h': usage() + elif a == '-ngpu': ngpu = int(args.pop(0)) + elif a == '-R': replicas = int(args.pop(0)) + elif a == '-noptables': use_precomputed_tables = False + elif a == '-abs': add_batch_size = int(args.pop(0)) + elif a == '-qbs': query_batch_size = int(args.pop(0)) + elif a == '-nnn': nnn = int(args.pop(0)) + elif a == '-tempmem': tempmem = int(args.pop(0)) + elif a == '-nocache': use_cache = False + elif a == '-knngraph': knngraph = True + elif a == '-altadd': altadd = True + elif a == '-float16': use_float16 = True + elif a == '-nprobe': nprobes = [int(x) for x in args.pop(0).split(',')] + elif a == '-max_add': max_add = int(args.pop(0)) + elif not dbname: dbname = a + elif not index_key: index_key = a + else: + print("argument %s unknown" % a, file=sys.stderr) + sys.exit(1) + +cacheroot = '/tmp/bench_gpu_1bn' + +if not os.path.isdir(cacheroot): + print("%s does not exist, creating it" % cacheroot) + os.mkdir(cacheroot) + +################################################################# +# Small Utility Functions +################################################################# + +# we mem-map the biggest files to avoid having them in memory all at +# once + +def mmap_fvecs(fname): + x = np.memmap(fname, dtype='int32', mode='r') + d = x[0] + return x.view('float32').reshape(-1, d + 1)[:, 1:] + +def mmap_bvecs(fname): + x = np.memmap(fname, dtype='uint8', mode='r') + d = x[:4].view('int32')[0] + return x.reshape(-1, d + 4)[:, 4:] + + +def rate_limited_imap(f, l): + """A threaded imap that does not produce elements faster than they + are consumed""" + pool = ThreadPool(1) + res = None + for i in l: + res_next = pool.apply_async(f, (i, )) + if res: + yield res.get() + res = res_next + yield res.get() + + +class IdentPreproc: + """a pre-processor is either a faiss.VectorTransform or an IndentPreproc""" + + def __init__(self, d): + self.d_in = self.d_out = d + + def apply_py(self, x): + return x + + +def sanitize(x): + """ convert array to a c-contiguous float array """ + return np.ascontiguousarray(x.astype('float32')) + + +def dataset_iterator(x, preproc, bs): + """ iterate over the lines of x in blocks of size bs""" + + nb = x.shape[0] + block_ranges = [(i0, min(nb, i0 + bs)) + for i0 in range(0, nb, bs)] + + def prepare_block((i0, i1)): + xb = sanitize(x[i0:i1]) + return i0, preproc.apply_py(xb) + + return rate_limited_imap(prepare_block, block_ranges) + + +def eval_intersection_measure(gt_I, I): + """ measure intersection measure (used for knngraph)""" + inter = 0 + rank = I.shape[1] + assert gt_I.shape[1] >= rank + for q in range(nq_gt): + 
inter += faiss.ranklist_intersection_size( + rank, faiss.swig_ptr(gt_I[q, :]), + rank, faiss.swig_ptr(I[q, :].astype('int64'))) + return inter / float(rank * nq_gt) + + +################################################################# +# Prepare dataset +################################################################# + +print("Preparing dataset", dbname) + +if dbname.startswith('SIFT'): + # SIFT1M to SIFT1000M + dbsize = int(dbname[4:-1]) + xb = mmap_bvecs('bigann/bigann_base.bvecs') + xq = mmap_bvecs('bigann/bigann_query.bvecs') + xt = mmap_bvecs('bigann/bigann_learn.bvecs') + + # trim xb to correct size + xb = xb[:dbsize * 1000 * 1000] + + gt_I = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize) + +elif dbname == 'Deep1B': + xb = mmap_fvecs('deep1b/base.fvecs') + xq = mmap_fvecs('deep1b/deep1B_queries.fvecs') + xt = mmap_fvecs('deep1b/learn.fvecs') + # deep1B's train is is outrageously big + xt = xt[:10 * 1000 * 1000] + gt_I = ivecs_read('deep1b/deep1B_groundtruth.ivecs') + +else: + print('unknown dataset', dbname, file=sys.stderr) + sys.exit(1) + + +if knngraph: + # convert to knn-graph dataset + xq = xb + xt = xb + # we compute the ground-truth on this number of queries for validation + nq_gt = 10000 + gt_sl = 100 + + # ground truth will be computed below + gt_I = None + + +print("sizes: B %s Q %s T %s gt %s" % ( + xb.shape, xq.shape, xt.shape, + gt_I.shape if gt_I is not None else None)) + + + +################################################################# +# Parse index_key and set cache files +# +# The index_key is a valid factory key that would work, but we +# decompose the training to do it faster +################################################################# + + +pat = re.compile('(OPQ[0-9]+(_[0-9]+)?,|PCAR[0-9]+,)?' + + '(IVF[0-9]+),' + + '(PQ[0-9]+|Flat)') + +matchobject = pat.match(index_key) + +assert matchobject, 'could not parse ' + index_key + +mog = matchobject.groups() + +preproc_str = mog[0] +ivf_str = mog[2] +pqflat_str = mog[3] + +ncent = int(ivf_str[3:]) + +prefix = '' + +if knngraph: + gt_cachefile = '%s/BK_gt_%s.npy' % (cacheroot, dbname) + prefix = 'BK_' + # files must be kept distinct because the training set is not the + # same for the knngraph + +if preproc_str: + preproc_cachefile = '%s/%spreproc_%s_%s.vectrans' % ( + cacheroot, prefix, dbname, preproc_str[:-1]) +else: + preproc_cachefile = None + preproc_str = '' + +cent_cachefile = '%s/%scent_%s_%s%s.npy' % ( + cacheroot, prefix, dbname, preproc_str, ivf_str) + +index_cachefile = '%s/%s%s_%s%s,%s.index' % ( + cacheroot, prefix, dbname, preproc_str, ivf_str, pqflat_str) + + +if not use_cache: + preproc_cachefile = None + cent_cachefile = None + index_cachefile = None + +print("cachefiles:") +print(preproc_cachefile) +print(cent_cachefile) +print(index_cachefile) + + +################################################################# +# Wake up GPUs +################################################################# + +print("preparing resources for %d GPUs" % ngpu) + +gpu_resources = [] + +for i in range(ngpu): + res = faiss.StandardGpuResources() + if tempmem >= 0: + res.setTempMemory(tempmem) + gpu_resources.append(res) + + +def make_vres_vdev(i0=0, i1=-1): + " return vectors of device ids and resources useful for gpu_multiple" + vres = faiss.GpuResourcesVector() + vdev = faiss.IntVector() + if i1 == -1: + i1 = ngpu + for i in range(i0, i1): + vdev.push_back(i) + vres.push_back(gpu_resources[i]) + return vres, vdev + + +################################################################# +# Prepare 
ground truth (for the knngraph) +################################################################# + + +def compute_GT(): + print("compute GT") + t0 = time.time() + + gt_I = np.zeros((nq_gt, gt_sl), dtype='int64') + gt_D = np.zeros((nq_gt, gt_sl), dtype='float32') + heaps = faiss.float_maxheap_array_t() + heaps.k = gt_sl + heaps.nh = nq_gt + heaps.val = faiss.swig_ptr(gt_D) + heaps.ids = faiss.swig_ptr(gt_I) + heaps.heapify() + bs = 10 ** 5 + + n, d = xb.shape + xqs = sanitize(xq[:nq_gt]) + + db_gt = faiss.IndexFlatL2(d) + vres, vdev = make_vres_vdev() + db_gt_gpu = faiss.index_cpu_to_gpu_multiple( + vres, vdev, db_gt) + + # compute ground-truth by blocks of bs, and add to heaps + for i0, xsl in dataset_iterator(xb, IdentPreproc(d), bs): + db_gt_gpu.add(xsl) + D, I = db_gt_gpu.search(xqs, gt_sl) + I += i0 + heaps.addn_with_ids( + gt_sl, faiss.swig_ptr(D), faiss.swig_ptr(I), gt_sl) + db_gt_gpu.reset() + print("\r %d/%d, %.3f s" % (i0, n, time.time() - t0), end=' ') + print() + heaps.reorder() + + print("GT time: %.3f s" % (time.time() - t0)) + return gt_I + + +if knngraph: + + if gt_cachefile and os.path.exists(gt_cachefile): + print("load GT", gt_cachefile) + gt_I = np.load(gt_cachefile) + else: + gt_I = compute_GT() + if gt_cachefile: + print("store GT", gt_cachefile) + np.save(gt_cachefile, gt_I) + +################################################################# +# Prepare the vector transformation object (pure CPU) +################################################################# + + +def train_preprocessor(): + print("train preproc", preproc_str) + d = xt.shape[1] + t0 = time.time() + if preproc_str.startswith('OPQ'): + fi = preproc_str[3:-1].split('_') + m = int(fi[0]) + dout = int(fi[1]) if len(fi) == 2 else d + preproc = faiss.OPQMatrix(d, m, dout) + elif preproc_str.startswith('PCAR'): + dout = int(preproc_str[4:-1]) + preproc = faiss.PCAMatrix(d, dout, 0, True) + else: + assert False + preproc.train(sanitize(xt[:1000000])) + print("preproc train done in %.3f s" % (time.time() - t0)) + return preproc + + +def get_preprocessor(): + if preproc_str: + if not preproc_cachefile or not os.path.exists(preproc_cachefile): + preproc = train_preprocessor() + if preproc_cachefile: + print("store", preproc_cachefile) + faiss.write_VectorTransform(preproc, preproc_cachefile) + else: + print("load", preproc_cachefile) + preproc = faiss.read_VectorTransform(preproc_cachefile) + else: + d = xb.shape[1] + preproc = IdentPreproc(d) + return preproc + + +################################################################# +# Prepare the coarse quantizer +################################################################# + + +def train_coarse_quantizer(x, k, preproc): + d = preproc.d_out + clus = faiss.Clustering(d, k) + clus.verbose = True + # clus.niter = 2 + clus.max_points_per_centroid = 10000000 + + print("apply preproc on shape", x.shape, 'k=', k) + t0 = time.time() + x = preproc.apply_py(sanitize(x)) + print(" preproc %.3f s output shape %s" % ( + time.time() - t0, x.shape)) + + vres, vdev = make_vres_vdev() + index = faiss.index_cpu_to_gpu_multiple( + vres, vdev, faiss.IndexFlatL2(d)) + + clus.train(x, index) + centroids = faiss.vector_float_to_array(clus.centroids) + + return centroids.reshape(k, d) + + +def prepare_coarse_quantizer(preproc): + + if cent_cachefile and os.path.exists(cent_cachefile): + print("load centroids", cent_cachefile) + centroids = np.load(cent_cachefile) + else: + nt = max(1000000, 256 * ncent) + print("train coarse quantizer...") + t0 = time.time() + centroids = 
train_coarse_quantizer(xt[:nt], ncent, preproc) + print("Coarse train time: %.3f s" % (time.time() - t0)) + if cent_cachefile: + print("store centroids", cent_cachefile) + np.save(cent_cachefile, centroids) + + coarse_quantizer = faiss.IndexFlatL2(preproc.d_out) + coarse_quantizer.add(centroids) + + return coarse_quantizer + + +################################################################# +# Make index and add elements to it +################################################################# + + +def prepare_trained_index(preproc): + + coarse_quantizer = prepare_coarse_quantizer(preproc) + d = preproc.d_out + if pqflat_str == 'Flat': + print("making an IVFFlat index") + idx_model = faiss.IndexIVFFlat(coarse_quantizer, d, ncent, + faiss.METRIC_L2) + else: + m = int(pqflat_str[2:]) + assert m < 56 or use_float16, "PQ%d will work only with -float16" % m + print("making an IVFPQ index, m = ", m) + idx_model = faiss.IndexIVFPQ(coarse_quantizer, d, ncent, m, 8) + + coarse_quantizer.this.disown() + idx_model.own_fields = True + + # finish training on CPU + t0 = time.time() + print("Training vector codes") + x = preproc.apply_py(sanitize(xt[:1000000])) + idx_model.train(x) + print(" done %.3f s" % (time.time() - t0)) + + return idx_model + + +def compute_populated_index(preproc): + """Add elements to a sharded index. Return the index and if available + a sharded gpu_index that contains the same data. """ + + indexall = prepare_trained_index(preproc) + + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = use_float16 + co.useFloat16CoarseQuantizer = False + co.usePrecomputed = use_precomputed_tables + co.indicesOptions = faiss.INDICES_CPU + co.verbose = True + co.reserveVecs = max_add if max_add > 0 else xb.shape[0] + co.shard = True + assert co.shard_type in (0, 1, 2) + vres, vdev = make_vres_vdev() + gpu_index = faiss.index_cpu_to_gpu_multiple( + vres, vdev, indexall, co) + + print("add...") + t0 = time.time() + nb = xb.shape[0] + for i0, xs in dataset_iterator(xb, preproc, add_batch_size): + i1 = i0 + xs.shape[0] + gpu_index.add_with_ids(xs, np.arange(i0, i1)) + if max_add > 0 and gpu_index.ntotal > max_add: + print("Flush indexes to CPU") + for i in range(ngpu): + index_src_gpu = faiss.downcast_index(gpu_index.at(i)) + index_src = faiss.index_gpu_to_cpu(index_src_gpu) + print(" index %d size %d" % (i, index_src.ntotal)) + index_src.copy_subset_to(indexall, 0, 0, nb) + index_src_gpu.reset() + index_src_gpu.reserveMemory(max_add) + gpu_index.sync_with_shard_indexes() + + print('\r%d/%d (%.3f s) ' % ( + i0, nb, time.time() - t0), end=' ') + sys.stdout.flush() + print("Add time: %.3f s" % (time.time() - t0)) + + print("Aggregate indexes to CPU") + t0 = time.time() + + if hasattr(gpu_index, 'at'): + # it is a sharded index + for i in range(ngpu): + index_src = faiss.index_gpu_to_cpu(gpu_index.at(i)) + print(" index %d size %d" % (i, index_src.ntotal)) + index_src.copy_subset_to(indexall, 0, 0, nb) + else: + # simple index + index_src = faiss.index_gpu_to_cpu(gpu_index) + index_src.copy_subset_to(indexall, 0, 0, nb) + + print(" done in %.3f s" % (time.time() - t0)) + + if max_add > 0: + # it does not contain all the vectors + gpu_index = None + + return gpu_index, indexall + +def compute_populated_index_2(preproc): + + indexall = prepare_trained_index(preproc) + + # set up a 3-stage pipeline that does: + # - stage 1: load + preproc + # - stage 2: assign on GPU + # - stage 3: add to index + + stage1 = dataset_iterator(xb, preproc, add_batch_size) + + vres, vdev = make_vres_vdev() + 
coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple( + vres, vdev, indexall.quantizer) + + def quantize((i0, xs)): + _, assign = coarse_quantizer_gpu.search(xs, 1) + return i0, xs, assign.ravel() + + stage2 = rate_limited_imap(quantize, stage1) + + print("add...") + t0 = time.time() + nb = xb.shape[0] + + for i0, xs, assign in stage2: + i1 = i0 + xs.shape[0] + if indexall.__class__ == faiss.IndexIVFPQ: + indexall.add_core_o(i1 - i0, faiss.swig_ptr(xs), + None, None, faiss.swig_ptr(assign)) + elif indexall.__class__ == faiss.IndexIVFFlat: + indexall.add_core(i1 - i0, faiss.swig_ptr(xs), None, + faiss.swig_ptr(assign)) + else: + assert False + + print('\r%d/%d (%.3f s) ' % ( + i0, nb, time.time() - t0), end=' ') + sys.stdout.flush() + print("Add time: %.3f s" % (time.time() - t0)) + + return None, indexall + + + +def get_populated_index(preproc): + + if not index_cachefile or not os.path.exists(index_cachefile): + if not altadd: + gpu_index, indexall = compute_populated_index(preproc) + else: + gpu_index, indexall = compute_populated_index_2(preproc) + if index_cachefile: + print("store", index_cachefile) + faiss.write_index(indexall, index_cachefile) + else: + print("load", index_cachefile) + indexall = faiss.read_index(index_cachefile) + gpu_index = None + + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = use_float16 + co.useFloat16CoarseQuantizer = False + co.usePrecomputed = use_precomputed_tables + co.indicesOptions = 0 + co.verbose = True + co.shard = True # the replicas will be made "manually" + t0 = time.time() + print("CPU index contains %d vectors, move to GPU" % indexall.ntotal) + if replicas == 1: + + if not gpu_index: + print("copying loaded index to GPUs") + vres, vdev = make_vres_vdev() + index = faiss.index_cpu_to_gpu_multiple( + vres, vdev, indexall, co) + else: + index = gpu_index + + else: + del gpu_index # We override the GPU index + + print("Copy CPU index to %d sharded GPU indexes" % replicas) + + index = faiss.IndexReplicas() + + for i in range(replicas): + gpu0 = ngpu * i / replicas + gpu1 = ngpu * (i + 1) / replicas + vres, vdev = make_vres_vdev(gpu0, gpu1) + + print(" dispatch to GPUs %d:%d" % (gpu0, gpu1)) + + index1 = faiss.index_cpu_to_gpu_multiple( + vres, vdev, indexall, co) + index1.this.disown() + index.addIndex(index1) + index.own_fields = True + del indexall + print("move to GPU done in %.3f s" % (time.time() - t0)) + return index + + + +################################################################# +# Perform search +################################################################# + + +def eval_dataset(index, preproc): + + ps = faiss.GpuParameterSpace() + ps.initialize(index) + + nq_gt = gt_I.shape[0] + print("search...") + sl = query_batch_size + nq = xq.shape[0] + for nprobe in nprobes: + ps.set_index_parameter(index, 'nprobe', nprobe) + t0 = time.time() + + if sl == 0: + D, I = index.search(preproc.apply_py(sanitize(xq)), nnn) + else: + I = np.empty((nq, nnn), dtype='int32') + D = np.empty((nq, nnn), dtype='float32') + + inter_res = '' + + for i0, xs in dataset_iterator(xq, preproc, sl): + print('\r%d/%d (%.3f s%s) ' % ( + i0, nq, time.time() - t0, inter_res), end=' ') + sys.stdout.flush() + + i1 = i0 + xs.shape[0] + Di, Ii = index.search(xs, nnn) + + I[i0:i1] = Ii + D[i0:i1] = Di + + if knngraph and not inter_res and i1 >= nq_gt: + ires = eval_intersection_measure( + gt_I[:, :nnn], I[:nq_gt]) + inter_res = ', %.4f' % ires + + t1 = time.time() + if knngraph: + ires = eval_intersection_measure(gt_I[:, :nnn], I[:nq_gt]) + print(" 
probe=%-3d: %.3f s rank-%d intersection results: %.4f" % ( + nprobe, t1 - t0, nnn, ires)) + else: + print(" probe=%-3d: %.3f s" % (nprobe, t1 - t0), end=' ') + gtc = gt_I[:, :1] + nq = xq.shape[0] + for rank in 1, 10, 100: + if rank > nnn: continue + nok = (I[:, :rank] == gtc).sum() + print("1-R@%d: %.4f" % (rank, nok / float(nq)), end=' ') + print() + if I_fname: + I_fname_i = I_fname % I + print("storing", I_fname_i) + np.save(I, I_fname_i) + if D_fname: + D_fname_i = I_fname % I + print("storing", D_fname_i) + np.save(D, D_fname_i) + + +################################################################# +# Driver +################################################################# + + +preproc = get_preprocessor() + +index = get_populated_index(preproc) + +eval_dataset(index, preproc) + +# make sure index is deleted before the resources +del index diff --git a/core/src/index/thirdparty/faiss/benchs/bench_gpu_sift1m.py b/core/src/index/thirdparty/faiss/benchs/bench_gpu_sift1m.py new file mode 100644 index 0000000000..76c312b5c5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_gpu_sift1m.py @@ -0,0 +1,94 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python2 + +from __future__ import print_function +import os +import time +import numpy as np +import pdb + +import faiss +from datasets import load_sift1M, evaluate + + +print("load data") + +xb, xq, xt, gt = load_sift1M() +nq, d = xq.shape + +# we need only a StandardGpuResources per GPU +res = faiss.StandardGpuResources() + + +################################################################# +# Exact search experiment +################################################################# + +print("============ Exact search") + +flat_config = faiss.GpuIndexFlatConfig() +flat_config.device = 0 + +index = faiss.GpuIndexFlatL2(res, d, flat_config) + +print("add vectors to index") + +index.add(xb) + +print("warmup") + +index.search(xq, 123) + +print("benchmark") + +for lk in range(11): + k = 1 << lk + t, r = evaluate(index, xq, gt, k) + + # the recall should be 1 at all times + print("k=%d %.3f ms, R@1 %.4f" % (k, t, r[1])) + + +################################################################# +# Approximate search experiment +################################################################# + +print("============ Approximate search") + +index = faiss.index_factory(d, "IVF4096,PQ64") + +# faster, uses more memory +# index = faiss.index_factory(d, "IVF16384,Flat") + +co = faiss.GpuClonerOptions() + +# here we are using a 64-byte PQ, so we must set the lookup tables to +# 16 bit float (this is due to the limited temporary memory). +co.useFloat16 = True + +index = faiss.index_cpu_to_gpu(res, 0, index, co) + +print("train") + +index.train(xt) + +print("add vectors to index") + +index.add(xb) + +print("warmup") + +index.search(xq, 123) + +print("benchmark") + +for lnprobe in range(10): + nprobe = 1 << lnprobe + index.setNumProbes(nprobe) + t, r = evaluate(index, xq, gt, 100) + + print("nprobe=%4d %.3f ms recalls= %.4f %.4f %.4f" % (nprobe, t, r[1], r[10], r[100])) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_hnsw.py b/core/src/index/thirdparty/faiss/benchs/bench_hnsw.py new file mode 100644 index 0000000000..dea13da8c2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_hnsw.py @@ -0,0 +1,158 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python2 + +from __future__ import print_function +import time +import sys +import numpy as np +import faiss +from datasets import load_sift1M + + +k = int(sys.argv[1]) +todo = sys.argv[1:] + +print("load data") +xb, xq, xt, gt = load_sift1M() +nq, d = xq.shape + +if todo == []: + todo = 'hnsw hnsw_sq ivf ivf_hnsw_quantizer kmeans kmeans_hnsw'.split() + + +def evaluate(index): + # for timing with a single core + # faiss.omp_set_num_threads(1) + + t0 = time.time() + D, I = index.search(xq, k) + t1 = time.time() + + missing_rate = (I == -1).sum() / float(k * nq) + recall_at_1 = (I == gt[:, :1]).sum() / float(nq) + print("\t %7.3f ms per query, R@1 %.4f, missing rate %.4f" % ( + (t1 - t0) * 1000.0 / nq, recall_at_1, missing_rate)) + + +if 'hnsw' in todo: + + print("Testing HNSW Flat") + + index = faiss.IndexHNSWFlat(d, 32) + + # training is not needed + + # this is the default, higher is more accurate and slower to + # construct + index.hnsw.efConstruction = 40 + + print("add") + # to see progress + index.verbose = True + index.add(xb) + + print("search") + for efSearch in 16, 32, 64, 128, 256: + for bounded_queue in [True, False]: + print("efSearch", efSearch, "bounded queue", bounded_queue, end=' ') + index.hnsw.search_bounded_queue = bounded_queue + index.hnsw.efSearch = efSearch + evaluate(index) + +if 'hnsw_sq' in todo: + + print("Testing HNSW with a scalar quantizer") + # also set M so that the vectors and links both use 128 bytes per + # entry (total 256 bytes) + index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 16) + + print("training") + # training for the scalar quantizer + index.train(xt) + + # this is the default, higher is more accurate and slower to + # construct + index.hnsw.efConstruction = 40 + + print("add") + # to see progress + index.verbose = True + index.add(xb) + + print("search") + for efSearch in 16, 32, 64, 128, 256: + print("efSearch", efSearch, end=' ') + index.hnsw.efSearch = efSearch + evaluate(index) + +if 'ivf' in todo: + + print("Testing IVF Flat (baseline)") + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFFlat(quantizer, d, 16384) + index.cp.min_points_per_centroid = 5 # quiet warning + + # to see progress + index.verbose = True + + print("training") + index.train(xt) + + print("add") + index.add(xb) + + print("search") + for nprobe in 1, 4, 16, 64, 256: + print("nprobe", nprobe, end=' ') + index.nprobe = nprobe + evaluate(index) + +if 'ivf_hnsw_quantizer' in todo: + + print("Testing IVF Flat with HNSW quantizer") + quantizer = faiss.IndexHNSWFlat(d, 32) + index = faiss.IndexIVFFlat(quantizer, d, 16384) + index.cp.min_points_per_centroid = 5 # quiet warning + index.quantizer_trains_alone = 2 + + # to see progress + index.verbose = True + + print("training") + index.train(xt) + + print("add") + index.add(xb) + + print("search") + quantizer.hnsw.efSearch = 64 + for nprobe in 1, 4, 16, 64, 256: + print("nprobe", nprobe, end=' ') + index.nprobe = nprobe + evaluate(index) + +# Bonus: 2 kmeans tests + +if 'kmeans' in todo: + print("Performing kmeans on sift1M database vectors (baseline)") + clus = faiss.Clustering(d, 16384) + clus.verbose = True + clus.niter = 10 + index = faiss.IndexFlatL2(d) + clus.train(xb, index) + + +if 'kmeans_hnsw' in todo: + print("Performing kmeans on sift1M using HNSW assignment") + clus = faiss.Clustering(d, 16384) + clus.verbose = True + clus.niter = 10 + index = 
faiss.IndexHNSWFlat(d, 32) + # increase the default efSearch, otherwise the number of empty + # clusters is too high. + index.hnsw.efSearch = 128 + clus.train(xb, index) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_index_pq.py b/core/src/index/thirdparty/faiss/benchs/bench_index_pq.py new file mode 100644 index 0000000000..4fd5ccfeb0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_index_pq.py @@ -0,0 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import print_function +import faiss +from datasets import load_sift1M, evaluate + +xb, xq, xt, gt = load_sift1M() +nq, d = xq.shape + +k = 32 + +for nbits in 4, 6, 8, 10, 12: + index = faiss.IndexPQ(d, 8, nbits) + index.train(xt) + index.add(xb) + + t, r = evaluate(index, xq, gt, k) + print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1])) + del index diff --git a/core/src/index/thirdparty/faiss/benchs/bench_pairwise_distances.py b/core/src/index/thirdparty/faiss/benchs/bench_pairwise_distances.py new file mode 100644 index 0000000000..bde8cc908e --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_pairwise_distances.py @@ -0,0 +1,36 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python3 + +"""small test script to benchmark the SIMD implementation of the +distance computations for the additional metrics. Call eg. with L1 to +get L1 distance computations. +""" + +import faiss + +import sys +import time + +d = 64 +nq = 4096 +nb = 16384 + +print("sample") + +xq = faiss.randn((nq, d), 123) +xb = faiss.randn((nb, d), 123) + +mt_name = "L2" if len(sys.argv) < 2 else sys.argv[1] + +mt = getattr(faiss, "METRIC_" + mt_name) + +print("distances") +t0 = time.time() +dis = faiss.pairwise_distances(xq, xb, mt) +t1 = time.time() + +print("nq=%d nb=%d d=%d %s: %.3f s" % (nq, nb, d, mt_name, t1 - t0)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py new file mode 100644 index 0000000000..0445c4a8be --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py @@ -0,0 +1,252 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python2 + +from __future__ import print_function +import os +import sys +import time +import numpy as np +import re +import faiss +from multiprocessing.dummy import Pool as ThreadPool +from datasets import ivecs_read + + +# we mem-map the biggest files to avoid having them in memory all at +# once + + +def mmap_fvecs(fname): + x = np.memmap(fname, dtype='int32', mode='r') + d = x[0] + return x.view('float32').reshape(-1, d + 1)[:, 1:] + + +def mmap_bvecs(fname): + x = np.memmap(fname, dtype='uint8', mode='r') + d = x[:4].view('int32')[0] + return x.reshape(-1, d + 4)[:, 4:] + + +################################################################# +# Bookkeeping +################################################################# + + +dbname = sys.argv[1] +index_key = sys.argv[2] +parametersets = sys.argv[3:] + + +tmpdir = '/tmp/bench_polysemous' + +if not os.path.isdir(tmpdir): + print("%s does not exist, creating it" % tmpdir) + os.mkdir(tmpdir) + + +################################################################# +# Prepare dataset +################################################################# + + +print("Preparing dataset", dbname) + +if dbname.startswith('SIFT'): + # SIFT1M to SIFT1000M + dbsize = int(dbname[4:-1]) + xb = mmap_bvecs('bigann/bigann_base.bvecs') + xq = mmap_bvecs('bigann/bigann_query.bvecs') + xt = mmap_bvecs('bigann/bigann_learn.bvecs') + + # trim xb to correct size + xb = xb[:dbsize * 1000 * 1000] + + gt = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize) + +elif dbname == 'Deep1B': + xb = mmap_fvecs('deep1b/base.fvecs') + xq = mmap_fvecs('deep1b/deep1B_queries.fvecs') + xt = mmap_fvecs('deep1b/learn.fvecs') + # deep1B's train is is outrageously big + xt = xt[:10 * 1000 * 1000] + gt = ivecs_read('deep1b/deep1B_groundtruth.ivecs') + +else: + print('unknown dataset', dbname, file=sys.stderr) + sys.exit(1) + + +print("sizes: B %s Q %s T %s gt %s" % ( + xb.shape, xq.shape, xt.shape, gt.shape)) + +nq, d = xq.shape +nb, d = xb.shape +assert gt.shape[0] == nq + + +################################################################# +# Training +################################################################# + + +def choose_train_size(index_key): + + # some training vectors for PQ and the PCA + n_train = 256 * 1000 + + if "IVF" in index_key: + matches = re.findall('IVF([0-9]+)', index_key) + ncentroids = int(matches[0]) + n_train = max(n_train, 100 * ncentroids) + elif "IMI" in index_key: + matches = re.findall('IMI2x([0-9]+)', index_key) + nbit = int(matches[0]) + n_train = max(n_train, 256 * (1 << nbit)) + return n_train + + +def get_trained_index(): + filename = "%s/%s_%s_trained.index" % ( + tmpdir, dbname, index_key) + + if not os.path.exists(filename): + index = faiss.index_factory(d, index_key) + + n_train = choose_train_size(index_key) + + xtsub = xt[:n_train] + print("Keeping %d train vectors" % xtsub.shape[0]) + # make sure the data is actually in RAM and in float + xtsub = xtsub.astype('float32').copy() + index.verbose = True + + t0 = time.time() + index.train(xtsub) + index.verbose = False + print("train done in %.3f s" % (time.time() - t0)) + print("storing", filename) + faiss.write_index(index, filename) + else: + print("loading", filename) + index = faiss.read_index(filename) + return index + + +################################################################# +# Adding vectors to dataset +################################################################# + +def rate_limited_imap(f, l): + 'a thread pre-processes the next element' + pool = 
ThreadPool(1) + res = None + for i in l: + res_next = pool.apply_async(f, (i, )) + if res: + yield res.get() + res = res_next + yield res.get() + + +def matrix_slice_iterator(x, bs): + " iterate over the lines of x in blocks of size bs" + nb = x.shape[0] + block_ranges = [(i0, min(nb, i0 + bs)) + for i0 in range(0, nb, bs)] + + return rate_limited_imap( + lambda (i0, i1): x[i0:i1].astype('float32').copy(), + block_ranges) + + +def get_populated_index(): + + filename = "%s/%s_%s_populated.index" % ( + tmpdir, dbname, index_key) + + if not os.path.exists(filename): + index = get_trained_index() + i0 = 0 + t0 = time.time() + for xs in matrix_slice_iterator(xb, 100000): + i1 = i0 + xs.shape[0] + print('\radd %d:%d, %.3f s' % (i0, i1, time.time() - t0), end=' ') + sys.stdout.flush() + index.add(xs) + i0 = i1 + print() + print("Add done in %.3f s" % (time.time() - t0)) + print("storing", filename) + faiss.write_index(index, filename) + else: + print("loading", filename) + index = faiss.read_index(filename) + return index + + +################################################################# +# Perform searches +################################################################# + +index = get_populated_index() + +ps = faiss.ParameterSpace() +ps.initialize(index) + +# make sure queries are in RAM +xq = xq.astype('float32').copy() + +# a static C++ object that collects statistics about searches +ivfpq_stats = faiss.cvar.indexIVFPQ_stats + + +if parametersets == ['autotune'] or parametersets == ['autotuneMT']: + + if parametersets == ['autotune']: + faiss.omp_set_num_threads(1) + + # setup the Criterion object: optimize for 1-R@1 + crit = faiss.OneRecallAtRCriterion(nq, 1) + # by default, the criterion will request only 1 NN + crit.nnn = 100 + crit.set_groundtruth(None, gt.astype('int64')) + + # then we let Faiss find the optimal parameters by itself + print("exploring operating points") + + t0 = time.time() + op = ps.explore(index, xq, crit) + print("Done in %.3f s, available OPs:" % (time.time() - t0)) + + # opv is a C++ vector, so it cannot be accessed like a Python array + opv = op.optimal_pts + print("%-40s 1-R@1 time" % "Parameters") + for i in range(opv.size()): + opt = opv.at(i) + print("%-40s %.4f %7.3f" % (opt.key, opt.perf, opt.t)) + +else: + + # we do queries in a single thread + faiss.omp_set_num_threads(1) + + print(' ' * len(parametersets[0]), '\t', 'R@1 R@10 R@100 time %pass') + + for param in parametersets: + print(param, '\t', end=' ') + sys.stdout.flush() + ps.set_index_parameters(index, param) + t0 = time.time() + ivfpq_stats.reset() + D, I = index.search(xq, 100) + t1 = time.time() + for rank in 1, 10, 100: + n_ok = (I[:, :rank] == gt[:, :1]).sum() + print("%.4f" % (n_ok / float(nq)), end=' ') + print("%8.3f " % ((t1 - t0) * 1000.0 / nq), end=' ') + print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivfpq_stats.ncode)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py new file mode 100644 index 0000000000..7dbb79ec0c --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python2 + +from __future__ import print_function +import time +import numpy as np + +import faiss +from datasets import load_sift1M, evaluate + + +print("load data") +xb, xq, xt, gt = load_sift1M() +nq, d = xq.shape + +# index with 16 subquantizers, 8 bit each +index = faiss.IndexPQ(d, 16, 8) +index.do_polysemous_training = True +index.verbose = True + +print("train") + +index.train(xt) + +print("add vectors to index") + +index.add(xb) + +nt = 1 +faiss.omp_set_num_threads(1) + + +print("PQ baseline", end=' ') +index.search_type = faiss.IndexPQ.ST_PQ +evaluate() + +for ht in 64, 62, 58, 54, 50, 46, 42, 38, 34, 30: + print("Polysemous", ht, end=' ') + index.search_type = faiss.IndexPQ.ST_polysemous + index.polysemous_ht = ht + t, r = evaluate(index, xq, gt, 1) + print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1])) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_scalar_quantizer.py b/core/src/index/thirdparty/faiss/benchs/bench_scalar_quantizer.py new file mode 100644 index 0000000000..a990b485f1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_scalar_quantizer.py @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python2 + +from __future__ import print_function +import time +import numpy as np +import faiss +from datasets import load_sift1M + + +print("load data") + +xb, xq, xt, gt = load_sift1M() +nq, d = xq.shape + +ncent = 256 + +variants = [(name, getattr(faiss.ScalarQuantizer, name)) + for name in dir(faiss.ScalarQuantizer) + if name.startswith('QT_')] + +quantizer = faiss.IndexFlatL2(d) +# quantizer.add(np.zeros((1, d), dtype='float32')) + +if False: + for name, qtype in [('flat', 0)] + variants: + + print("============== test", name) + t0 = time.time() + + if name == 'flat': + index = faiss.IndexIVFFlat(quantizer, d, ncent, + faiss.METRIC_L2) + else: + index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, + qtype, faiss.METRIC_L2) + + index.nprobe = 16 + print("[%.3f s] train" % (time.time() - t0)) + index.train(xt) + print("[%.3f s] add" % (time.time() - t0)) + index.add(xb) + print("[%.3f s] search" % (time.time() - t0)) + D, I = index.search(xq, 100) + print("[%.3f s] eval" % (time.time() - t0)) + + for rank in 1, 10, 100: + n_ok = (I[:, :rank] == gt[:, :1]).sum() + print("%.4f" % (n_ok / float(nq)), end=' ') + print() + +if True: + for name, qtype in variants: + + print("============== test", name) + + for rsname, vals in [('RS_minmax', + [-0.4, -0.2, -0.1, -0.05, 0.0, 0.1, 0.5]), + ('RS_meanstd', [0.8, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0]), + ('RS_quantiles', [0.02, 0.05, 0.1, 0.15]), + ('RS_optim', [0.0])]: + for val in vals: + print("%-15s %5g " % (rsname, val), end=' ') + index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, + qtype, faiss.METRIC_L2) + index.nprobe = 16 + index.sq.rangestat = getattr(faiss.ScalarQuantizer, + rsname) + + index.rangestat_arg = val + + index.train(xt) + index.add(xb) + t0 = time.time() + D, I = index.search(xq, 100) + t1 = time.time() + + for rank in 1, 10, 100: + n_ok = (I[:, :rank] == gt[:, :1]).sum() + print("%.4f" % (n_ok / float(nq)), end=' ') + print(" %.3f s" % (t1 - t0)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py b/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py new file mode 100644 index 0000000000..aed1083d46 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py 
@@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +from __future__ import print_function +import numpy as np +import faiss +import time + +swig_ptr = faiss.swig_ptr + +if False: + a = np.arange(10, 14).astype('float32') + b = np.arange(20, 24).astype('float32') + + faiss.fvec_inner_product (swig_ptr(a), swig_ptr(b), 4) + + 1/0 + +xd = 100 +yd = 1000000 + +np.random.seed(1234) + +faiss.omp_set_num_threads(1) + +print('xd=%d yd=%d' % (xd, yd)) + +print('Running inner products test..') +for d in 3, 4, 12, 36, 64: + + x = faiss.rand(xd * d).reshape(xd, d) + y = faiss.rand(yd * d).reshape(yd, d) + + distances = np.empty((xd, yd), dtype='float32') + + t0 = time.time() + for i in xrange(xd): + faiss.fvec_inner_products_ny(swig_ptr(distances[i]), + swig_ptr(x[i]), + swig_ptr(y), + d, yd) + t1 = time.time() + + # sparse verification + ntry = 100 + num, denom = 0, 0 + for t in range(ntry): + xi = np.random.randint(xd) + yi = np.random.randint(yd) + num += abs(distances[xi, yi] - np.dot(x[xi], y[yi])) + denom += abs(distances[xi, yi]) + + print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom)) + + +print('Running L2sqr test..') +for d in 3, 4, 12, 36, 64: + + x = faiss.rand(xd * d).reshape(xd, d) + y = faiss.rand(yd * d).reshape(yd, d) + + distances = np.empty((xd, yd), dtype='float32') + + t0 = time.time() + for i in xrange(xd): + faiss.fvec_L2sqr_ny(swig_ptr(distances[i]), + swig_ptr(x[i]), + swig_ptr(y), + d, yd) + t1 = time.time() + + # sparse verification + ntry = 100 + num, denom = 0, 0 + for t in range(ntry): + xi = np.random.randint(xd) + yi = np.random.randint(yd) + num += abs(distances[xi, yi] - np.sum((x[xi] - y[yi]) ** 2)) + denom += abs(distances[xi, yi]) + + print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom)) diff --git a/core/src/index/thirdparty/faiss/benchs/datasets.py b/core/src/index/thirdparty/faiss/benchs/datasets.py new file mode 100644 index 0000000000..3971f278f9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/datasets.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+from __future__ import print_function
+import sys
+import time
+import numpy as np
+
+
+def ivecs_read(fname):
+    a = np.fromfile(fname, dtype='int32')
+    d = a[0]
+    return a.reshape(-1, d + 1)[:, 1:].copy()
+
+
+def fvecs_read(fname):
+    return ivecs_read(fname).view('float32')
+
+
+def load_sift1M():
+    print("Loading sift1M...", end='', file=sys.stderr)
+    xt = fvecs_read("sift1M/sift_learn.fvecs")
+    xb = fvecs_read("sift1M/sift_base.fvecs")
+    xq = fvecs_read("sift1M/sift_query.fvecs")
+    gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
+    print("done", file=sys.stderr)
+
+    return xb, xq, xt, gt
+
+
+def evaluate(index, xq, gt, k):
+    nq = xq.shape[0]
+    t0 = time.time()
+    D, I = index.search(xq, k)  # noqa: E741
+    t1 = time.time()
+
+    recalls = {}
+    i = 1
+    while i <= k:
+        recalls[i] = (I[:, :i] == gt[:, :1]).sum() / float(nq)
+        i *= 10
+
+    return (t1 - t0) * 1000.0 / nq, recalls
diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md
new file mode 100644
index 0000000000..643a99a1dd
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md
@@ -0,0 +1,197 @@
+# Distributed on-disk index for 1T-scale datasets
+
+This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors).
+All the code is in Python 3 (and not compatible with Python 2).
+The current code uses the Deep1B dataset for demonstration purposes, but can scale to 1000x larger.
+To run it, download the Deep1B dataset as explained [here](../#getting-deep1b), and edit paths to the dataset in the scripts.
+
+The cluster commands are written for the Slurm batch scheduling system.
+Hopefully, changing to another type of scheduler should be quite straightforward.
+
+## Distributed k-means
+
+To cluster 500M vectors to 10M centroids, it is useful to have a distributed k-means implementation.
+The distribution simply consists of splitting the training vectors across machines (servers) and having them do the assignment.
+The master/client then synthesizes the results and updates the centroids.
+
+The distributed k-means implementation here is based on 3 files:
+
+- [`rpc.py`](rpc.py) is a very simple remote procedure call implementation based on sockets and pickle.
+It exposes the methods of an object on the server side so that they can be called from the client as if the object was local.
+
+- [`distributed_kmeans.py`](distributed_kmeans.py) contains the k-means implementation.
+The main loop of k-means is re-implemented in Python but closely follows the Faiss C++ implementation, and should not be significantly less efficient.
+It relies on a `DatasetAssign` object that does the assignment to centroids, which is the bulk of the computation.
+The object can be a Faiss CPU index, a GPU index or a set of remote GPU or CPU indexes.
+
+- [`run_on_cluster.bash`](run_on_cluster.bash) contains the shell code to run the distributed k-means on a cluster.
+
+The distributed k-means works with a Python install that contains faiss and scipy (for sparse matrices).
+It clusters the training data of Deep1B; this can be changed easily to any file in fvecs, bvecs or npy format that contains the training set.
+The training vectors may be too large to fit in RAM, but they are memory-mapped so that should not be a problem.
+The file is also assumed to be accessible from all server machines with e.g. a distributed file system.
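To make the division of labor concrete, here is a rough sketch (not part of the patch) of the assignment step each shard performs in one k-means iteration. It mirrors the `DatasetAssign.assign_to` logic that appears later in this patch in `distributed_kmeans.py`; the helper name `assign_shard` is illustrative only.

```python
import numpy as np
import faiss
from scipy.sparse import csc_matrix

def assign_shard(x, centroids):
    """One shard's work per iteration: assign the local vectors to the
    current centroids and return what the client needs to update them."""
    index = faiss.IndexFlatL2(x.shape[1])
    index.add(centroids)
    D, I = index.search(x, 1)          # nearest centroid per local vector
    I, D = I.ravel(), D.ravel()
    n, nc = x.shape[0], centroids.shape[0]
    # sparse (nc, n) selector; multiplying it by x sums the vectors per centroid
    m = csc_matrix((np.ones(n, dtype='float32'), I, np.arange(n + 1)),
                   shape=(nc, n))
    return I, D, m * x                 # assignments, distances, per-centroid sums
```

The client then gathers the per-centroid sums and assignment counts from all shards, divides to obtain the new centroids, and sends them back for the next iteration.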
+
+### Local tests
+
+Edit `distributed_kmeans.py` to point `testdata` to your local copy of the dataset.
+
+Then, 4 levels of sanity check can be run:
+```bash
+# reference Faiss C++ run
+python distributed_kmeans.py --test 0
+# using the Python implementation
+python distributed_kmeans.py --test 1
+# use the dispatch object (on local datasets)
+python distributed_kmeans.py --test 2
+# same, with GPUs
+python distributed_kmeans.py --test 3
+```
+The output should look like [this gist](https://gist.github.com/mdouze/ffa01fe666a9325761266fe55ead72ad).
+
+### Distributed sanity check
+
+To run the distributed k-means, `distributed_kmeans.py` has to be run both on the server side (`--server` option) and the client side (`--client` option).
+Edit the top of `run_on_cluster.bash` to set the path of the data to cluster.
+
+Sanity checks can be run with
+```bash
+# non-distributed baseline
+bash run_on_cluster.bash test_kmeans_0
+# using all the machine's GPUs
+bash run_on_cluster.bash test_kmeans_1
+# distributed run, with one local server per GPU
+bash run_on_cluster.bash test_kmeans_2
+```
+The test `test_kmeans_2` simulates a distributed run on a single machine by starting one server process per GPU and connecting to the servers via the rpc protocol.
+The output should look like [this gist](https://gist.github.com/mdouze/5b2dc69b74579ecff04e1686a277d32e).
+
+
+
+### Distributed run
+
+The way the script can be distributed depends on the cluster's scheduling system.
+Here we use Slurm, but it should be relatively easy to adapt to any scheduler that can allocate a set of machines and start the same executable on all of them.
+
+The command
+```
+bash run_on_cluster.bash slurm_distributed_kmeans
+```
+asks Slurm for 5 machines with 4 GPUs each with the `srun` command.
+All 5 machines run the script with the `slurm_within_kmeans_server` option.
+They determine the number of servers and their own server id via the `SLURM_NPROCS` and `SLURM_PROCID` environment variables.
+
+All machines start `distributed_kmeans.py` in server mode for the slice of the dataset they are responsible for.
+
+In addition, machine #0 also starts the client.
+The client knows who the other servers are via the variable `SLURM_JOB_NODELIST`.
+It connects to all servers and performs the clustering.
+
+The output should look like [this gist](https://gist.github.com/mdouze/8d25e89fb4af5093057cae0f917da6cd).
+
+### Run used for deep1B
+
+For the real run, we run the clustering on 50M vectors to 1M centroids.
+This is just a matter of using as many machines / GPUs as possible and setting the output centroids with the `--out filename` option.
+Then run
+```
+bash run_on_cluster.bash deep1b_clustering
+```
+
+The last lines of output read like:
+```
+ Iteration 19 (898.92 s, search 875.71 s): objective=1.33601e+07 imbalance=1.303 nsplit=0
+ 0: writing centroids to /checkpoint/matthijs/ondisk_distributed/1M_centroids.npy
+```
+
+This means that the total training time was 899 s, of which 876 s were used for computation.
+However, the computation includes the I/O overhead to the assignment servers.
+In this implementation, the overhead of transmitting the data is non-negligible and so is the centroid computation stage.
+This is due to the inefficient Python implementation and the RPC protocol that is not optimized for broadcast / gather (like MPI).
+However, it is a simple implementation that should run on most clusters.
+
+## Making the trained index
+
+After the centroids are obtained, an empty trained index must be constructed.
+This is done by:
+
+- applying a pre-processing stage (a random rotation) to balance the dimensions of the vectors. This can be done after clustering; the clusters are just rotated as well.
+
+- wrapping the centroids into an HNSW index to speed up the CPU-based assignment of vectors
+
+- training the 6-bit scalar quantizer used to encode the vectors
+
+This is performed by the script [`make_trained_index.py`](make_trained_index.py).
+
+## Building the index by slices
+
+We call the slices "vslices" as they are vertical slices of the big matrix, see the explanation in the wiki section [Split across database partitions](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors#split-across-database-partitions).
+
+The script [make_index_vslice.py](make_index_vslice.py) makes an index for a subset of the vectors of the input data and stores it as an independent index.
+There are 200 slices of 5M vectors each for Deep1B.
+It can be run in a brute-force parallel fashion; there is no constraint on ordering.
+To run the script in parallel on a Slurm cluster, use:
+```
+bash run_on_cluster.bash make_index_vslices
+```
+For a real dataset, the data would be read from a DBMS.
+In that case, reading the data and indexing it in parallel is worthwhile because reading is very slow.
+
+## Splitting across inverted lists
+
+The 200 slices need to be merged together.
+This is done with the script [merge_to_ondisk.py](merge_to_ondisk.py), which memory-maps the 200 vertical slice indexes, extracts a subset of the inverted lists and writes them to a contiguous horizontal slice.
+We slice the inverted lists into 50 horizontal slices.
+This is run with
+```
+bash run_on_cluster.bash make_index_hslices
+```
+
+## Querying the index
+
+At this point the index is ready.
+The horizontal slices need to be loaded in the right order and combined into an index to be usable.
+This is done in the [combined_index.py](combined_index.py) script.
+It provides a `CombinedIndexDeep1B` object that contains an index object that can be searched.
+To test, run:
+```
+python combined_index.py
+```
+The output should look like:
+```
+(faiss_1.5.2) matthijs@devfair0144:~/faiss_versions/faiss_1Tcode/faiss/benchs/distributed_ondisk$ python combined_index.py
+reading /checkpoint/matthijs/ondisk_distributed//hslices/slice49.faissindex
+loading empty index /checkpoint/matthijs/ondisk_distributed/trained.faissindex
+replace invlists
+loaded index of size 1000000000
+nprobe=1 1-recall@1=0.2904 t=12.35s
+nnprobe=10 1-recall@1=0.6499 t=17.67s
+nprobe=100 1-recall@1=0.8673 t=29.23s
+nprobe=1000 1-recall@1=0.9132 t=129.58s
+```
+i.e. searching is a lot slower than from RAM.
+
+## Distributed query
+
+To reduce the bandwidth required from the machine that does the queries, it is possible to split the search across several search servers.
+This way, only the effective results are returned to the main machine.
+
+The search client and server are implemented in [`search_server.py`](search_server.py).
+It can be used as a script to start a search server for `CombinedIndexDeep1B` or as a module to load the clients.
+
+The search servers can be started with
+```
+bash run_on_cluster.bash run_search_servers
+```
+(adjust to the number of servers that can be used).
+
+An example of a search client is [`distributed_query_demo.py`](distributed_query_demo.py).
+It connects to the servers and assigns each of them a subset of inverted lists to visit.
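As a rough illustration of what the querying machine does once the per-server answers come back, the hypothetical helper below merges several partial top-k results into a global top-k (distances are L2, so smaller is better). This is only a sketch of the idea, not the actual protocol implemented in `search_server.py` and `distributed_query_demo.py`.

```python
import numpy as np

def merge_topk(partials, k):
    """partials: list of (D, I) pairs, one per search server, each of
    shape (nq, k). Returns the combined (D, I) arrays of shape (nq, k)."""
    D = np.hstack([d for d, _ in partials])   # (nq, n_servers * k)
    I = np.hstack([i for _, i in partials])
    order = np.argsort(D, axis=1)[:, :k]      # keep the k smallest distances
    rows = np.arange(D.shape[0])[:, None]
    return D[rows, order], I[rows, order]
```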
+ +A typical output is [this gist](https://gist.github.com/mdouze/1585b9854a9a2437d71f2b2c3c05c7c5). +The number in MiB indicates the amount of data that is read from disk to perform the search. +In this case, the scale of the dataset is too small for the distributed search to have much impact, but on datasets > 10x larger, the difference becomes more significant. + +## Conclusion + +This code contains the core components to make an index that scales up to 1T vectors. +There are a few simplifications wrt. the index that was effectively used in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py new file mode 100644 index 0000000000..c2583bc450 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py @@ -0,0 +1,192 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import faiss +import numpy as np + + +class CombinedIndex: + """ + combines a set of inverted lists into a hstack + masks part of those lists + adds these inverted lists to an empty index that contains + the info on how to perform searches + """ + + def __init__(self, invlist_fnames, empty_index_fname, + masked_index_fname=None): + + self.indexes = indexes = [] + ilv = faiss.InvertedListsPtrVector() + + for fname in invlist_fnames: + if os.path.exists(fname): + print('reading', fname, end='\r', flush=True) + index = faiss.read_index(fname) + indexes.append(index) + il = faiss.extract_index_ivf(index).invlists + else: + assert False + ilv.push_back(il) + print() + + self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data()) + if masked_index_fname: + self.big_il_base = self.big_il + print('loading', masked_index_fname) + self.masked_index = faiss.read_index( + masked_index_fname, + faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY) + self.big_il = faiss.MaskedInvertedLists( + faiss.extract_index_ivf(self.masked_index).invlists, + self.big_il_base) + + print('loading empty index', empty_index_fname) + self.index = faiss.read_index(empty_index_fname) + ntotal = self.big_il.compute_ntotal() + + print('replace invlists') + index_ivf = faiss.extract_index_ivf(self.index) + index_ivf.replace_invlists(self.big_il, False) + index_ivf.ntotal = self.index.ntotal = ntotal + index_ivf.parallel_mode = 1 # seems reasonable to do this all the time + + quantizer = faiss.downcast_index(index_ivf.quantizer) + quantizer.hnsw.efSearch = 1024 + + ############################################################ + # Expose fields and functions of the index as methods so that they + # can be called by RPC + + def search(self, x, k): + return self.index.search(x, k) + + def range_search(self, x, radius): + return self.index.range_search(x, radius) + + def transform_and_assign(self, xq): + index = self.index + + if isinstance(index, faiss.IndexPreTransform): + assert index.chain.size() == 1 + vt = index.chain.at(0) + xq = vt.apply_py(xq) + + # perform quantization + index_ivf = faiss.extract_index_ivf(index) + quantizer = index_ivf.quantizer + coarse_dis, list_nos = quantizer.search(xq, index_ivf.nprobe) + return xq, list_nos, coarse_dis + + + def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k): + index_ivf = faiss.extract_index_ivf(self.index) + n, d = xq.shape + 
assert d == index_ivf.d + n2, d2 = list_nos.shape + assert list_nos.shape == coarse_dis.shape + assert n2 == n + assert d2 == index_ivf.nprobe + D = np.empty((n, k), dtype='float32') + I = np.empty((n, k), dtype='int64') + index_ivf.search_preassigned( + n, faiss.swig_ptr(xq), k, + faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis), + faiss.swig_ptr(D), faiss.swig_ptr(I), False) + return D, I + + + def ivf_range_search_preassigned(self, xq, list_nos, coarse_dis, radius): + index_ivf = faiss.extract_index_ivf(self.index) + n, d = xq.shape + assert d == index_ivf.d + n2, d2 = list_nos.shape + assert list_nos.shape == coarse_dis.shape + assert n2 == n + assert d2 == index_ivf.nprobe + res = faiss.RangeSearchResult(n) + + index_ivf.range_search_preassigned( + n, faiss.swig_ptr(xq), radius, + faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis), + res) + + lims = faiss.rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = faiss.rev_swig_ptr(res.distances, nd).copy() + I = faiss.rev_swig_ptr(res.labels, nd).copy() + return lims, D, I + + def set_nprobe(self, nprobe): + index_ivf = faiss.extract_index_ivf(self.index) + index_ivf.nprobe = nprobe + + def set_parallel_mode(self, pm): + index_ivf = faiss.extract_index_ivf(self.index) + index_ivf.parallel_mode = pm + + def get_ntotal(self): + return self.index.ntotal + + def set_prefetch_nthread(self, nt): + for idx in self.indexes: + il = faiss.downcast_InvertedLists( + faiss.extract_index_ivf(idx).invlists) + il.prefetch_nthread + il.prefetch_nthread = nt + + def set_omp_num_threads(self, nt): + faiss.omp_set_num_threads(nt) + +class CombinedIndexDeep1B(CombinedIndex): + """ loads a CombinedIndex with the data from the big photodna index """ + + def __init__(self): + # set some paths + workdir = "/checkpoint/matthijs/ondisk_distributed/" + + # empty index with the proper quantizer + indexfname = workdir + 'trained.faissindex' + + # index that has some invlists that override the big one + masked_index_fname = None + invlist_fnames = [ + '%s/hslices/slice%d.faissindex' % (workdir, i) + for i in range(50) + ] + CombinedIndex.__init__(self, invlist_fnames, indexfname, masked_index_fname) + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +if __name__ == '__main__': + import time + ci = CombinedIndexDeep1B() + print('loaded index of size ', ci.index.ntotal) + + deep1bdir = "/datasets01_101/simsearch/041218/deep1b/" + + xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs") + gt_fname = deep1bdir + "deep1B_groundtruth.ivecs" + gt = ivecs_read(gt_fname) + + for nprobe in 1, 10, 100, 1000: + ci.set_nprobe(nprobe) + t0 = time.time() + D, I = ci.search(xq, 100) + t1 = time.time() + print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % ( + nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq), + t1 - t0 + )) diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py new file mode 100644 index 0000000000..423f88127c --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py @@ -0,0 +1,409 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +#! 
/usr/bin/env python3 + +""" +Simple distributed kmeans implementation Relies on an abstraction +for the training matrix, that can be sharded over several machines. +""" + +import faiss +import time +import numpy as np +import sys +import pdb +import argparse + +from scipy.sparse import csc_matrix + +from multiprocessing.dummy import Pool as ThreadPool + +import rpc + + + + +class DatasetAssign: + """Wrapper for a matrix that offers a function to assign the vectors + to centroids. All other implementations offer the same interface""" + + def __init__(self, x): + self.x = np.ascontiguousarray(x, dtype='float32') + + def count(self): + return self.x.shape[0] + + def dim(self): + return self.x.shape[1] + + def get_subset(self, indices): + return self.x[indices] + + def perform_search(self, centroids): + index = faiss.IndexFlatL2(self.x.shape[1]) + index.add(centroids) + return index.search(self.x, 1) + + def assign_to(self, centroids, weights=None): + D, I = self.perform_search(centroids) + + I = I.ravel() + D = D.ravel() + n = len(self.x) + if weights is None: + weights = np.ones(n, dtype='float32') + nc = len(centroids) + m = csc_matrix((weights, I, np.arange(n + 1)), + shape=(nc, n)) + sum_per_centroid = m * self.x + + return I, D, sum_per_centroid + + +class DatasetAssignGPU(DatasetAssign): + """ GPU version of the previous """ + + def __init__(self, x, gpu_id, verbose=False): + DatasetAssign.__init__(self, x) + index = faiss.IndexFlatL2(x.shape[1]) + if gpu_id >= 0: + self.index = faiss.index_cpu_to_gpu( + faiss.StandardGpuResources(), + gpu_id, index) + else: + # -1 -> assign to all GPUs + self.index = faiss.index_cpu_to_all_gpus(index) + + + def perform_search(self, centroids): + self.index.reset() + self.index.add(centroids) + return self.index.search(self.x, 1) + + +class DatasetAssignDispatch: + """dispatches to several other DatasetAssigns and combines the + results""" + + def __init__(self, xes, in_parallel): + self.xes = xes + self.d = xes[0].dim() + if not in_parallel: + self.imap = map + else: + self.pool = ThreadPool(len(self.xes)) + self.imap = self.pool.imap + self.sizes = list(map(lambda x: x.count(), self.xes)) + self.cs = np.cumsum([0] + self.sizes) + + def count(self): + return self.cs[-1] + + def dim(self): + return self.d + + def get_subset(self, indices): + res = np.zeros((len(indices), self.d), dtype='float32') + nos = np.searchsorted(self.cs[1:], indices, side='right') + + def handle(i): + mask = nos == i + sub_indices = indices[mask] - self.cs[i] + subset = self.xes[i].get_subset(sub_indices) + res[mask] = subset + + list(self.imap(handle, range(len(self.xes)))) + return res + + def assign_to(self, centroids, weights=None): + src = self.imap( + lambda x: x.assign_to(centroids, weights), + self.xes + ) + I = [] + D = [] + sum_per_centroid = None + for Ii, Di, sum_per_centroid_i in src: + I.append(Ii) + D.append(Di) + if sum_per_centroid is None: + sum_per_centroid = sum_per_centroid_i + else: + sum_per_centroid += sum_per_centroid_i + return np.hstack(I), np.hstack(D), sum_per_centroid + + +def imbalance_factor(k , assign): + return faiss.imbalance_factor(len(assign), k, faiss.swig_ptr(assign)) + + +def reassign_centroids(hassign, centroids, rs=None): + """ reassign centroids when some of them collapse """ + if rs is None: + rs = np.random + k, d = centroids.shape + nsplit = 0 + empty_cents = np.where(hassign == 0)[0] + + if empty_cents.size == 0: + return 0 + + fac = np.ones(d) + fac[::2] += 1 / 1024. + fac[1::2] -= 1 / 1024. 
+ + # this is a single pass unless there are more than k/2 + # empty centroids + while empty_cents.size > 0: + # choose which centroids to split + probas = hassign.astype('float') - 1 + probas[probas < 0] = 0 + probas /= probas.sum() + nnz = (probas > 0).sum() + + nreplace = min(nnz, empty_cents.size) + cjs = rs.choice(k, size=nreplace, p=probas) + + for ci, cj in zip(empty_cents[:nreplace], cjs): + + c = centroids[cj] + centroids[ci] = c * fac + centroids[cj] = c / fac + + hassign[ci] = hassign[cj] // 2 + hassign[cj] -= hassign[ci] + nsplit += 1 + + empty_cents = empty_cents[nreplace:] + + return nsplit + + +def kmeans(k, data, niter=25, seed=1234, checkpoint=None): + """Pure python kmeans implementation. Follows the Faiss C++ version + quite closely, but takes a DatasetAssign instead of a training data + matrix. Also redo is not implemented. """ + n, d = data.count(), data.dim() + + print(("Clustering %d points in %dD to %d clusters, " + + "%d iterations seed %d") % (n, d, k, niter, seed)) + + rs = np.random.RandomState(seed) + print("preproc...") + t0 = time.time() + # initialization + perm = rs.choice(n, size=k, replace=False) + centroids = data.get_subset(perm) + + print(" done") + t_search_tot = 0 + obj = [] + for i in range(niter): + t0s = time.time() + + print('assigning', end='\r', flush=True) + assign, D, sums = data.assign_to(centroids) + + print('compute centroids', end='\r', flush=True) + + # pdb.set_trace() + + t_search_tot += time.time() - t0s; + + err = D.sum() + obj.append(err) + + hassign = np.bincount(assign, minlength=k) + + fac = hassign.reshape(-1, 1).astype('float32') + fac[fac == 0] = 1 # quiet warning + + centroids = sums / fac + + nsplit = reassign_centroids(hassign, centroids, rs) + + print((" Iteration %d (%.2f s, search %.2f s): " + "objective=%g imbalance=%.3f nsplit=%d") % ( + i, (time.time() - t0), t_search_tot, + err, imbalance_factor (k, assign), + nsplit) + ) + + if checkpoint is not None: + print('storing centroids in', checkpoint) + np.save(checkpoint, centroids) + + return centroids + + +class AssignServer(rpc.Server): + """ Assign version that can be exposed via RPC """ + + def __init__(self, s, assign, log_prefix=''): + rpc.Server.__init__(self, s, log_prefix=log_prefix) + self.assign = assign + + def __getattr__(self, f): + return getattr(self.assign, f) + + + +def bvecs_mmap(fname): + x = np.memmap(fname, dtype='uint8', mode='r') + d = x[:4].view('int32')[0] + return x.reshape(-1, d + 4)[:, 4:] + + +def ivecs_mmap(fname): + a = np.memmap(fname, dtype='int32', mode='r') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:] + +def fvecs_mmap(fname): + return ivecs_mmap(fname).view('float32') + + +def do_test(todo): + testdata = '/datasets01_101/simsearch/041218/bigann/bigann_learn.bvecs' + + x = bvecs_mmap(testdata) + + # bad distribution to stress-test split code + xx = x[:100000].copy() + xx[:50000] = x[0] + + todo = sys.argv[1:] + + if "0" in todo: + # reference C++ run + km = faiss.Kmeans(x.shape[1], 1000, niter=20, verbose=True) + km.train(xx.astype('float32')) + + if "1" in todo: + # using the Faiss c++ implementation + data = DatasetAssign(xx) + kmeans(1000, data, 20) + + if "2" in todo: + # use the dispatch object (on local datasets) + data = DatasetAssignDispatch([ + DatasetAssign(xx[20000 * i : 20000 * (i + 1)]) + for i in range(5) + ], False + ) + kmeans(1000, data, 20) + + if "3" in todo: + # same, with GPU + ngpu = faiss.get_num_gpus() + print('using %d GPUs' % ngpu) + data = DatasetAssignDispatch([ + DatasetAssignGPU(xx[100000 * i // ngpu: 
100000 * (i + 1) // ngpu], i) + for i in range(ngpu) + ], True + ) + kmeans(1000, data, 20) + + +def main(): + parser = argparse.ArgumentParser() + + def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + group = parser.add_argument_group('general options') + aa('--test', default='', help='perform tests (comma-separated numbers)') + + aa('--k', default=0, type=int, help='nb centroids') + aa('--seed', default=1234, type=int, help='random seed') + aa('--niter', default=20, type=int, help='nb iterations') + aa('--gpu', default=-2, type=int, help='GPU to use (-2:none, -1: all)') + + group = parser.add_argument_group('I/O options') + aa('--indata', default='', + help='data file to load (supported formats fvecs, bvecs, npy') + aa('--i0', default=0, type=int, help='first vector to keep') + aa('--i1', default=-1, type=int, help='last vec to keep + 1') + aa('--out', default='', help='file to store centroids') + aa('--store_each_iteration', default=False, action='store_true', + help='store centroid checkpoints') + + group = parser.add_argument_group('server options') + aa('--server', action='store_true', default=False, help='run server') + aa('--port', default=12345, type=int, help='server port') + aa('--when_ready', default=None, help='store host:port to this file when ready') + aa('--ipv4', default=False, action='store_true', help='force ipv4') + + group = parser.add_argument_group('client options') + aa('--client', action='store_true', default=False, help='run client') + aa('--servers', default='', help='list of server:port separated by spaces') + + args = parser.parse_args() + + if args.test: + do_test(args.test.split(',')) + return + + # prepare data matrix (either local or remote) + if args.indata: + print('loading ', args.indata) + if args.indata.endswith('.bvecs'): + x = bvecs_mmap(args.indata) + elif args.indata.endswith('.fvecs'): + x = fvecs_mmap(args.indata) + elif args.indata.endswith('.npy'): + x = np.load(args.indata, mmap_mode='r') + else: + assert False + + if args.i1 == -1: + args.i1 = len(x) + x = x[args.i0:args.i1] + if args.gpu == -2: + data = DatasetAssign(x) + else: + print('moving to GPU') + data = DatasetAssignGPU(x, args.gpu) + + elif args.client: + print('connecting to servers') + + def connect_client(hostport): + host, port = hostport.split(':') + port = int(port) + print('connecting %s:%d' % (host, port)) + client = rpc.Client(host, port, v6=not args.ipv4) + print('client %s:%d ready' % (host, port)) + return client + + hostports = args.servers.strip().split(' ') + # pool = ThreadPool(len(hostports)) + + data = DatasetAssignDispatch( + list(map(connect_client, hostports)), + True + ) + else: + assert False + + if args.server: + print('starting server') + log_prefix = f"{rpc.socket.gethostname()}:{args.port}" + rpc.run_server( + lambda s: AssignServer(s, data, log_prefix=log_prefix), + args.port, report_to_file=args.when_ready, + v6=not args.ipv4) + + else: + print('running kmeans') + centroids = kmeans(args.k, data, niter=args.niter, seed=args.seed, + checkpoint=args.out if args.store_each_iteration else None) + if args.out != '': + print('writing centroids to', args.out) + np.save(args.out, centroids) + + +if __name__ == '__main__': + main() diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py new file mode 100644 index 0000000000..401f056056 --- /dev/null +++ 
b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import faiss +import numpy as np +import time +import rpc +import sys + +import combined_index +import search_server + +hostnames = sys.argv[1:] + +print("Load local index") +ci = combined_index.CombinedIndexDeep1B() + +print("connect to clients") +clients = [] +for host in hostnames: + client = rpc.Client(host, 12012, v6=False) + clients.append(client) + +# check if all servers respond +print("sizes seen by servers:", [cl.get_ntotal() for cl in clients]) + + +# aggregate all clients into a one that uses them all for speed +# note that it also requires a local index ci +sindex = search_server.SplitPerListIndex(ci, clients) +sindex.verbose = True + +# set reasonable parameters +ci.set_parallel_mode(1) +ci.set_prefetch_nthread(0) +ci.set_omp_num_threads(64) + +# initialize params +sindex.set_parallel_mode(1) +sindex.set_prefetch_nthread(0) +sindex.set_omp_num_threads(64) + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +deep1bdir = "/datasets01_101/simsearch/041218/deep1b/" + +xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs") +gt_fname = deep1bdir + "deep1B_groundtruth.ivecs" +gt = ivecs_read(gt_fname) + + +for nprobe in 1, 10, 100, 1000: + sindex.set_nprobe(nprobe) + t0 = time.time() + D, I = sindex.search(xq, 100) + t1 = time.time() + print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % ( + nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq), + t1 - t0 + )) diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py new file mode 100644 index 0000000000..3364919403 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
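+# make_index_vslice.py fills the empty trained index with one contiguous
+# slice of the database vectors, so that several slices can be built in
+# parallel and merged afterwards (see merge_to_ondisk.py).
+#
+# Minimal usage sketch (hypothetical paths; the defaults below point to the
+# author's storage):
+#
+#   python make_index_vslice.py \
+#       --inputindex /path/to/trained.faissindex \
+#       --input /path/to/base.fvecs \
+#       --i0 0 --i1 10000000 \
+#       -o /path/to/vslices/slice0.faissindex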
+import os +import time +import numpy as np +import faiss +import argparse +from multiprocessing.dummy import Pool as ThreadPool + +def ivecs_mmap(fname): + a = np.memmap(fname, dtype='int32', mode='r') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:] + +def fvecs_mmap(fname): + return ivecs_mmap(fname).view('float32') + + +def produce_batches(args): + + x = fvecs_mmap(args.input) + + if args.i1 == -1: + args.i1 = len(x) + + print("Iterating on vectors %d:%d from %s by batches of size %d" % ( + args.i0, args.i1, args.input, args.bs)) + + for j0 in range(args.i0, args.i1, args.bs): + j1 = min(j0 + args.bs, args.i1) + yield np.arange(j0, j1), x[j0:j1] + + +def rate_limited_iter(l): + 'a thread pre-processes the next element' + pool = ThreadPool(1) + res = None + + def next_or_None(): + try: + return next(l) + except StopIteration: + return None + + while True: + res_next = pool.apply_async(next_or_None) + if res is not None: + res = res.get() + if res is None: + return + yield res + res = res_next + +deep1bdir = "/datasets01_101/simsearch/041218/deep1b/" +workdir = "/checkpoint/matthijs/ondisk_distributed/" + +def main(): + parser = argparse.ArgumentParser( + description='make index for a subset of the data') + + def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + group = parser.add_argument_group('index type') + aa('--inputindex', + default=workdir + 'trained.faissindex', + help='empty input index to fill in') + aa('--nt', default=-1, type=int, help='nb of openmp threads to use') + + group = parser.add_argument_group('db options') + aa('--input', default=deep1bdir + "base.fvecs") + aa('--bs', default=2**18, type=int, + help='batch size for db access') + aa('--i0', default=0, type=int, help='lower bound to index') + aa('--i1', default=-1, type=int, help='upper bound of vectors to index') + + group = parser.add_argument_group('output') + aa('-o', default='/tmp/x', help='output index') + aa('--keepquantizer', default=False, action='store_true', + help='by default we remove the data from the quantizer to save space') + + args = parser.parse_args() + print('args=', args) + + print('start accessing data') + src = produce_batches(args) + + print('loading index', args.inputindex) + index = faiss.read_index(args.inputindex) + + if args.nt != -1: + faiss.omp_set_num_threads(args.nt) + + t0 = time.time() + ntot = 0 + for ids, x in rate_limited_iter(src): + print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0)) + index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids) + ntot += ids.size + + index_ivf = faiss.extract_index_ivf(index) + print('invlists stats: imbalance %.3f' % index_ivf.invlists.imbalance_factor()) + index_ivf.invlists.print_stats() + + if not args.keepquantizer: + print('resetting quantizer content') + index_ivf = faiss.extract_index_ivf(index) + index_ivf.quantizer.reset() + + print('store output', args.o) + faiss.write_index(index, args.o) + +if __name__ == '__main__': + main() diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_trained_index.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_trained_index.py new file mode 100644 index 0000000000..50e4668f1b --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_trained_index.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
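+# make_trained_index.py builds the "empty" trained index used by the other
+# scripts: it loads precomputed k-means centroids (1M_centroids.npy, e.g.
+# produced with distributed_kmeans.py), wraps them in an HNSW quantizer
+# behind a random rotation, trains the scalar quantizer, and writes
+# trained.faissindex, which make_index_vslice.py then fills with data.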
+ +import numpy as np +import faiss + +deep1bdir = "/datasets01_101/simsearch/041218/deep1b/" +workdir = "/checkpoint/matthijs/ondisk_distributed/" + + +print('Load centroids') +centroids = np.load(workdir + '1M_centroids.npy') +ncent, d = centroids.shape + + +print('apply random rotation') +rrot = faiss.RandomRotationMatrix(d, d) +rrot.init(1234) +centroids = rrot.apply_py(centroids) + +print('make HNSW index as quantizer') +quantizer = faiss.IndexHNSWFlat(d, 32) +quantizer.hnsw.efSearch = 1024 +quantizer.hnsw.efConstruction = 200 +quantizer.add(centroids) + +print('build index') +index = faiss.IndexPreTransform( + rrot, + faiss.IndexIVFScalarQuantizer( + quantizer, d, ncent, faiss.ScalarQuantizer.QT_6bit + ) + ) + +def ivecs_mmap(fname): + a = np.memmap(fname, dtype='int32', mode='r') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:] + +def fvecs_mmap(fname): + return ivecs_mmap(fname).view('float32') + + +print('finish training index') +xt = fvecs_mmap(deep1bdir + 'learn.fvecs') +xt = np.ascontiguousarray(xt[:256 * 1000], dtype='float32') +index.train(xt) + +print('write output') +faiss.write_index(index, workdir + 'trained.faissindex') diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py new file mode 100644 index 0000000000..735c92b2a2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py @@ -0,0 +1,96 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import faiss +import argparse +from multiprocessing.dummy import Pool as ThreadPool + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('--inputs', nargs='*', required=True, + help='input indexes to merge') + parser.add_argument('--l0', type=int, default=0) + parser.add_argument('--l1', type=int, default=-1) + + parser.add_argument('--nt', default=-1, + help='nb threads') + + parser.add_argument('--output', required=True, + help='output index filename') + parser.add_argument('--outputIL', + help='output invfile filename') + + args = parser.parse_args() + + if args.nt != -1: + print('set nb of threads to', args.nt) + + + ils = faiss.InvertedListsPtrVector() + ils_dont_dealloc = [] + + pool = ThreadPool(20) + + def load_index(fname): + print("loading", fname) + try: + index = faiss.read_index(fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY) + except RuntimeError as e: + print('could not load %s: %s' % (fname, e)) + return fname, None + + print(" %d entries" % index.ntotal) + return fname, index + + index0 = None + + for fname, index in pool.imap(load_index, args.inputs): + if index is None: + continue + index_ivf = faiss.extract_index_ivf(index) + il = faiss.downcast_InvertedLists(index_ivf.invlists) + index_ivf.invlists = None + il.this.own() + ils_dont_dealloc.append(il) + if (args.l0, args.l1) != (0, -1): + print('restricting to lists %d:%d' % (args.l0, args.l1)) + # il = faiss.SliceInvertedLists(il, args.l0, args.l1) + + il.crop_invlists(args.l0, args.l1) + ils_dont_dealloc.append(il) + ils.push_back(il) + + if index0 is None: + index0 = index + + print("loaded %d invlists" % ils.size()) + + if not args.outputIL: + args.outputIL = args.output + '_invlists' + + il0 = ils.at(0) + + il = faiss.OnDiskInvertedLists( + il0.nlist, il0.code_size, + args.outputIL) + + print("perform merge") + + ntotal 
= il.merge_from(ils.data(), ils.size(), True) + + print("swap into index0") + + index0_ivf = faiss.extract_index_ivf(index0) + index0_ivf.nlist = il0.nlist + index0_ivf.ntotal = index0.ntotal = ntotal + index0_ivf.invlists = il + index0_ivf.own_invlists = False + + print("write", args.output) + + faiss.write_index(index0, args.output) diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py new file mode 100644 index 0000000000..401d0d5bcc --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py @@ -0,0 +1,249 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Simplistic RPC implementation. +Exposes all functions of a Server object. + +Uses pickle for serialization and the socket interface. +""" + +import os,pdb,pickle,time,errno,sys,_thread,traceback,socket,threading,gc + + +# default +PORT=12032 + + +######################################################################### +# simple I/O functions + + + +def inline_send_handle(f, conn): + st = os.fstat(f.fileno()) + size = st.st_size + pickle.dump(size, conn) + conn.write(f.read(size)) + +def inline_send_string(s, conn): + size = len(s) + pickle.dump(size, conn) + conn.write(s) + + +class FileSock: + " wraps a socket so that it is usable by pickle/cPickle " + + def __init__(self,sock): + self.sock = sock + self.nr=0 + + def write(self, buf): + # print("sending %d bytes"%len(buf)) + #self.sock.sendall(buf) + # print("...done") + bs = 512 * 1024 + ns = 0 + while ns < len(buf): + sent = self.sock.send(buf[ns:ns + bs]) + ns += sent + + + def read(self,bs=512*1024): + #if self.nr==10000: pdb.set_trace() + self.nr+=1 + # print("read bs=%d"%bs) + b = [] + nb = 0 + while len(b) $workdir/vslices/slice$i.bash < $workdir/hslices/slice$i.bash <0 nodes have 32 links (theses ones are "cheap" to store + because there are fewer nodes in the upper levels. + +- `--indexfile $bdir/deep1M_PQ36_M6.index`: name of the index file + (without information for the L&C extension) + +- `--beta_nsq 4`: number of bytes to allocate for the codes (M in the + paper) + +- `--beta_centroids $bdir/deep1M_PQ36_M6_nsq4.npy`: filename to store + the trained beta centroids + +- `--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq4_codes.npy`: filename + for the encoded weights (beta) of the combination + +- `--k_reorder 0,5`: number of restults to reorder. 0 = baseline + without reordering, 5 = value used throughout the paper + +- `--efSearch 1,1024`: number of nodes to visit (T in the paper) + +The script will proceed with the following steps: + +0. load dataset (and possibly compute the ground-truth if the +ground-truth file is not provided) + +1. train the OPQ encoder + +2. build the index and store it + +3. compute the residuals and train the beta vocabulary to do the reconstuction + +4. encode the vertices + +5. search and evaluate the search results. + +With option `--exhaustive` the results of the exhaustive column can be +obtained. + +The run above should output: +``` +... +setting k_reorder=5 +... +efSearch=1024 0.3132 ms per query, R@1: 0.4283 R@10: 0.6337 R@100: 0.6520 ndis 40941919 nreorder 50000 + +``` +which matches the paper's table 2. + +Note that in multi-threaded mode, the building of the HNSW strcuture +is not deterministic. Therefore, the results across runs may not be exactly the same. 
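+
+For reference, the R@1/R@10/R@100 figures in the output above are plain
+recall values. A minimal sketch of how they can be recomputed from the
+search results, assuming `I` holds the returned ids and `gt` the
+ground-truth ids (the same conventions as `datasets.py`):
+
+```python
+import numpy as np
+
+def recall_at(I, gt, rank):
+    # fraction of queries whose true nearest neighbor is in the top-`rank` results
+    return float((I[:, :rank] == gt[:, :1]).sum()) / I.shape[0]
+
+# recall_at(I, gt, 1), recall_at(I, gt, 10), recall_at(I, gt, 100)
+```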
+ +Reproducing Figure 5 in the paper +--------------------------------- + +Figure 5 just evaluates the combination of HNSW and PQ. For example, +the operating point L6&OPQ40 can be obtained with + +``` +python bench_link_and_code.py \ + --db deep1M \ + --M0 6 \ + --indexkey OPQ40_160,HNSW32_PQ40 \ + --indexfile $bdir/deep1M_PQ40_M6.index \ + --beta_nsq 1 --beta_k 1 \ + --beta_centroids $bdir/deep1M_PQ40_M6_nsq0.npy \ + --neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq0_codes.npy \ + --k_reorder 0 --efSearch 16,64,256,1024 +``` + +The arguments are similar to the previous table. Note that nsq = 0 is +simulated by setting beta_nsq = 1 and beta_k = 1 (ie a code with a single +reproduction value). + +The output should look like: + +``` +setting k_reorder=0 +efSearch=16 0.0147 ms per query, R@1: 0.3409 R@10: 0.4388 R@100: 0.4394 ndis 2629735 nreorder 0 +efSearch=64 0.0122 ms per query, R@1: 0.4836 R@10: 0.6490 R@100: 0.6509 ndis 4623221 nreorder 0 +efSearch=256 0.0344 ms per query, R@1: 0.5730 R@10: 0.7915 R@100: 0.7951 ndis 11090176 nreorder 0 +efSearch=1024 0.2656 ms per query, R@1: 0.6212 R@10: 0.8722 R@100: 0.8765 ndis 33501951 nreorder 0 +``` + +The results with k_reorder=5 are not reported in the paper, they +represent the performance of a "free coding" version of the algorithm. diff --git a/core/src/index/thirdparty/faiss/benchs/link_and_code/bench_link_and_code.py b/core/src/index/thirdparty/faiss/benchs/link_and_code/bench_link_and_code.py new file mode 100644 index 0000000000..0b055169e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/link_and_code/bench_link_and_code.py @@ -0,0 +1,304 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
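+# Note: this benchmark is written for Python 2 (print statements throughout)
+# and expects datasets.py and neighbor_codec.py from this directory to be
+# importable.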
+ +#!/usr/bin/env python2 + +import os +import sys +import time +import numpy as np +import re +import faiss +from multiprocessing.dummy import Pool as ThreadPool +import pdb +import argparse +import datasets +from datasets import sanitize +import neighbor_codec + +###################################################### +# Command-line parsing +###################################################### + + +parser = argparse.ArgumentParser() + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + +group = parser.add_argument_group('dataset options') + +aa('--db', default='deep1M', help='dataset') +aa( '--compute_gt', default=False, action='store_true', + help='compute and store the groundtruth') + +group = parser.add_argument_group('index consturction') + +aa('--indexkey', default='HNSW32', help='index_factory type') +aa('--efConstruction', default=200, type=int, + help='HNSW construction factor') +aa('--M0', default=-1, type=int, help='size of base level') +aa('--maxtrain', default=256 * 256, type=int, + help='maximum number of training points') +aa('--indexfile', default='', help='file to read or write index from') +aa('--add_bs', default=-1, type=int, + help='add elements index by batches of this size') +aa('--link_singletons', default=False, action='store_true', + help='do a pass to link in the singletons') + +group = parser.add_argument_group( + 'searching (reconstruct_from_neighbors options)') + +aa('--beta_centroids', default='', + help='file with codebook') +aa('--neigh_recons_codes', default='', + help='file with codes for reconstruction') +aa('--beta_ntrain', default=250000, type=int, help='') +aa('--beta_k', default=256, type=int, help='beta codebook size') +aa('--beta_nsq', default=1, type=int, help='number of beta sub-vectors') +aa('--beta_niter', default=10, type=int, help='') +aa('--k_reorder', default='-1', help='') + +group = parser.add_argument_group('searching') + +aa('--k', default=100, type=int, help='nb of nearest neighbors') +aa('--exhaustive', default=False, action='store_true', + help='report the exhaustive search topline') +aa('--searchthreads', default=-1, type=int, + help='nb of threads to use at search time') +aa('--efSearch', default='', type=str, + help='comma-separated values of efSearch to try') + +args = parser.parse_args() + +print "args:", args + + +###################################################### +# Load dataset +###################################################### + +xt, xb, xq, gt = datasets.load_data( + dataset=args.db, compute_gt=args.compute_gt) + +nq, d = xq.shape +nb, d = xb.shape + + +###################################################### +# Make index +###################################################### + +if os.path.exists(args.indexfile): + + print "reading", args.indexfile + index = faiss.read_index(args.indexfile) + + if isinstance(index, faiss.IndexPreTransform): + index_hnsw = faiss.downcast_index(index.index) + vec_transform = index.chain.at(0).apply_py + else: + index_hnsw = index + vec_transform = lambda x:x + + hnsw = index_hnsw.hnsw + hnsw_stats = faiss.cvar.hnsw_stats + +else: + + print "build index, key=", args.indexkey + + index = faiss.index_factory(d, args.indexkey) + + if isinstance(index, faiss.IndexPreTransform): + index_hnsw = faiss.downcast_index(index.index) + vec_transform = index.chain.at(0).apply_py + else: + index_hnsw = index + vec_transform = lambda x:x + + hnsw = index_hnsw.hnsw + hnsw.efConstruction = args.efConstruction + hnsw_stats = faiss.cvar.hnsw_stats + index.verbose = True + 
index_hnsw.verbose = True + index_hnsw.storage.verbose = True + + if args.M0 != -1: + print "set level 0 nb of neighbors to", args.M0 + hnsw.set_nb_neighbors(0, args.M0) + + xt2 = sanitize(xt[:args.maxtrain]) + assert np.all(np.isfinite(xt2)) + + print "train, size", xt.shape + t0 = time.time() + index.train(xt2) + print " train in %.3f s" % (time.time() - t0) + + print "adding" + t0 = time.time() + if args.add_bs == -1: + index.add(sanitize(xb)) + else: + for i0 in range(0, nb, args.add_bs): + i1 = min(nb, i0 + args.add_bs) + print " adding %d:%d / %d" % (i0, i1, nb) + index.add(sanitize(xb[i0:i1])) + + print " add in %.3f s" % (time.time() - t0) + print "storing", args.indexfile + faiss.write_index(index, args.indexfile) + + +###################################################### +# Train beta centroids and encode dataset +###################################################### + +if args.beta_centroids: + print "reordering links" + index_hnsw.reorder_links() + + if os.path.exists(args.beta_centroids): + print "load", args.beta_centroids + beta_centroids = np.load(args.beta_centroids) + nsq, k, M1 = beta_centroids.shape + assert M1 == hnsw.nb_neighbors(0) + 1 + + rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq) + else: + print "train beta centroids" + rfn = faiss.ReconstructFromNeighbors( + index_hnsw, args.beta_k, args.beta_nsq) + + xb_full = vec_transform(sanitize(xb[:args.beta_ntrain])) + + beta_centroids = neighbor_codec.train_beta_codebook( + rfn, xb_full, niter=args.beta_niter) + + print " storing", args.beta_centroids + np.save(args.beta_centroids, beta_centroids) + + + faiss.copy_array_to_vector(beta_centroids.ravel(), + rfn.codebook) + index_hnsw.reconstruct_from_neighbors = rfn + + if rfn.k == 1: + pass # no codes to take care of + elif os.path.exists(args.neigh_recons_codes): + print "loading neigh codes", args.neigh_recons_codes + codes = np.load(args.neigh_recons_codes) + assert codes.size == rfn.code_size * index.ntotal + faiss.copy_array_to_vector(codes.astype('uint8'), + rfn.codes) + rfn.ntotal = index.ntotal + else: + print "encoding neigh codes" + t0 = time.time() + + bs = 1000000 if args.add_bs == -1 else args.add_bs + + for i0 in range(0, nb, bs): + i1 = min(i0 + bs, nb) + print " encode %d:%d / %d [%.3f s]\r" % ( + i0, i1, nb, time.time() - t0), + sys.stdout.flush() + xbatch = vec_transform(sanitize(xb[i0:i1])) + rfn.add_codes(i1 - i0, faiss.swig_ptr(xbatch)) + print + + print "storing %s" % args.neigh_recons_codes + codes = faiss.vector_to_array(rfn.codes) + np.save(args.neigh_recons_codes, codes) + +###################################################### +# Exhaustive evaluation +###################################################### + +if args.exhaustive: + print "exhaustive evaluation" + xq_tr = vec_transform(sanitize(xq)) + index2 = faiss.IndexFlatL2(index_hnsw.d) + accu_recons_error = 0.0 + + if faiss.get_num_gpus() > 0: + print "do eval on GPU" + co = faiss.GpuMultipleClonerOptions() + co.shard = False + index2 = faiss.index_cpu_to_all_gpus(index2, co) + + # process in batches in case the dataset does not fit in RAM + rh = datasets.ResultHeap(xq_tr.shape[0], 100) + t0 = time.time() + bs = 500000 + for i0 in range(0, nb, bs): + i1 = min(nb, i0 + bs) + print ' handling batch %d:%d' % (i0, i1) + + xb_recons = np.empty( + (i1 - i0, index_hnsw.d), dtype='float32') + rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons)) + + accu_recons_error += ( + (vec_transform(sanitize(xb[i0:i1])) - + xb_recons)**2).sum() + + index2.reset() + index2.add(xb_recons) + D, 
I = index2.search(xq_tr, 100) + rh.add_batch_result(D, I, i0) + + rh.finalize() + del index2 + t1 = time.time() + print "done in %.3f s" % (t1 - t0) + print "total reconstruction error: ", accu_recons_error + print "eval retrieval:" + datasets.evaluate_DI(rh.D, rh.I, gt) + + +def get_neighbors(hnsw, i, level): + " list the neighbors for node i at level " + assert i < hnsw.levels.size() + assert level < hnsw.levels.at(i) + be = np.empty(2, 'uint64') + hnsw.neighbor_range(i, level, faiss.swig_ptr(be), faiss.swig_ptr(be[1:])) + return [hnsw.neighbors.at(j) for j in range(be[0], be[1])] + + +############################################################# +# Index is ready +############################################################# + +xq = sanitize(xq) + +if args.searchthreads != -1: + print "Setting nb of threads to", args.searchthreads + faiss.omp_set_num_threads(args.searchthreads) + + +if gt is None: + print "no valid groundtruth -- exit" + sys.exit() + + +k_reorders = [int(x) for x in args.k_reorder.split(',')] +efSearchs = [int(x) for x in args.efSearch.split(',')] + + +for k_reorder in k_reorders: + + if index_hnsw.reconstruct_from_neighbors: + print "setting k_reorder=%d" % k_reorder + index_hnsw.reconstruct_from_neighbors.k_reorder = k_reorder + + for efSearch in efSearchs: + print "efSearch=%-4d" % efSearch, + hnsw.efSearch = efSearch + hnsw_stats.reset() + datasets.evaluate(xq, gt, index, k=args.k, endl=False) + + print "ndis %d nreorder %d" % (hnsw_stats.ndis, hnsw_stats.nreorder) diff --git a/core/src/index/thirdparty/faiss/benchs/link_and_code/datasets.py b/core/src/index/thirdparty/faiss/benchs/link_and_code/datasets.py new file mode 100644 index 0000000000..ce1379f408 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/link_and_code/datasets.py @@ -0,0 +1,235 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +""" +Common functions to load datasets and compute their ground-truth +""" + +import time +import numpy as np +import faiss +import pdb +import sys + +# set this to the directory that contains the datafiles. 
+# deep1b data should be at simdir + 'deep1b' +# bigann data should be at simdir + 'bigann' +simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/' + +################################################################# +# Small I/O functions +################################################################# + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +def ivecs_mmap(fname): + a = np.memmap(fname, dtype='int32', mode='r') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:] + + +def fvecs_mmap(fname): + return ivecs_mmap(fname).view('float32') + + +def bvecs_mmap(fname): + x = np.memmap(fname, dtype='uint8', mode='r') + d = x[:4].view('int32')[0] + return x.reshape(-1, d + 4)[:, 4:] + + +def ivecs_write(fname, m): + n, d = m.shape + m1 = np.empty((n, d + 1), dtype='int32') + m1[:, 0] = d + m1[:, 1:] = m + m1.tofile(fname) + + +def fvecs_write(fname, m): + m = m.astype('float32') + ivecs_write(fname, m.view('int32')) + + +################################################################# +# Dataset +################################################################# + +def sanitize(x): + return np.ascontiguousarray(x, dtype='float32') + + +class ResultHeap: + """ Combine query results from a sliced dataset """ + + def __init__(self, nq, k): + " nq: number of query vectors, k: number of results per query " + self.I = np.zeros((nq, k), dtype='int64') + self.D = np.zeros((nq, k), dtype='float32') + self.nq, self.k = nq, k + heaps = faiss.float_maxheap_array_t() + heaps.k = k + heaps.nh = nq + heaps.val = faiss.swig_ptr(self.D) + heaps.ids = faiss.swig_ptr(self.I) + heaps.heapify() + self.heaps = heaps + + def add_batch_result(self, D, I, i0): + assert D.shape == (self.nq, self.k) + assert I.shape == (self.nq, self.k) + I += i0 + self.heaps.addn_with_ids( + self.k, faiss.swig_ptr(D), + faiss.swig_ptr(I), self.k) + + def finalize(self): + self.heaps.reorder() + + + +def compute_GT_sliced(xb, xq, k): + print "compute GT" + t0 = time.time() + nb, d = xb.shape + nq, d = xq.shape + rh = ResultHeap(nq, k) + bs = 10 ** 5 + + xqs = sanitize(xq) + + db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + + # compute ground-truth by blocks of bs, and add to heaps + for i0 in range(0, nb, bs): + i1 = min(nb, i0 + bs) + xsl = sanitize(xb[i0:i1]) + db_gt.add(xsl) + D, I = db_gt.search(xqs, k) + rh.add_batch_result(D, I, i0) + db_gt.reset() + print "\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), + sys.stdout.flush() + print + rh.finalize() + gt_I = rh.I + + print "GT time: %.3f s" % (time.time() - t0) + return gt_I + + +def do_compute_gt(xb, xq, k): + print "computing GT" + nb, d = xb.shape + index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + if nb < 100 * 1000: + print " add" + index.add(np.ascontiguousarray(xb, dtype='float32')) + print " search" + D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k) + else: + I = compute_GT_sliced(xb, xq, k) + + return I.astype('int32') + + +def load_data(dataset='deep1M', compute_gt=False): + + print "load data", dataset + + if dataset == 'sift1M': + basedir = simdir + 'sift1M/' + + xt = fvecs_read(basedir + "sift_learn.fvecs") + xb = fvecs_read(basedir + "sift_base.fvecs") + xq = fvecs_read(basedir + "sift_query.fvecs") + gt = ivecs_read(basedir + "sift_groundtruth.ivecs") + + elif dataset.startswith('bigann'): + basedir = simdir + 'bigann/' + + dbsize = 1000 if dataset == "bigann1B" else 
int(dataset[6:-1]) + xb = bvecs_mmap(basedir + 'bigann_base.bvecs') + xq = bvecs_mmap(basedir + 'bigann_query.bvecs') + xt = bvecs_mmap(basedir + 'bigann_learn.bvecs') + # trim xb to correct size + xb = xb[:dbsize * 1000 * 1000] + gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize) + + elif dataset.startswith("deep"): + basedir = simdir + 'deep1b/' + szsuf = dataset[4:] + if szsuf[-1] == 'M': + dbsize = 10 ** 6 * int(szsuf[:-1]) + elif szsuf == '1B': + dbsize = 10 ** 9 + elif szsuf[-1] == 'k': + dbsize = 1000 * int(szsuf[:-1]) + else: + assert False, "did not recognize suffix " + szsuf + + xt = fvecs_mmap(basedir + "learn.fvecs") + xb = fvecs_mmap(basedir + "base.fvecs") + xq = fvecs_read(basedir + "deep1B_queries.fvecs") + + xb = xb[:dbsize] + + gt_fname = basedir + "%s_groundtruth.ivecs" % dataset + if compute_gt: + gt = do_compute_gt(xb, xq, 100) + print "store", gt_fname + ivecs_write(gt_fname, gt) + + gt = ivecs_read(gt_fname) + + else: + assert False + + print "dataset %s sizes: B %s Q %s T %s" % ( + dataset, xb.shape, xq.shape, xt.shape) + + return xt, xb, xq, gt + +################################################################# +# Evaluation +################################################################# + + +def evaluate_DI(D, I, gt): + nq = gt.shape[0] + k = I.shape[1] + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print "R@%d: %.4f" % (rank, recall), + rank *= 10 + + +def evaluate(xq, gt, index, k=100, endl=True): + t0 = time.time() + D, I = index.search(xq, k) + t1 = time.time() + nq = xq.shape[0] + print "\t %8.4f ms per query, " % ( + (t1 - t0) * 1000.0 / nq), + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print "R@%d: %.4f" % (rank, recall), + rank *= 10 + if endl: + print + return D, I diff --git a/core/src/index/thirdparty/faiss/benchs/link_and_code/neighbor_codec.py b/core/src/index/thirdparty/faiss/benchs/link_and_code/neighbor_codec.py new file mode 100644 index 0000000000..3869a2c109 --- /dev/null +++ b/core/src/index/thirdparty/faiss/benchs/link_and_code/neighbor_codec.py @@ -0,0 +1,239 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +""" +This is the training code for the link and code. Especially the +neighbors_kmeans function implements the EM-algorithm to find the +appropriate weightings and cluster them. 
+""" + +import time +import numpy as np +import faiss + +#---------------------------------------------------------- +# Utils +#---------------------------------------------------------- + +def sanitize(x): + return np.ascontiguousarray(x, dtype='float32') + + +def train_kmeans(x, k, ngpu, max_points_per_centroid=256): + "Runs kmeans on one or several GPUs" + d = x.shape[1] + clus = faiss.Clustering(d, k) + clus.verbose = True + clus.niter = 20 + clus.max_points_per_centroid = max_points_per_centroid + + if ngpu == 0: + index = faiss.IndexFlatL2(d) + else: + res = [faiss.StandardGpuResources() for i in range(ngpu)] + + flat_config = [] + for i in range(ngpu): + cfg = faiss.GpuIndexFlatConfig() + cfg.useFloat16 = False + cfg.device = i + flat_config.append(cfg) + + if ngpu == 1: + index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0]) + else: + indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) + for i in range(ngpu)] + index = faiss.IndexReplicas() + for sub_index in indexes: + index.addIndex(sub_index) + + # perform the training + clus.train(x, index) + centroids = faiss.vector_float_to_array(clus.centroids) + + obj = faiss.vector_float_to_array(clus.obj) + print "final objective: %.4g" % obj[-1] + + return centroids.reshape(k, d) + + +#---------------------------------------------------------- +# Learning the codebook from neighbors +#---------------------------------------------------------- + + +# works with both a full Inn table and dynamically generated neighbors + +def get_Inn_shape(Inn): + if type(Inn) != tuple: + return Inn.shape + return Inn[:2] + +def get_neighbor_table(x_coded, Inn, i): + if type(Inn) != tuple: + return x_coded[Inn[i,:],:] + rfn = x_coded + M, d = rfn.M, rfn.index.d + out = np.zeros((M + 1, d), dtype='float32') + rfn.get_neighbor_table(i, faiss.swig_ptr(out)) + _, _, sq = Inn + return out[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] + + +# Function that produces the best regression values from the vector +# and its neighbors +def regress_from_neighbors (x, x_coded, Inn): + (N, knn) = get_Inn_shape(Inn) + betas = np.zeros((N,knn)) + t0 = time.time() + for i in xrange (N): + xi = x[i,:] + NNi = get_neighbor_table(x_coded, Inn, i) + betas[i,:] = np.linalg.lstsq(NNi.transpose(), xi, rcond=0.01)[0] + if i % (N / 10) == 0: + print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) + return betas + + + +# find the best beta minimizing ||x-x_coded[Inn,:]*beta||^2 +def regress_opt_beta (x, x_coded, Inn): + (N, knn) = get_Inn_shape(Inn) + d = x.shape[1] + + # construct the linear system to be solved + X = np.zeros ((d*N)) + Y = np.zeros ((d*N, knn)) + for i in xrange (N): + X[i*d:(i+1)*d] = x[i,:] + neighbor_table = get_neighbor_table(x_coded, Inn, i) + Y[i*d:(i+1)*d, :] = neighbor_table.transpose() + beta_opt = np.linalg.lstsq(Y, X, rcond=0.01)[0] + return beta_opt + + +# Find the best encoding by minimizing the reconstruction error using +# a set of pre-computed beta values +def assign_beta (beta_centroids, x, x_coded, Inn, verbose=True): + if type(Inn) == tuple: + return assign_beta_2(beta_centroids, x, x_coded, Inn) + (N, knn) = Inn.shape + x_ibeta = np.zeros ((N), dtype='int32') + t0= time.time() + for i in xrange (N): + NNi = x_coded[Inn[i,:]] + # Consider all possible betas for the encoding and compute the + # encoding error + x_reg_all = np.dot (beta_centroids, NNi) + err = ((x_reg_all - x[i,:]) ** 2).sum(axis=1) + x_ibeta[i] = err.argmin() + if verbose: + if i % (N / 10) == 0: + print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) + return 
x_ibeta + + +# Reconstruct a set of vectors using the beta_centroids, the +# assignment, the encoded neighbors identified by the list Inn (which +# includes the vector itself) +def recons_from_neighbors (beta_centroids, x_ibeta, x_coded, Inn): + (N, knn) = Inn.shape + x_rec = np.zeros(x_coded.shape) + t0= time.time() + for i in xrange (N): + NNi = x_coded[Inn[i,:]] + x_rec[i, :] = np.dot (beta_centroids[x_ibeta[i]], NNi) + if i % (N / 10) == 0: + print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) + return x_rec + + +# Compute a EM-like algorithm trying at optimizing the beta such as they +# minimize the reconstruction error from the neighbors +def neighbors_kmeans (x, x_coded, Inn, K, ngpus=1, niter=5): + # First compute centroids using a regular k-means algorithm + betas = regress_from_neighbors (x, x_coded, Inn) + beta_centroids = train_kmeans( + sanitize(betas), K, ngpus, max_points_per_centroid=1000000) + _, knn = get_Inn_shape(Inn) + d = x.shape[1] + + rs = np.random.RandomState() + for iter in range(niter): + print 'iter', iter + idx = assign_beta (beta_centroids, x, x_coded, Inn, verbose=False) + + hist = np.bincount(idx) + for cl0 in np.where(hist == 0)[0]: + print " cluster %d empty, split" % cl0, + cl1 = idx[np.random.randint(idx.size)] + pos = np.nonzero (idx == cl1)[0] + pos = rs.choice(pos, pos.size / 2) + print " cl %d -> %d + %d" % (cl1, len(pos), hist[cl1] - len(pos)) + idx[pos] = cl0 + hist = np.bincount(idx) + + tot_err = 0 + for k in range (K): + pos = np.nonzero (idx == k)[0] + npos = pos.shape[0] + + X = np.zeros (d*npos) + Y = np.zeros ((d*npos, knn)) + + for i in range(npos): + X[i*d:(i+1)*d] = x[pos[i],:] + neighbor_table = get_neighbor_table(x_coded, Inn, pos[i]) + Y[i*d:(i+1)*d, :] = neighbor_table.transpose() + sol, residuals, _, _ = np.linalg.lstsq(Y, X, rcond=0.01) + if residuals.size > 0: + tot_err += residuals.sum() + beta_centroids[k, :] = sol + print ' err=%g' % tot_err + return beta_centroids + + +# assign the betas in C++ +def assign_beta_2(beta_centroids, x, rfn, Inn): + _, _, sq = Inn + if rfn.k == 1: + return np.zeros(x.shape[0], dtype=int) + # add dummy dimensions to beta_centroids and x + all_beta_centroids = np.zeros( + (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32') + all_beta_centroids[sq] = beta_centroids + all_x = np.zeros((len(x), rfn.d), dtype='float32') + all_x[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] = x + rfn.codes.clear() + rfn.ntotal = 0 + faiss.copy_array_to_vector( + all_beta_centroids.ravel(), rfn.codebook) + rfn.add_codes(len(x), faiss.swig_ptr(all_x)) + codes = faiss.vector_to_array(rfn.codes) + codes = codes.reshape(-1, rfn.nsq) + return codes[:, sq] + + +####################################################### +# For usage from bench_storages.py + +def train_beta_codebook(rfn, xb_full, niter=10): + beta_centroids = [] + for sq in range(rfn.nsq): + d0, d1 = sq * rfn.dsub, (sq + 1) * rfn.dsub + print "training subquantizer %d/%d on dimensions %d:%d" % ( + sq, rfn.nsq, d0, d1) + beta_centroids_i = neighbors_kmeans( + xb_full[:, d0:d1], rfn, (xb_full.shape[0], rfn.M + 1, sq), + rfn.k, + ngpus=0, niter=niter) + beta_centroids.append(beta_centroids_i) + rfn.ntotal = 0 + rfn.codes.clear() + rfn.codebook.clear() + return np.stack(beta_centroids) diff --git a/core/src/index/thirdparty/faiss/build-aux/config.guess b/core/src/index/thirdparty/faiss/build-aux/config.guess new file mode 100755 index 0000000000..2193702b12 --- /dev/null +++ b/core/src/index/thirdparty/faiss/build-aux/config.guess @@ -0,0 +1,1473 @@ +#! 
/bin/sh +# Attempt to guess a canonical system name. +# Copyright 1992-2017 Free Software Foundation, Inc. + +timestamp='2017-05-27' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). +# +# Originally written by Per Bothner; maintained since 2000 by Ben Elliston. +# +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess +# +# Please send patches to . + + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright 1992-2017 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. 
+ +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +case "${UNAME_SYSTEM}" in +Linux|GNU|GNU/*) + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. + LIBC=gnu + + eval $set_cc_for_build + cat <<-EOF > $dummy.c + #include + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #else + LIBC=gnu + #endif + EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` + ;; +esac + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ + /sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || \ + echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + earmv*) + arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'` + endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'` + machine=${arch}${endian}-unknown + ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently (or will in the future) and ABI. 
+ case "${UNAME_MACHINE_ARCH}" in + earm*) + os=netbsdelf + ;; + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ELF__ + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # Determine ABI tags. + case "${UNAME_MACHINE_ARCH}" in + earm*) + expr='s/^earmv[0-9]/-eabi/;s/eb$//' + abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"` + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}${abi}" + exit ;; + *:Bitrig:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:LibertyBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:Sortix:*:*) + echo ${UNAME_MACHINE}-unknown-sortix + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE=alpha ;; + "EV4.5 (21064)") + UNAME_MACHINE=alpha ;; + "LCA4 (21066/21068)") + UNAME_MACHINE=alpha ;; + "EV5 (21164)") + UNAME_MACHINE=alphaev5 ;; + "EV5.6 (21164A)") + UNAME_MACHINE=alphaev56 ;; + "EV5.6 (21164PC)") + UNAME_MACHINE=alphapca56 ;; + "EV5.7 (21164PC)") + UNAME_MACHINE=alphapca57 ;; + "EV6 (21264)") + UNAME_MACHINE=alphaev6 ;; + "EV6.7 (21264A)") + UNAME_MACHINE=alphaev67 ;; + "EV6.8CB (21264C)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8AL (21264B)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8CX (21264D)") + UNAME_MACHINE=alphaev68 ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE=alphaev69 ;; + "EV7 (21364)") + UNAME_MACHINE=alphaev7 ;; + "EV7.9 (21364A)") + UNAME_MACHINE=alphaev79 ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. 
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + # Reset EXIT trap before exiting to avoid spurious non-zero exit code. + exitcode=$? + trap '' 0 + exit $exitcode ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm*:riscos:*:*|arm*:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + s390x:SunOS:*:*) + echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + echo i386-pc-auroraux${UNAME_RELEASE} + exit ;; + i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) + eval $set_cc_for_build + SUN_ARCH=i386 + # If there is a compiler, see if it is configured for 64-bit objects. + # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. + # This test works for both compilers. + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then + if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + SUN_ARCH=x86_64 + fi + fi + echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. 
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 
] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[4567]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/lslpp ] ; then + IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | + awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? 
) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 + 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH=hppa2.0n ;; + 64) HP_ARCH=hppa2.0w ;; + '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = hppa2.0w ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | + grep -q __LP64__ + then + HP_ARCH=hppa2.0w + else + HP_ARCH=hppa64 + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. 
*/ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + UNAME_PROCESSOR=`/usr/bin/uname -p` + case ${UNAME_PROCESSOR} in + amd64) + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; + esac + echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + *:MINGW64*:*) + echo 
${UNAME_MACHINE}-pc-mingw64 + exit ;; + *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + *:MSYS*:*) + echo ${UNAME_MACHINE}-pc-msys + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + *:Interix*:*) + case ${UNAME_MACHINE} in + x86) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + authenticamd | genuineintel | EM64T) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + IA64) + echo ia64-unknown-interix${UNAME_RELEASE} + exit ;; + esac ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + 8664:Windows_NT:*) + echo x86_64-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? + echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + aarch64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + aarch64_be:Linux:*:*) + UNAME_MACHINE=aarch64_be + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + arc:Linux:*:* | arceb:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + arm*:Linux:*:*) + eval $set_cc_for_build + if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_EABI__ + then + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + else + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi + else + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf + fi + fi + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + cris:Linux:*:*) + echo ${UNAME_MACHINE}-axis-linux-${LIBC} + exit ;; + crisv32:Linux:*:*) + echo ${UNAME_MACHINE}-axis-linux-${LIBC} + exit ;; + e2k:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + frv:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + hexagon:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + i*86:Linux:*:*) + echo ${UNAME_MACHINE}-pc-linux-${LIBC} + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + k1om:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + mips:Linux:*:* | mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef ${UNAME_MACHINE} + #undef ${UNAME_MACHINE}el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=${UNAME_MACHINE}el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=${UNAME_MACHINE} + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } + ;; + mips64el:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + openrisc*:Linux:*:*) + echo or1k-unknown-linux-${LIBC} + exit ;; + or32:Linux:*:* | or1k*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + padre:Linux:*:*) + echo sparc-unknown-linux-${LIBC} + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-${LIBC} + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; + PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; + *) echo hppa-unknown-linux-${LIBC} ;; + esac + exit ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-${LIBC} + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-${LIBC} + exit ;; + ppc64le:Linux:*:*) + echo powerpc64le-unknown-linux-${LIBC} + exit ;; + ppcle:Linux:*:*) + echo powerpcle-unknown-linux-${LIBC} + exit ;; + riscv32:Linux:*:* | riscv64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux-${LIBC} + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + tile*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-${LIBC} + exit ;; + x86_64:Linux:*:*) + echo ${UNAME_MACHINE}-pc-linux-${LIBC} + exit ;; + 
xtensa*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ exit ;;
+ i*86:DYNIX/ptx:4*:*)
+ # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+ # earlier versions are messed up and put the nodename in both
+ # sysname and nodename.
+ echo i386-sequent-sysv4
+ exit ;;
+ i*86:UNIX_SV:4.2MP:2.*)
+ # Unixware is an offshoot of SVR4, but it has its own version
+ # number series starting with 2...
+ # I am not positive that other SVR4 systems won't match this,
+ # I just have to hope. -- rms.
+ # Use sysv4.2uw... so that sysv4* matches it.
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+ exit ;;
+ i*86:OS/2:*:*)
+ # If we were able to find `uname', then EMX Unix compatibility
+ # is probably installed.
+ echo ${UNAME_MACHINE}-pc-os2-emx
+ exit ;;
+ i*86:XTS-300:*:STOP)
+ echo ${UNAME_MACHINE}-unknown-stop
+ exit ;;
+ i*86:atheos:*:*)
+ echo ${UNAME_MACHINE}-unknown-atheos
+ exit ;;
+ i*86:syllable:*:*)
+ echo ${UNAME_MACHINE}-pc-syllable
+ exit ;;
+ i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
+ echo i386-unknown-lynxos${UNAME_RELEASE}
+ exit ;;
+ i*86:*DOS:*:*)
+ echo ${UNAME_MACHINE}-pc-msdosdjgpp
+ exit ;;
+ i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+ UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+ if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+ echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+ else
+ echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+ fi
+ exit ;;
+ i*86:*:5:[678]*)
+ # UnixWare 7.x, OpenUNIX and OpenServer 6.
+ case `/bin/uname -X | grep "^Machine"` in
+ *486*) UNAME_MACHINE=i486 ;;
+ *Pentium) UNAME_MACHINE=i586 ;;
+ *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+ esac
+ echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+ exit ;;
+ i*86:*:3.2:*)
+ if test -f /usr/options/cb.name; then
+ UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+ echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+ elif /bin/uname -X 2>/dev/null >/dev/null ; then
+ UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+ (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+ (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+ && UNAME_MACHINE=i586
+ (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+ && UNAME_MACHINE=i686
+ (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+ && UNAME_MACHINE=i686
+ echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+ else
+ echo ${UNAME_MACHINE}-pc-sysv32
+ fi
+ exit ;;
+ pc:*:*:*)
+ # Left here for compatibility:
+ # uname -m prints for DJGPP always 'pc', but it prints nothing about
+ # the processor, so we play safe by assuming i586.
+ # Note: whatever this is, it MUST be the same as what config.sub
+ # prints for the "djgpp" host, or else GDB configure will decide that
+ # this is a cross-build.
+ echo i586-pc-msdosdjgpp
+ exit ;;
+ Intel:Mach:3*:*)
+ echo i386-pc-mach3
+ exit ;;
+ paragon:*:*:*)
+ echo i860-intel-osf1
+ exit ;;
+ i860:*:4.*:*) # i860-SVR4
+ if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+ echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+ else # Add other i860-SVR4 vendors below as they are discovered.
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + NCR*:*:4.2:* | MPRAS*:*:4.2:*) + OS_REL='.3' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + echo i586-pc-beos + exit ;; + BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
+ echo i586-pc-haiku + exit ;; + x86_64:Haiku:*:*) + echo x86_64-unknown-haiku + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + SX-8R:SUPER-UX:*:*) + echo sx8r-nec-superux${UNAME_RELEASE} + exit ;; + SX-ACE:SUPER-UX:*:*) + echo sxace-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + eval $set_cc_for_build + if test "$UNAME_PROCESSOR" = unknown ; then + UNAME_PROCESSOR=powerpc + fi + if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc + fi + fi + elif test "$UNAME_PROCESSOR" = i386 ; then + # Avoid executing cc on OS X 10.9, as it ships with a stub + # that puts up a graphical alert prompting to install + # developer tools. Any system running Mac OS X 10.7 or + # later (Darwin 11 and later) is required to have a 64-bit + # processor. This is not true of the ARM version of Darwin + # that Apple uses in portable devices. + UNAME_PROCESSOR=x86_64 + fi + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = x86; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NEO-*:NONSTOP_KERNEL:*:*) + echo neo-tandem-nsk${UNAME_RELEASE} + exit ;; + NSE-*:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-*:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + NSX-*:NONSTOP_KERNEL:*:*) + echo nsx-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = 386; then
+ UNAME_MACHINE=i386
+ else
+ UNAME_MACHINE="$cputype"
+ fi
+ echo ${UNAME_MACHINE}-unknown-plan9
+ exit ;;
+ *:TOPS-10:*:*)
+ echo pdp10-unknown-tops10
+ exit ;;
+ *:TENEX:*:*)
+ echo pdp10-unknown-tenex
+ exit ;;
+ KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+ echo pdp10-dec-tops20
+ exit ;;
+ XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+ echo pdp10-xkl-tops20
+ exit ;;
+ *:TOPS-20:*:*)
+ echo pdp10-unknown-tops20
+ exit ;;
+ *:ITS:*:*)
+ echo pdp10-unknown-its
+ exit ;;
+ SEI:*:*:SEIUX)
+ echo mips-sei-seiux${UNAME_RELEASE}
+ exit ;;
+ *:DragonFly:*:*)
+ echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit ;;
+ *:*VMS:*:*)
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
+ case "${UNAME_MACHINE}" in
+ A*) echo alpha-dec-vms ; exit ;;
+ I*) echo ia64-dec-vms ; exit ;;
+ V*) echo vax-dec-vms ; exit ;;
+ esac ;;
+ *:XENIX:*:SysV)
+ echo i386-pc-xenix
+ exit ;;
+ i*86:skyos:*:*)
+ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
+ exit ;;
+ i*86:rdos:*:*)
+ echo ${UNAME_MACHINE}-pc-rdos
+ exit ;;
+ i*86:AROS:*:*)
+ echo ${UNAME_MACHINE}-pc-aros
+ exit ;;
+ x86_64:VMkernel:*:*)
+ echo ${UNAME_MACHINE}-unknown-esx
+ exit ;;
+ amd64:Isilon\ OneFS:*:*)
+ echo x86_64-unknown-onefs
+ exit ;;
+esac
+
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script (version $timestamp), has failed to recognize the
+operating system you are using. If your script is old, overwrite
+config.guess and config.sub with the latest versions from:
+
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+and
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+
+If $0 has already been updated, send the following data and any
+information you think might be pertinent to config-patches@gnu.org to
+provide the necessary information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo = `(hostinfo) 2>/dev/null`
+/bin/universe = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/core/src/index/thirdparty/faiss/build-aux/config.sub b/core/src/index/thirdparty/faiss/build-aux/config.sub
new file mode 100755
index 0000000000..40ea5dfe11
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/build-aux/config.sub
@@ -0,0 +1,1836 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+# Copyright 1992-2017 Free Software Foundation, Inc.
+
+timestamp='2017-04-02'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.
This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). + + +# Please send patches to . +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS + +Canonicalize a configuration name. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.sub ($timestamp) + +Copyright 1992-2017 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo $1 + exit ;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ + linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ + knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ + kopensolaris*-gnu* | cloudabi*-eabi* | \ + storm-chaos* | os2-emx* | rtmk-nova*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + android-linux) + os=-linux-android + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. 
We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. + ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis | -knuth | -cray | -microblaze*) + os= + basic_machine=$1 + ;; + -bluegene*) + os=-cnk + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -chorusos*) + os=-chorusos + basic_machine=$1 + ;; + -chorusrdb) + os=-chorusrdb + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco6) + os=-sco5v6 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5v6*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*178) + os=-lynxos178 + ;; + -lynx*5) + os=-lynxos5 + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. 
+ 1750a | 580 \ + | a29k \ + | aarch64 | aarch64_be \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ + | am33_2.0 \ + | arc | arceb \ + | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ + | avr | avr32 \ + | ba \ + | be32 | be64 \ + | bfin \ + | c4x | c8051 | clipper \ + | d10v | d30v | dlx | dsp16xx \ + | e2k | epiphany \ + | fido | fr30 | frv | ft32 \ + | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | hexagon \ + | i370 | i860 | i960 | ia16 | ia64 \ + | ip2k | iq2000 \ + | k1om \ + | le32 | le64 \ + | lm32 \ + | m32c | m32r | m32rle | m68000 | m68k | m88k \ + | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64el \ + | mips64octeon | mips64octeonel \ + | mips64orion | mips64orionel \ + | mips64r5900 | mips64r5900el \ + | mips64vr | mips64vrel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa32r6 | mipsisa32r6el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64r6 | mipsisa64r6el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipsr5900 | mipsr5900el \ + | mipstx39 | mipstx39el \ + | mn10200 | mn10300 \ + | moxie \ + | mt \ + | msp430 \ + | nds32 | nds32le | nds32be \ + | nios | nios2 | nios2eb | nios2el \ + | ns16k | ns32k \ + | open8 | or1k | or1knd | or32 \ + | pdp10 | pdp11 | pj | pjl \ + | powerpc | powerpc64 | powerpc64le | powerpcle \ + | pru \ + | pyramid \ + | riscv32 | riscv64 \ + | rl78 | rx \ + | score \ + | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh64 | sh64le \ + | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ + | spu \ + | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ + | ubicom32 \ + | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ + | visium \ + | wasm32 \ + | we32k \ + | x86 | xc16x | xstormy16 | xtensa \ + | z8k | z80) + basic_machine=$basic_machine-unknown + ;; + c54x) + basic_machine=tic54x-unknown + ;; + c55x) + basic_machine=tic55x-unknown + ;; + c6x) + basic_machine=tic6x-unknown + ;; + leon|leon[3-9]) + basic_machine=sparc-$basic_machine + ;; + m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + ;; + ms1) + basic_machine=mt-unknown + ;; + + strongarm | thumb | xscale) + basic_machine=arm-unknown + ;; + xgate) + basic_machine=$basic_machine-unknown + os=-none + ;; + xscaleeb) + basic_machine=armeb-unknown + ;; + + xscaleel) + basic_machine=armel-unknown + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. 
+ 580-* \ + | a29k-* \ + | aarch64-* | aarch64_be-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ + | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ + | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ + | avr-* | avr32-* \ + | ba-* \ + | be32-* | be64-* \ + | bfin-* | bs2000-* \ + | c[123]* | c30-* | [cjt]90-* | c4x-* \ + | c8051-* | clipper-* | craynv-* | cydra-* \ + | d10v-* | d30v-* | dlx-* \ + | e2k-* | elxsi-* \ + | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ + | h8300-* | h8500-* \ + | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ + | hexagon-* \ + | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ + | ip2k-* | iq2000-* \ + | k1om-* \ + | le32-* | le64-* \ + | lm32-* \ + | m32c-* | m32r-* | m32rle-* \ + | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ + | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ + | microblaze-* | microblazeel-* \ + | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ + | mips16-* \ + | mips64-* | mips64el-* \ + | mips64octeon-* | mips64octeonel-* \ + | mips64orion-* | mips64orionel-* \ + | mips64r5900-* | mips64r5900el-* \ + | mips64vr-* | mips64vrel-* \ + | mips64vr4100-* | mips64vr4100el-* \ + | mips64vr4300-* | mips64vr4300el-* \ + | mips64vr5000-* | mips64vr5000el-* \ + | mips64vr5900-* | mips64vr5900el-* \ + | mipsisa32-* | mipsisa32el-* \ + | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa32r6-* | mipsisa32r6el-* \ + | mipsisa64-* | mipsisa64el-* \ + | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64r6-* | mipsisa64r6el-* \ + | mipsisa64sb1-* | mipsisa64sb1el-* \ + | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipsr5900-* | mipsr5900el-* \ + | mipstx39-* | mipstx39el-* \ + | mmix-* \ + | mt-* \ + | msp430-* \ + | nds32-* | nds32le-* | nds32be-* \ + | nios-* | nios2-* | nios2eb-* | nios2el-* \ + | none-* | np1-* | ns16k-* | ns32k-* \ + | open8-* \ + | or1k*-* \ + | orion-* \ + | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ + | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ + | pru-* \ + | pyramid-* \ + | riscv32-* | riscv64-* \ + | rl78-* | romp-* | rs6000-* | rx-* \ + | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ + | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ + | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ + | sparclite-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ + | tahoe-* \ + | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ + | tile*-* \ + | tron-* \ + | ubicom32-* \ + | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ + | vax-* \ + | visium-* \ + | wasm32-* \ + | we32k-* \ + | x86-* | x86_64-* | xc16x-* | xps100-* \ + | xstormy16-* | xtensa*-* \ + | ymp-* \ + | z8k-* | z80-*) + ;; + # Recognize the basic CPU types without company name, with glob match. + xtensa*) + basic_machine=$basic_machine-unknown + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + abacus) + basic_machine=abacus-unknown + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amd64) + basic_machine=x86_64-pc + ;; + amd64-*) + basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aros) + basic_machine=i386-pc + os=-aros + ;; + asmjs) + basic_machine=asmjs-unknown + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + blackfin) + basic_machine=bfin-unknown + os=-linux + ;; + blackfin-*) + basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + bluegene*) + basic_machine=powerpc-ibm + os=-cnk + ;; + c54x-*) + basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c55x-*) + basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c6x-*) + basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c90) + basic_machine=c90-cray + os=-unicos + ;; + cegcc) + basic_machine=arm-unknown + os=-cegcc + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | j90) + basic_machine=j90-cray + os=-unicos + ;; + craynv) + basic_machine=craynv-cray + os=-unicosmp + ;; + cr16 | cr16-*) + basic_machine=cr16-unknown + os=-elf + ;; + crds | unos) + basic_machine=m68k-crds + ;; + crisv32 | crisv32-* | etraxfs*) + basic_machine=crisv32-axis + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + crx) + basic_machine=crx-unknown + os=-elf + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + decsystem10* | dec10*) + basic_machine=pdp10-dec + os=-tops10 + ;; + decsystem20* | dec20*) + basic_machine=pdp10-dec + os=-tops20 + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + dicos) + basic_machine=i686-pc + os=-dicos + ;; + djgpp) + basic_machine=i586-pc + os=-msdosdjgpp + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + e500v[12]) + basic_machine=powerpc-unknown + os=$os"spe" + ;; + e500v[12]-*) + basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + os=$os"spe" + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + 
;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) + basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; + i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + leon-*|leon[3-9]-*) + basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` + ;; + m68knommu) + basic_machine=m68k-unknown + os=-linux + ;; + m68knommu-*) + basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + microblaze*) + basic_machine=microblaze-xilinx + ;; + mingw64) + basic_machine=x86_64-pc + os=-mingw64 + ;; + mingw32) + basic_machine=i686-pc + os=-mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + os=-mingw32ce + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + morphos) + basic_machine=powerpc-unknown + os=-morphos + ;; + moxiebox) + basic_machine=moxie-unknown + os=-moxiebox + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + ms1-*) + basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + ;; + msys) + basic_machine=i686-pc + os=-msys + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + nacl) + basic_machine=le32-unknown + os=-nacl + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + 
os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + neo-tandem) + basic_machine=neo-tandem + ;; + nse-tandem) + basic_machine=nse-tandem + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + nsx-tandem) + basic_machine=nsx-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + openrisc | openrisc-*) + basic_machine=or32-unknown + ;; + os400) + basic_machine=powerpc-ibm + os=-os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + parisc) + basic_machine=hppa-unknown + os=-linux + ;; + parisc-*) + basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pc98) + basic_machine=i386-pc + ;; + pc98-*) + basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium | p5 | k5 | k6 | nexgen | viac3) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon | athlon_*) + basic_machine=i686-pc + ;; + pentiumii | pentium2 | pentiumiii | pentium3) + basic_machine=i686-pc + ;; + pentium4) + basic_machine=i786-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium4-*) + basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc | ppcbe) basic_machine=powerpc-unknown + ;; + ppc-* | ppcbe-*) + basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64) basic_machine=powerpc64-unknown + ;; + ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64le | powerpc64little) + basic_machine=powerpc64le-unknown + ;; + ppc64le-* | powerpc64little-*) + basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rdos | rdos64) + basic_machine=x86_64-pc + os=-rdos + ;; + rdos32) + basic_machine=i386-pc + os=-rdos + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + s390 | s390-*) + 
basic_machine=s390-ibm + ;; + s390x | s390x-*) + basic_machine=s390x-ibm + ;; + sa29200) + basic_machine=a29k-amd + os=-udi + ;; + sb1) + basic_machine=mipsisa64sb1-unknown + ;; + sb1el) + basic_machine=mipsisa64sb1el-unknown + ;; + sde) + basic_machine=mipsisa32-sde + os=-elf + ;; + sei) + basic_machine=mips-sei + os=-seiux + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sh5el) + basic_machine=sh5le-unknown + ;; + sh64) + basic_machine=sh64-unknown + ;; + sparclite-wrs | simso-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + strongarm-* | thumb-*) + basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=alphaev5-cray + os=-unicos + ;; + t90) + basic_machine=t90-cray + os=-unicos + ;; + tile*) + basic_machine=$basic_machine-unknown + os=-linux-gnu + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + toad1) + basic_machine=pdp10-xkl + os=-tops20 + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + tpf) + basic_machine=s390x-ibm + os=-tpf + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + wasm32) + basic_machine=wasm32-unknown + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xbox) + basic_machine=i686-pc + os=-mingw32 + ;; + xps | xps100) + basic_machine=xps100-honeywell + ;; + xscale-* | xscalee[bl]-*) + basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` + ;; + ymp) + basic_machine=ymp-cray + os=-unicos + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + z80-*-coff) + basic_machine=z80-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. 
+ w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + romp) + basic_machine=romp-ibm + ;; + mmix) + basic_machine=mmix-knuth + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) + basic_machine=sh-unknown + ;; + sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. + -auroraux) + os=-auroraux + ;; + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. 
+ -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ + | -sym* | -kopensolaris* | -plan9* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* | -aros* | -cloudabi* | -sortix* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ + | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ + | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ + | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ + | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ + | -linux-newlib* | -linux-musl* | -linux-uclibc* \ + | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ + | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ + | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ + | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox*) + # Remember, each alternative MUST END IN *, to match a version number. + ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto-qnx*) + ;; + -nto*) + os=`echo $os | sed -e 's|nto|nto-qnx|'` + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux-dietlibc) + os=-linux-dietlibc + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -os400*) + os=-os400 + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -atheos*) + os=-atheos + ;; + -syllable*) + os=-syllable + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -nova*) + os=-rtmk-nova + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -tpf*) + os=-tpf + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -aros*) + os=-aros + ;; + -zvmoe) + os=-zvmoe + ;; + -dicos*) + os=-dicos + ;; + -nacl*) + ;; + -ios) + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. 
+ os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + score-*) + os=-elf + ;; + spu-*) + os=-elf + ;; + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + c4x-* | tic4x-*) + os=-coff + ;; + c8051-*) + os=-elf + ;; + hexagon-*) + os=-elf + ;; + tic54x-*) + os=-coff + ;; + tic55x-*) + os=-coff + ;; + tic6x-*) + os=-coff + ;; + # This must come before the *-dec entry. + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + ;; + m68*-cisco) + os=-aout + ;; + mep-*) + os=-elf + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + or32-*) + os=-coff + ;; + *-tti) # must be before sparc entry or we get the wrong os. + os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + pru-*) + os=-elf + ;; + *-be) + os=-beos + ;; + *-haiku) + os=-haiku + ;; + *-ibm) + os=-aix + ;; + *-knuth) + os=-mmixware + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. 
+vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -cnk*|-aix*) + vendor=ibm + ;; + -beos*) + vendor=be + ;; + -hpux*) + vendor=hp + ;; + -mpeix*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs* | -opened*) + vendor=ibm + ;; + -os400*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -tpf*) + vendor=ibm + ;; + -vxsim* | -vxworks* | -windiss*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + -hms*) + vendor=hitachi + ;; + -mpw* | -macos*) + vendor=apple + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + vendor=atari + ;; + -vos*) + vendor=stratus + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os +exit + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/core/src/index/thirdparty/faiss/build-aux/install-sh b/core/src/index/thirdparty/faiss/build-aux/install-sh new file mode 100755 index 0000000000..0360b79e7d --- /dev/null +++ b/core/src/index/thirdparty/faiss/build-aux/install-sh @@ -0,0 +1,501 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2016-01-11.22; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# 'make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. + +tab=' ' +nl=' +' +IFS=" $tab$nl" + +# Set DOITPROG to "echo" to test this script. + +doit=${DOITPROG-} +doit_exec=${doit:-exec} + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. 
+ +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +is_target_a_directory=possibly + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *"$tab"* | *"$nl"* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) + is_target_a_directory=always + dst_arg=$2 + # Protect names problematic for 'test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + shift;; + + -T) is_target_a_directory=never;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +# We allow the use of options -d and -T together, by making -d +# take the precedence; this is for compatibility with GNU install. + +if test -n "$dir_arg"; then + if test -n "$dst_arg"; then + echo "$0: target directory not allowed when installing a directory." >&2 + exit 1 + fi +fi + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + # Protect names problematic for 'test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call 'install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + if test $# -gt 1 || test "$is_target_a_directory" = always; then + if test ! -d "$dst_arg"; then + echo "$0: $dst_arg: Is not a directory." 
>&2 + exit 1 + fi + fi +fi + +if test -z "$dir_arg"; then + do_exit='(exit $ret); exit $ret' + trap "ret=129; $do_exit" 1 + trap "ret=130; $do_exit" 2 + trap "ret=141; $do_exit" 13 + trap "ret=143; $do_exit" 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. + *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names problematic for 'test' and other utilities. + case $src in + -* | [=\(\)!]) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + dst=$dst_arg + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test "$is_target_a_directory" = never; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + dstdir=`dirname "$dst"` + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. + ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. 
+ rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + [-=\(\)!]*) prefix='./';; + *) prefix='';; + esac + + oIFS=$IFS + IFS=/ + set -f + set fnord $dstdir + shift + set +f + IFS=$oIFS + + prefixes= + + for d + do + test X"$d" = X && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + set +f && + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. + $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! 
-f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC0" +# time-stamp-end: "; # UTC" +# End: diff --git a/core/src/index/thirdparty/faiss/build.sh b/core/src/index/thirdparty/faiss/build.sh new file mode 100755 index 0000000000..9dbdd9ea73 --- /dev/null +++ b/core/src/index/thirdparty/faiss/build.sh @@ -0,0 +1,2 @@ +./configure CPUFLAGS='-mavx -mf16c -msse4 -mpopcnt' CXXFLAGS='-O0 -g -fPIC -m64 -Wno-sign-compare -Wall -Wextra' --prefix=$PWD --with-cuda-arch=-gencode=arch=compute_75,code=sm_75 --with-cuda=/usr/local/cuda +make install -j diff --git a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp new file mode 100644 index 0000000000..008d6f8482 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp @@ -0,0 +1,93 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include +#include "AutoTune.h" +#include "AutoTune_c.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::ParameterRange; +using faiss::ParameterSpace; + +/** Build and index with the sequence of processing steps described in + * the string. 
+ */ +int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric) { + try { + *p_index = reinterpret_cast(faiss::index_factory( + d, description, static_cast(metric))); + } CATCH_AND_HANDLE +} + +const char* faiss_ParameterRange_name(const FaissParameterRange* range) { + return reinterpret_cast(range)->name.c_str(); +} + +void faiss_ParameterRange_values(FaissParameterRange* range, double** p_values, size_t* p_size) { + auto& values = reinterpret_cast(range)->values; + *p_values = values.data(); + *p_size = values.size(); +} + +int faiss_ParameterSpace_new(FaissParameterSpace** space) { + try { + auto new_space = new ParameterSpace(); + *space = reinterpret_cast(new_space); + } CATCH_AND_HANDLE +} + +DEFINE_DESTRUCTOR(ParameterSpace) + +size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace* space) { + return reinterpret_cast(space)->n_combinations(); +} + +int faiss_ParameterSpace_combination_name(const FaissParameterSpace* space, size_t cno, char* char_buffer, size_t size) { + try { + auto rep = reinterpret_cast(space)->combination_name(cno); + strncpy(char_buffer, rep.c_str(), size); + } CATCH_AND_HANDLE +} + +int faiss_ParameterSpace_set_index_parameters(const FaissParameterSpace* space, FaissIndex* cindex, const char* param_string) { + try { + auto index = reinterpret_cast(cindex); + reinterpret_cast(space)->set_index_parameters(index, param_string); + } CATCH_AND_HANDLE +} + +/// set a combination of parameters on an index +int faiss_ParameterSpace_set_index_parameters_cno(const FaissParameterSpace* space, FaissIndex* cindex, size_t cno) { + try { + auto index = reinterpret_cast(cindex); + reinterpret_cast(space)->set_index_parameters(index, cno); + } CATCH_AND_HANDLE +} + +int faiss_ParameterSpace_set_index_parameter(const FaissParameterSpace* space, FaissIndex* cindex, const char * name, double value) { + try { + auto index = reinterpret_cast(cindex); + reinterpret_cast(space)->set_index_parameter(index, name, value); + } CATCH_AND_HANDLE +} + +void faiss_ParameterSpace_display(const FaissParameterSpace* space) { + reinterpret_cast(space)->display(); +} + +int faiss_ParameterSpace_add_range(FaissParameterSpace* space, const char* name, FaissParameterRange** p_range) { + try { + ParameterRange& range = reinterpret_cast(space)->add_range(name); + if (p_range) { + *p_range = reinterpret_cast(&range); + } + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h new file mode 100644 index 0000000000..908f355a4d --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h @@ -0,0 +1,69 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_AUTO_TUNE_C_H +#define FAISS_AUTO_TUNE_C_H + +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Build and index with the sequence of processing steps described in + * the string. + */ +int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric); + +/// possible values of a parameter, sorted from least to most expensive/accurate +FAISS_DECLARE_CLASS(ParameterRange) + +FAISS_DECLARE_GETTER(ParameterRange, const char*, name) + +/// Getter for the values in the range. 
The output values are invalidated +/// upon any other modification of the range. +void faiss_ParameterRange_values(FaissParameterRange*, double**, size_t*); + +/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters. + */ +FAISS_DECLARE_CLASS(ParameterSpace) + +/// Parameter space default constructor +int faiss_ParameterSpace_new(FaissParameterSpace** space); + +/// nb of combinations, = product of values sizes +size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace*); + +/// get string representation of the combination +/// by writing it to the given character buffer. +/// A buffer size of 1000 ensures that the full name is collected. +int faiss_ParameterSpace_combination_name(const FaissParameterSpace*, size_t, char*, size_t); + +/// set a combination of parameters described by a string +int faiss_ParameterSpace_set_index_parameters(const FaissParameterSpace*, FaissIndex*, const char *); + +/// set a combination of parameters on an index +int faiss_ParameterSpace_set_index_parameters_cno(const FaissParameterSpace*, FaissIndex*, size_t); + +/// set one of the parameters +int faiss_ParameterSpace_set_index_parameter(const FaissParameterSpace*, FaissIndex*, const char *, double); + +/// print a description on stdout +void faiss_ParameterSpace_display(const FaissParameterSpace*); + +/// add a new parameter (or return it if it exists) +int faiss_ParameterSpace_add_range(FaissParameterSpace*, const char*, FaissParameterRange**); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp b/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp new file mode 100644 index 0000000000..d0a0d380ee --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp @@ -0,0 +1,193 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
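The ParameterSpace API above pairs naturally with `faiss_index_factory`; the sketch below is illustrative only: the factory string, dimension, and the `nprobe=16` value are placeholders, and error handling follows the convention from `error_c.h`.

```c
#include <stdio.h>
#include "AutoTune_c.h"
#include "Index_c.h"
#include "error_c.h"

int tune_nprobe(void) {
    FaissIndex* index = NULL;
    /* "IVF1024,Flat" and d = 64 are illustrative values. */
    if (faiss_index_factory(&index, 64, "IVF1024,Flat", METRIC_L2)) {
        printf("%s\n", faiss_get_last_error());
        return -1;
    }

    FaissParameterSpace* space = NULL;
    if (faiss_ParameterSpace_new(&space)) {
        faiss_Index_free(index);
        return -1;
    }

    /* Apply a parameter combination described as a string. */
    if (faiss_ParameterSpace_set_index_parameters(space, index, "nprobe=16")) {
        printf("%s\n", faiss_get_last_error());
    }
    faiss_ParameterSpace_display(space); /* print a description on stdout */

    /* faiss_ParameterSpace_free comes from DEFINE_DESTRUCTOR(ParameterSpace). */
    faiss_ParameterSpace_free(space);
    faiss_Index_free(index);
    return 0;
}
```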
+// -*- c++ -*- + +#include "AuxIndexStructures_c.h" +#include "AuxIndexStructures.h" +#include "macros_impl.h" +#include + +using faiss::BufferList; +using faiss::IDSelector; +using faiss::IDSelectorBatch; +using faiss::IDSelectorRange; +using faiss::RangeSearchResult; +using faiss::RangeSearchPartialResult; +using faiss::RangeQueryResult; + +DEFINE_GETTER(RangeSearchResult, size_t, nq) + +int faiss_RangeSearchResult_new(FaissRangeSearchResult** p_rsr, idx_t nq) { + try { + *p_rsr = reinterpret_cast( + new RangeSearchResult(nq)); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_RangeSearchResult_new_with(FaissRangeSearchResult** p_rsr, idx_t nq, int alloc_lims) { + try { + *p_rsr = reinterpret_cast( + new RangeSearchResult(nq, static_cast(alloc_lims))); + return 0; + } CATCH_AND_HANDLE +} + +/// called when lims contains the nb of elements result entries +/// for each query +int faiss_RangeSearchResult_do_allocation(FaissRangeSearchResult* rsr) { + try { + reinterpret_cast(rsr)->do_allocation(); + return 0; + } CATCH_AND_HANDLE +} + +DEFINE_DESTRUCTOR(RangeSearchResult) + +/// getter for buffer_size +DEFINE_GETTER(RangeSearchResult, size_t, buffer_size) + +/// getter for lims: size (nq + 1) +void faiss_RangeSearchResult_lims(FaissRangeSearchResult* rsr, size_t** lims) { + *lims = reinterpret_cast(rsr)->lims; +} + +/// getter for labels and respective distances (not sorted): +/// result for query i is labels[lims[i]:lims[i+1]] +void faiss_RangeSearchResult_labels(FaissRangeSearchResult* rsr, idx_t** labels, float** distances) { + auto sr = reinterpret_cast(rsr); + *labels = sr->labels; + *distances = sr->distances; +} + +DEFINE_DESTRUCTOR(IDSelector) + +int faiss_IDSelector_is_member(const FaissIDSelector* sel, idx_t id) { + return reinterpret_cast(sel)->is_member(id); +} + +DEFINE_DESTRUCTOR(IDSelectorRange) + +DEFINE_GETTER(IDSelectorRange, idx_t, imin) +DEFINE_GETTER(IDSelectorRange, idx_t, imax) + +int faiss_IDSelectorRange_new(FaissIDSelectorRange** p_sel, idx_t imin, idx_t imax) { + try { + *p_sel = reinterpret_cast( + new IDSelectorRange(imin, imax) + ); + return 0; + } CATCH_AND_HANDLE +} + +DEFINE_GETTER(IDSelectorBatch, int, nbits) +DEFINE_GETTER(IDSelectorBatch, idx_t, mask) + +int faiss_IDSelectorBatch_new(FaissIDSelectorBatch** p_sel, size_t n, const idx_t* indices) { + try { + *p_sel = reinterpret_cast( + new IDSelectorBatch(n, indices) + ); + return 0; + } CATCH_AND_HANDLE +} + +// Below are structures used only by Index implementations + +DEFINE_DESTRUCTOR(BufferList) + +DEFINE_GETTER(BufferList, size_t, buffer_size) +DEFINE_GETTER(BufferList, size_t, wp) + +int faiss_BufferList_append_buffer(FaissBufferList* bl) { + try { + reinterpret_cast(bl)->append_buffer(); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_BufferList_new(FaissBufferList** p_bl, size_t buffer_size) { + try { + *p_bl = reinterpret_cast( + new BufferList(buffer_size) + ); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_BufferList_add(FaissBufferList* bl, idx_t id, float dis) { + try { + reinterpret_cast(bl)->add(id, dis); + return 0; + } CATCH_AND_HANDLE +} + +/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to +/// tables dest_ids, dest_dis +int faiss_BufferList_copy_range( + FaissBufferList* bl, size_t ofs, size_t n, idx_t *dest_ids, float *dest_dis) { + try { + reinterpret_cast(bl)->copy_range(ofs, n, dest_ids, dest_dis); + return 0; + } CATCH_AND_HANDLE +} + +DEFINE_GETTER(RangeQueryResult, idx_t, qno) +DEFINE_GETTER(RangeQueryResult, size_t, nres) 
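The `lims`/`labels` layout documented above can be read back as in the following minimal sketch; it assumes `faiss_Index_range_search` from `Index_c.h` is the call that fills the result object, and that `index`, `queries`, and `radius` are supplied by the caller.

```c
#include <stdio.h>
#include "Index_c.h"
#include "AuxIndexStructures_c.h"

/* Sketch only: faiss_Index_range_search is assumed to be the range-search
 * entry point declared in Index_c.h. */
void print_range_results(FaissIndex* index, idx_t nq, const float* queries, float radius) {
    FaissRangeSearchResult* result = NULL;
    if (faiss_RangeSearchResult_new(&result, nq)) return;

    if (faiss_Index_range_search(index, nq, queries, radius, result) == 0) {
        size_t* lims = NULL;      /* size nq + 1 */
        idx_t* labels = NULL;     /* result for query i: labels[lims[i]:lims[i+1]] */
        float* distances = NULL;  /* same layout as labels, not sorted */
        faiss_RangeSearchResult_lims(result, &lims);
        faiss_RangeSearchResult_labels(result, &labels, &distances);

        for (idx_t i = 0; i < nq; i++) {
            for (size_t j = lims[i]; j < lims[i + 1]; j++) {
                printf("query %lld -> id %lld (distance %f)\n",
                       (long long)i, (long long)labels[j], distances[j]);
            }
        }
    }
    faiss_RangeSearchResult_free(result);
}
```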
+DEFINE_GETTER_PERMISSIVE(RangeQueryResult, FaissRangeSearchPartialResult*, pres) + +int faiss_RangeQueryResult_add(FaissRangeQueryResult* qr, float dis, idx_t id) { + try { + reinterpret_cast(qr)->add(dis, id); + return 0; + } CATCH_AND_HANDLE +} + +DEFINE_GETTER_PERMISSIVE(RangeSearchPartialResult, FaissRangeSearchResult*, res) + +int faiss_RangeSearchPartialResult_new( + FaissRangeSearchPartialResult** p_res, FaissRangeSearchResult* res_in) { + try { + *p_res = reinterpret_cast( + new RangeSearchPartialResult( + reinterpret_cast(res_in)) + ); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_RangeSearchPartialResult_finalize( + FaissRangeSearchPartialResult* res) { + try { + reinterpret_cast(res)->finalize(); + return 0; + } CATCH_AND_HANDLE +} + +/// called by range_search before do_allocation +int faiss_RangeSearchPartialResult_set_lims( + FaissRangeSearchPartialResult* res) { + try { + reinterpret_cast(res)->set_lims(); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_RangeSearchPartialResult_new_result( + FaissRangeSearchPartialResult* res, idx_t qno, FaissRangeQueryResult** qr) { + + try { + auto q = + &reinterpret_cast(res)->new_result(qno); + if (qr) { + *qr = reinterpret_cast(&q); + } + return 0; + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h b/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h new file mode 100644 index 0000000000..ebcbc1cc34 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h @@ -0,0 +1,133 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_AUX_INDEX_STRUCTURES_C_H +#define FAISS_AUX_INDEX_STRUCTURES_C_H + +#include "Index_c.h" +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +FAISS_DECLARE_CLASS(RangeSearchResult) + +FAISS_DECLARE_GETTER(RangeSearchResult, size_t, nq) + +int faiss_RangeSearchResult_new(FaissRangeSearchResult** p_rsr, idx_t nq); + +int faiss_RangeSearchResult_new_with(FaissRangeSearchResult** p_rsr, idx_t nq, int alloc_lims); + +/// called when lims contains the nb of elements result entries +/// for each query +int faiss_RangeSearchResult_do_allocation(FaissRangeSearchResult* rsr); + +FAISS_DECLARE_DESTRUCTOR(RangeSearchResult) + +/// getter for buffer_size +FAISS_DECLARE_GETTER(RangeSearchResult, size_t, buffer_size) + +/// getter for lims: size (nq + 1) +void faiss_RangeSearchResult_lims( + FaissRangeSearchResult* rsr, size_t** lims); + +/// getter for labels and respective distances (not sorted): +/// result for query i is labels[lims[i]:lims[i+1]] +void faiss_RangeSearchResult_labels( + FaissRangeSearchResult* rsr, idx_t** labels, float** distances); + + +/** Encapsulates a set of ids to remove. */ +FAISS_DECLARE_CLASS(IDSelector) +FAISS_DECLARE_DESTRUCTOR(IDSelector) + +int faiss_IDSelector_is_member(const FaissIDSelector* sel, idx_t id); + +/** remove ids between [imni, imax) */ +FAISS_DECLARE_CLASS(IDSelectorRange) +FAISS_DECLARE_DESTRUCTOR(IDSelectorRange) + +FAISS_DECLARE_GETTER(IDSelectorRange, idx_t, imin) +FAISS_DECLARE_GETTER(IDSelectorRange, idx_t, imax) + +int faiss_IDSelectorRange_new(FaissIDSelectorRange** p_sel, idx_t imin, idx_t imax); + +/** Remove ids from a set. Repetitions of ids in the indices set + * passed to the constructor does not hurt performance. 
The hash + * function used for the bloom filter and GCC's implementation of + * unordered_set are just the least significant bits of the id. This + * works fine for random ids or ids in sequences but will produce many + * hash collisions if lsb's are always the same */ +FAISS_DECLARE_CLASS(IDSelectorBatch) + +FAISS_DECLARE_GETTER(IDSelectorBatch, int, nbits) +FAISS_DECLARE_GETTER(IDSelectorBatch, idx_t, mask) + +int faiss_IDSelectorBatch_new(FaissIDSelectorBatch** p_sel, size_t n, const idx_t* indices); + +// Below are structures used only by Index implementations + +/** List of temporary buffers used to store results before they are + * copied to the RangeSearchResult object. */ +FAISS_DECLARE_CLASS(BufferList) +FAISS_DECLARE_DESTRUCTOR(BufferList) + +FAISS_DECLARE_GETTER(BufferList, size_t, buffer_size) +FAISS_DECLARE_GETTER(BufferList, size_t, wp) + +typedef struct FaissBuffer { + idx_t *ids; + float *dis; +} FaissBuffer; + +int faiss_BufferList_append_buffer(FaissBufferList* bl); + +int faiss_BufferList_new(FaissBufferList** p_bl, size_t buffer_size); + +int faiss_BufferList_add(FaissBufferList* bl, idx_t id, float dis); + +/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to +/// tables dest_ids, dest_dis +int faiss_BufferList_copy_range( + FaissBufferList* bl, size_t ofs, size_t n, idx_t *dest_ids, float *dest_dis); + +/// the entries in the buffers are split per query +FAISS_DECLARE_CLASS(RangeSearchPartialResult) + +/// result structure for a single query +FAISS_DECLARE_CLASS(RangeQueryResult) +FAISS_DECLARE_GETTER(RangeQueryResult, idx_t, qno) +FAISS_DECLARE_GETTER(RangeQueryResult, size_t, nres) +FAISS_DECLARE_GETTER(RangeQueryResult, FaissRangeSearchPartialResult*, pres) + +int faiss_RangeQueryResult_add(FaissRangeQueryResult* qr, float dis, idx_t id); + + +FAISS_DECLARE_GETTER(RangeSearchPartialResult, FaissRangeSearchResult*, res) + +int faiss_RangeSearchPartialResult_new( + FaissRangeSearchPartialResult** p_res, FaissRangeSearchResult* res_in); + +int faiss_RangeSearchPartialResult_finalize( + FaissRangeSearchPartialResult* res); + +/// called by range_search before do_allocation +int faiss_RangeSearchPartialResult_set_lims( + FaissRangeSearchPartialResult* res); + +int faiss_RangeSearchPartialResult_new_result( + FaissRangeSearchPartialResult* res, idx_t qno, FaissRangeQueryResult** qr); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp b/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp new file mode 100644 index 0000000000..1687ed1e45 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp @@ -0,0 +1,139 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
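Since `IDSelector` objects encapsulate a set of ids to remove, the typical use is to hand them to the index's removal call. In the sketch below, `faiss_Index_remove_ids` and its signature are assumed from `Index_c.h`, and the id range 100..200 is illustrative.

```c
#include <stdio.h>
#include "Index_c.h"
#include "AuxIndexStructures_c.h"
#include "error_c.h"

/* Sketch only: faiss_Index_remove_ids is assumed to be the removal entry
 * point declared in Index_c.h. */
int remove_id_range(FaissIndex* index) {
    FaissIDSelectorRange* range = NULL;
    if (faiss_IDSelectorRange_new(&range, 100, 200)) {
        printf("%s\n", faiss_get_last_error());
        return -1;
    }

    /* The range behaves like any other IDSelector. */
    printf("id 150 selected: %d\n",
           faiss_IDSelector_is_member((const FaissIDSelector*)range, 150));

    size_t n_removed = 0;
    int rc = faiss_Index_remove_ids(index, (const FaissIDSelector*)range, &n_removed);
    if (rc == 0) {
        printf("removed %zu vectors\n", n_removed);
    }
    faiss_IDSelectorRange_free(range);
    return rc;
}
```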
+// -*- c++ -*- + +#include "Clustering_c.h" +#include "Clustering.h" +#include "Index.h" +#include +#include "macros_impl.h" + +extern "C" { + +using faiss::Clustering; +using faiss::ClusteringParameters; +using faiss::Index; + +DEFINE_GETTER(Clustering, int, niter) +DEFINE_GETTER(Clustering, int, nredo) +DEFINE_GETTER(Clustering, int, verbose) +DEFINE_GETTER(Clustering, int, spherical) +DEFINE_GETTER(Clustering, int, update_index) +DEFINE_GETTER(Clustering, int, frozen_centroids) + +DEFINE_GETTER(Clustering, int, min_points_per_centroid) +DEFINE_GETTER(Clustering, int, max_points_per_centroid) + +DEFINE_GETTER(Clustering, int, seed) + +/// getter for d +DEFINE_GETTER(Clustering, size_t, d) + +/// getter for k +DEFINE_GETTER(Clustering, size_t, k) + +void faiss_ClusteringParameters_init(FaissClusteringParameters* params) { + ClusteringParameters d; + params->frozen_centroids = d.frozen_centroids; + params->max_points_per_centroid = d.max_points_per_centroid; + params->min_points_per_centroid = d.min_points_per_centroid; + params->niter = d.niter; + params->nredo = d.nredo; + params->seed = d.seed; + params->spherical = d.spherical; + params->update_index = d.update_index; + params->verbose = d.verbose; +} + +// This conversion is required because the two types are not memory-compatible +inline ClusteringParameters from_faiss_c(const FaissClusteringParameters* params) { + ClusteringParameters o; + o.frozen_centroids = params->frozen_centroids; + o.max_points_per_centroid = params->max_points_per_centroid; + o.min_points_per_centroid = params->min_points_per_centroid; + o.niter = params->niter; + o.nredo = params->nredo; + o.seed = params->seed; + o.spherical = params->spherical; + o.update_index = params->update_index; + o.verbose = params->verbose; + return o; +} + +/// getter for centroids (size = k * d) +void faiss_Clustering_centroids( + FaissClustering* clustering, float** centroids, size_t* size) { + std::vector& v = reinterpret_cast(clustering)->centroids; + if (centroids) { + *centroids = v.data(); + } + if (size) { + *size = v.size(); + } +} + +/// getter for objective values (sum of distances reported by index) +/// over iterations +void faiss_Clustering_obj( + FaissClustering* clustering, float** obj, size_t* size) { + std::vector& v = reinterpret_cast(clustering)->obj; + if (obj) { + *obj = v.data(); + } + if (size) { + *size = v.size(); + } +} + +/// the only mandatory parameters are k and d +int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k) { + try { + Clustering* c = new Clustering(d, k); + *p_clustering = reinterpret_cast(c); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_Clustering_new_with_params( + FaissClustering** p_clustering, int d, int k, const FaissClusteringParameters* cp) { + try { + Clustering* c = new Clustering(d, k, from_faiss_c(cp)); + *p_clustering = reinterpret_cast(c); + return 0; + } CATCH_AND_HANDLE +} + +/// Index is used during the assignment stage +int faiss_Clustering_train( + FaissClustering* clustering, idx_t n, const float* x, FaissIndex* index) { + try { + reinterpret_cast(clustering)->train( + n, x, *reinterpret_cast(index)); + return 0; + } CATCH_AND_HANDLE +} + +void faiss_Clustering_free(FaissClustering* clustering) { + delete reinterpret_cast(clustering); +} + +int faiss_kmeans_clustering (size_t d, size_t n, size_t k, + const float *x, + float *centroids, + float *q_error) { + try { + float out = faiss::kmeans_clustering(d, n, k, x, centroids); + if (q_error) { + *q_error = out; + } + return 0; + } CATCH_AND_HANDLE 
+} + +} diff --git a/core/src/index/thirdparty/faiss/c_api/Clustering_c.h b/core/src/index/thirdparty/faiss/c_api/Clustering_c.h new file mode 100644 index 0000000000..75f25ba4f5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.h @@ -0,0 +1,117 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c -*- + +#ifndef FAISS_CLUSTERING_C_H +#define FAISS_CLUSTERING_C_H + +#include "Index_c.h" +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Class for the clustering parameters. Can be passed to the + * constructor of the Clustering object. + */ +typedef struct FaissClusteringParameters { + int niter; ///< clustering iterations + int nredo; ///< redo clustering this many times and keep best + + int verbose; ///< (bool) + int spherical; ///< (bool) do we want normalized centroids? + int update_index; ///< (bool) update index after each iteration? + int frozen_centroids; ///< (bool) use the centroids provided as input and do not change them during iterations + + int min_points_per_centroid; ///< otherwise you get a warning + int max_points_per_centroid; ///< to limit size of dataset + + int seed; ///< seed for the random number generator +} FaissClusteringParameters; + + +/// Sets the ClusteringParameters object with reasonable defaults +void faiss_ClusteringParameters_init(FaissClusteringParameters* params); + + +/** clustering based on assignment - centroid update iterations + * + * The clustering is based on an Index object that assigns training + * points to the centroids. Therefore, at each iteration the centroids + * are added to the index. + * + * On output, the centoids table is set to the latest version + * of the centroids and they are also added to the index. If the + * centroids table it is not empty on input, it is also used for + * initialization. + * + * To do several clusterings, just call train() several times on + * different training sets, clearing the centroid table in between. 
+ */ +FAISS_DECLARE_CLASS(Clustering) + +FAISS_DECLARE_GETTER(Clustering, int, niter) +FAISS_DECLARE_GETTER(Clustering, int, nredo) +FAISS_DECLARE_GETTER(Clustering, int, verbose) +FAISS_DECLARE_GETTER(Clustering, int, spherical) +FAISS_DECLARE_GETTER(Clustering, int, update_index) +FAISS_DECLARE_GETTER(Clustering, int, frozen_centroids) + +FAISS_DECLARE_GETTER(Clustering, int, min_points_per_centroid) +FAISS_DECLARE_GETTER(Clustering, int, max_points_per_centroid) + +FAISS_DECLARE_GETTER(Clustering, int, seed) + +/// getter for d +FAISS_DECLARE_GETTER(Clustering, size_t, d) + +/// getter for k +FAISS_DECLARE_GETTER(Clustering, size_t, k) + +/// getter for centroids (size = k * d) +void faiss_Clustering_centroids( + FaissClustering* clustering, float** centroids, size_t* size); + +/// getter for objective values (sum of distances reported by index) +/// over iterations +void faiss_Clustering_obj( + FaissClustering* clustering, float** obj, size_t* size); + +/// the only mandatory parameters are k and d +int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k); + +int faiss_Clustering_new_with_params( + FaissClustering** p_clustering, int d, int k, const FaissClusteringParameters* cp); + +int faiss_Clustering_train( + FaissClustering* clustering, idx_t n, const float* x, FaissIndex* index); + +void faiss_Clustering_free(FaissClustering* clustering); + +/** simplified interface + * + * @param d dimension of the data + * @param n nb of training vectors + * @param k nb of output centroids + * @param x training set (size n * d) + * @param centroids output centroids (size k * d) + * @param q_error final quantization error + * @return error code + */ +int faiss_kmeans_clustering (size_t d, size_t n, size_t k, + const float *x, + float *centroids, + float *q_error); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/INSTALL.md b/core/src/index/thirdparty/faiss/c_api/INSTALL.md new file mode 100644 index 0000000000..b640d7db73 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/INSTALL.md @@ -0,0 +1,100 @@ +Faiss C API +=========== + +Faiss provides a pure C interface, which can subsequently be used either in pure C programs or to produce bindings for programming languages with Foreign Function Interface (FFI) support. Although this is not required for the Python interface, some other programming languages (e.g. Rust and Julia) do not have SWIG support. + +Compilation instructions +------------------------ + +The full contents of the pure C API are in the ["c_api"](c_api/) folder. +Please be sure to follow the instructions on [building the main C++ library](../INSTALL.md#step-1-compiling-the-c-faiss) first. +Then, enter the [c_api](c_api/) directory and run + + `make` + +This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library. It will also build an example program `bin/example_c`. + +Using the API +------------- + +The C API is composed of: + +- A set of C header files comprising the main Faiss interfaces, converted for use in C. Each file follows the format `«name»_c.h`, where `«name»` is the respective name from the C++ API. For example, the file [Index_c.h](./Index_c.h) file corresponds to the base `Index` API. Functions are declared with the `faiss_` prefix (e.g. `faiss_IndexFlat_new`), whereas new types have the `Faiss` prefix (e.g. `FaissIndex`, `FaissMetricType`, ...). 
+- A dynamic library, compiled from the sources in the same folder, encloses the implementation of the library and wrapper functions. + +The index factory is available via the `faiss_index_factory` function in `AutoTune_c.h`: + +```c +FaissIndex* index = NULL; +int c = faiss_index_factory(&index, 64, "Flat", METRIC_L2); +if (c) { + // operation failed +} +``` + +Most operations that you would find as member functions are available with the format `faiss_«classname»_«member»`. + +```c +idx_t ntotal = faiss_Index_ntotal(index); +``` + +Since this is C, the index needs to be freed manually in the end: + +```c +faiss_Index_free(index); +``` + +Error handling is done by examining the error code returned by operations with recoverable errors. +The code identifies the type of exception that rose from the implementation. Fetching the +corresponding error message can be done by calling the function `faiss_get_last_error()` from +`error_c.h`. Getter functions and `free` functions do not return an error code. + +```c +int c = faiss_Index_add(index, nb, xb); +if (c) { + printf("%s", faiss_get_last_error()); + exit(-1); +} +``` + +An example is included, which is built automatically for the target `all`. It can also be built separately: + + `make bin/example_c` + +Building with GPU support +------------------------- + +For GPU support, a separate dynamic library in the "c_api/gpu" directory needs to be built. + + `make` + +The "gpufaiss_c" dynamic library contains the GPU and CPU implementations of Faiss, which means that +it can be used in place of "faiss_c". The same library will dynamically link with the CUDA runtime +and cuBLAS. + +Using the GPU with the C API +---------------------------- + +A standard GPU resurces object can be obtained by the name `FaissStandardGpuResources`: + +```c +FaissStandardGpuResources* gpu_res = NULL; +int c = faiss_StandardGpuResources_new(&gpu_res); +if (c) { + printf("%s", faiss_get_last_error()); + exit(-1); +} +``` + +Similarly to the C++ API, a CPU index can be converted to a GPU index: + +```c +FaissIndex* cpu_index = NULL; +int c = faiss_index_factory(&cpu_index, d, "Flat", METRIC_L2); +if (c) { /* ... */ } +FaissGpuIndex* gpu_index = NULL; +c = faiss_index_cpu_to_gpu(gpu_res, 0, cpu_index, &gpu_index); +if (c) { /* ... */ } +``` + +A more complete example is available by the name `bin/example_gpu_c`. diff --git a/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.cpp new file mode 100644 index 0000000000..4b741922e8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.cpp @@ -0,0 +1,140 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
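The INSTALL.md fragments above stop short of a search call; the following sketch strings them together end to end, assuming `faiss_Index_search` in `Index_c.h` mirrors `Index::search` (n, x, k, distances, labels) and using small illustrative sizes.

```c
#include <stdio.h>
#include <stdlib.h>
#include "AutoTune_c.h"
#include "Index_c.h"
#include "error_c.h"

#define CHECK(rc) do { if (rc) { printf("%s\n", faiss_get_last_error()); exit(-1); } } while (0)

int main(void) {
    const int d = 64;        /* dimension */
    const idx_t nb = 1000;   /* database size (illustrative) */
    const idx_t nq = 5;      /* number of queries */
    const idx_t k = 4;       /* neighbours per query */

    float* xb = malloc(sizeof(float) * d * nb);
    float* xq = malloc(sizeof(float) * d * nq);
    for (idx_t i = 0; i < d * nb; i++) xb[i] = (float)rand() / RAND_MAX;
    for (idx_t i = 0; i < d * nq; i++) xq[i] = (float)rand() / RAND_MAX;

    FaissIndex* index = NULL;
    CHECK(faiss_index_factory(&index, d, "Flat", METRIC_L2));
    CHECK(faiss_Index_add(index, nb, xb));

    idx_t* labels = malloc(sizeof(idx_t) * k * nq);
    float* distances = malloc(sizeof(float) * k * nq);
    /* faiss_Index_search is assumed to mirror Index::search(n, x, k, distances, labels). */
    CHECK(faiss_Index_search(index, nq, xq, k, distances, labels));

    for (idx_t i = 0; i < k; i++)
        printf("nearest to query 0: id %lld (distance %f)\n", (long long)labels[i], distances[i]);

    faiss_Index_free(index);
    free(xb); free(xq); free(labels); free(distances);
    return 0;
}
```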
+// -*- c++ -*- + +#include "IndexFlat_c.h" +#include "IndexFlat.h" +#include "Index.h" +#include "macros_impl.h" + +extern "C" { + +using faiss::Index; +using faiss::IndexFlat; +using faiss::IndexFlatIP; +using faiss::IndexFlatL2; +using faiss::IndexFlatL2BaseShift; +using faiss::IndexRefineFlat; +using faiss::IndexFlat1D; + +DEFINE_DESTRUCTOR(IndexFlat) +DEFINE_INDEX_DOWNCAST(IndexFlat) + +int faiss_IndexFlat_new(FaissIndexFlat** p_index) { + try { + *p_index = reinterpret_cast(new IndexFlat()); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlat_new_with(FaissIndexFlat** p_index, idx_t d, FaissMetricType metric) { + try { + IndexFlat* index = new IndexFlat(d, static_cast(metric)); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size) { + auto& xb = reinterpret_cast(index)->xb; + *p_xb = xb.data(); + if (p_size) { + *p_size = xb.size(); + } +} + +int faiss_IndexFlat_compute_distance_subset( + FaissIndex* index, + idx_t n, + const float *x, + idx_t k, + float *distances, + const idx_t *labels) { + try { + reinterpret_cast(index)->compute_distance_subset( + n, x, k, distances, labels); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index) { + try { + IndexFlatIP* index = new IndexFlatIP(); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d) { + try { + IndexFlatIP* index = new IndexFlatIP(d); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index) { + try { + IndexFlatL2* index = new IndexFlatL2(); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d) { + try { + IndexFlatL2* index = new IndexFlatL2(d); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlatL2BaseShift_new(FaissIndexFlatL2BaseShift** p_index, idx_t d, size_t nshift, const float *shift) { + try { + IndexFlatL2BaseShift* index = new IndexFlatL2BaseShift(d, nshift, shift); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexRefineFlat_new(FaissIndexRefineFlat** p_index, FaissIndex* base_index) { + try { + IndexRefineFlat* index = new IndexRefineFlat( + reinterpret_cast(base_index)); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +DEFINE_DESTRUCTOR(IndexRefineFlat) + +int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index) { + try { + IndexFlat1D* index = new IndexFlat1D(); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlat1D_new_with(FaissIndexFlat1D** p_index, int continuous_update) { + try { + IndexFlat1D* index = new IndexFlat1D(static_cast(continuous_update)); + *p_index = reinterpret_cast(index); + return 0; + } CATCH_AND_HANDLE +} + +int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index) { + try { + reinterpret_cast(index)->update_permutation(); + return 0; + } CATCH_AND_HANDLE +} + +} diff --git a/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.h b/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.h new file mode 100644 index 0000000000..072ba7dcf3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexFlat_c.h @@ -0,0 +1,115 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c -*- + +#ifndef FAISS_INDEX_FLAT_C_H +#define FAISS_INDEX_FLAT_C_H + +#include "Index_c.h" +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// forward declaration +typedef enum FaissMetricType FaissMetricType; + +/** Opaque type for IndexFlat */ +FAISS_DECLARE_CLASS_INHERITED(IndexFlat, Index) + +int faiss_IndexFlat_new(FaissIndexFlat** p_index); + +int faiss_IndexFlat_new_with(FaissIndexFlat** p_index, idx_t d, FaissMetricType metric); + +/** get a pointer to the index's internal data (the `xb` field). The outputs + * become invalid after any data addition or removal operation. + * + * @param index opaque pointer to index object + * @param p_xb output, the pointer to the beginning of `xb`. + * @param p_size output, the current size of `sb` in number of float values. + */ +void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size); + +/** attempt a dynamic cast to a flat index, thus checking + * check whether the underlying index type is `IndexFlat`. + * + * @param index opaque pointer to index object + * @return the same pointer if the index is a flat index, NULL otherwise + */ +FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat) + +FAISS_DECLARE_DESTRUCTOR(IndexFlat) + +/** compute distance with a subset of vectors + * + * @param index opaque pointer to index object + * @param x query vectors, size n * d + * @param labels indices of the vectors that should be compared + * for each query vector, size n * k + * @param distances + * corresponding output distances, size n * k + */ +int faiss_IndexFlat_compute_distance_subset( + FaissIndex *index, + idx_t n, + const float *x, + idx_t k, + float *distances, + const idx_t *labels); + +/** Opaque type for IndexFlatIP */ +FAISS_DECLARE_CLASS_INHERITED(IndexFlatIP, Index) + +int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index); + +int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d); + +/** Opaque type for IndexFlatL2 */ +FAISS_DECLARE_CLASS_INHERITED(IndexFlatL2, Index) + +int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index); + +int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d); + +/** Opaque type for IndexFlatL2BaseShift + * + * same as an IndexFlatL2 but a value is subtracted from each distance + */ +FAISS_DECLARE_CLASS_INHERITED(IndexFlatL2BaseShift, Index) + +int faiss_IndexFlatL2BaseShift_new(FaissIndexFlatL2BaseShift** p_index, idx_t d, size_t nshift, const float *shift); + +/** Opaque type for IndexRefineFlat + * + * Index that queries in a base_index (a fast one) and refines the + * results with an exact search, hopefully improving the results. 
+ */ +FAISS_DECLARE_CLASS_INHERITED(IndexRefineFlat, Index) + +int faiss_IndexRefineFlat_new(FaissIndexRefineFlat** p_index, FaissIndex* base_index); + +FAISS_DECLARE_DESTRUCTOR(IndexRefineFlat) + +/** Opaque type for IndexFlat1D + * + * optimized version for 1D "vectors" + */ +FAISS_DECLARE_CLASS_INHERITED(IndexFlat1D, Index) + +int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index); +int faiss_IndexFlat1D_new_with(FaissIndexFlat1D** p_index, int continuous_update); + +int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.cpp new file mode 100644 index 0000000000..410e39a6c5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.cpp @@ -0,0 +1,64 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "Index_c.h" +#include "Clustering_c.h" +#include "IndexIVFFlat_c.h" +#include "IndexIVFFlat.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::IndexIVFFlat; +using faiss::MetricType; + +DEFINE_DESTRUCTOR(IndexIVFFlat) +DEFINE_INDEX_DOWNCAST(IndexIVFFlat) + +int faiss_IndexIVFFlat_new(FaissIndexIVFFlat** p_index) { + try { + *p_index = reinterpret_cast(new IndexIVFFlat()); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVFFlat_new_with(FaissIndexIVFFlat** p_index, + FaissIndex* quantizer, size_t d, size_t nlist) +{ + try { + auto q = reinterpret_cast(quantizer); + *p_index = reinterpret_cast(new IndexIVFFlat(q, d, nlist)); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVFFlat_new_with_metric( + FaissIndexIVFFlat** p_index, FaissIndex* quantizer, size_t d, size_t nlist, + FaissMetricType metric) +{ + try { + auto q = reinterpret_cast(quantizer); + auto m = static_cast(metric); + *p_index = reinterpret_cast(new IndexIVFFlat(q, d, nlist, m)); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVFFlat_add_core(FaissIndexIVFFlat* index, idx_t n, + const float * x, const idx_t *xids, const int64_t *precomputed_idx) +{ + try { + reinterpret_cast(index)->add_core(n, x, xids, precomputed_idx); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVFFlat_update_vectors(FaissIndexIVFFlat* index, int nv, + idx_t *idx, const float *v) +{ + try { + reinterpret_cast(index)->update_vectors(nv, idx, v); + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.h b/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.h new file mode 100644 index 0000000000..4c5f3ec25b --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexIVFFlat_c.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_INDEX_IVF_FLAT_C_H +#define FAISS_INDEX_IVF_FLAT_C_H + +#include "faiss_c.h" +#include "Index_c.h" +#include "Clustering_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Inverted file with stored vectors. Here the inverted file + * pre-selects the vectors to be searched, but they are not otherwise + * encoded, the code array just contains the raw float entries. 
+ */ +FAISS_DECLARE_CLASS(IndexIVFFlat) +FAISS_DECLARE_DESTRUCTOR(IndexIVFFlat) +FAISS_DECLARE_INDEX_DOWNCAST(IndexIVFFlat) + +int faiss_IndexIVFFlat_new(FaissIndexIVFFlat** p_index); + +int faiss_IndexIVFFlat_new_with(FaissIndexIVFFlat** p_index, + FaissIndex* quantizer, size_t d, size_t nlist); + +int faiss_IndexIVFFlat_new_with_metric( + FaissIndexIVFFlat** p_index, FaissIndex* quantizer, size_t d, size_t nlist, + FaissMetricType metric); + +int faiss_IndexIVFFlat_add_core(FaissIndexIVFFlat* index, idx_t n, + const float * x, const idx_t *xids, const int64_t *precomputed_idx); + +/** Update a subset of vectors. + * + * The index must have a direct_map + * + * @param nv nb of vectors to update + * @param idx vector indices to update, size nv + * @param v vectors of new values, size nv*d + */ +int faiss_IndexIVFFlat_update_vectors(FaissIndexIVFFlat* index, int nv, + idx_t *idx, const float *v); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp new file mode 100644 index 0000000000..a4d4acd4c1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp @@ -0,0 +1,92 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "Index_c.h" +#include "Clustering_c.h" +#include "IndexIVF_c.h" +#include "IndexIVF.h" +#include "macros_impl.h" + +using faiss::IndexIVF; +using faiss::IndexIVFStats; + +DEFINE_DESTRUCTOR(IndexIVF) +DEFINE_INDEX_DOWNCAST(IndexIVF) + +/// number of possible key values +DEFINE_GETTER(IndexIVF, size_t, nlist) +/// number of probes at query time +DEFINE_GETTER(IndexIVF, size_t, nprobe) +/// quantizer that maps vectors to inverted lists +DEFINE_GETTER_PERMISSIVE(IndexIVF, FaissIndex*, quantizer) + +/** + * = 0: use the quantizer as index in a kmeans training + * = 1: just pass on the training set to the train() of the quantizer + * = 2: kmeans training on a flat index + add the centroids to the quantizer + */ +DEFINE_GETTER(IndexIVF, char, quantizer_trains_alone) + +/// whether object owns the quantizer +DEFINE_GETTER(IndexIVF, int, own_fields) + +using faiss::IndexIVF; + +int faiss_IndexIVF_merge_from( + FaissIndexIVF* index, FaissIndexIVF* other, idx_t add_id) { + try { + reinterpret_cast(index)->merge_from( + *reinterpret_cast(other), add_id); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVF_copy_subset_to( + const FaissIndexIVF* index, FaissIndexIVF* other, int subset_type, idx_t a1, + idx_t a2) { + try { + reinterpret_cast(index)->copy_subset_to( + *reinterpret_cast(other), subset_type, a1, a2); + } CATCH_AND_HANDLE +} + +int faiss_IndexIVF_search_preassigned (const FaissIndexIVF* index, + idx_t n, const float *x, idx_t k, const idx_t *assign, + const float *centroid_dis, float *distances, idx_t *labels, + int store_pairs) { + try { + reinterpret_cast(index)->search_preassigned( + n, x, k, assign, centroid_dis, distances, labels, store_pairs); + } CATCH_AND_HANDLE +} + +size_t faiss_IndexIVF_get_list_size(const FaissIndexIVF* index, size_t list_no) { + return reinterpret_cast(index)->get_list_size(list_no); +} + +int faiss_IndexIVF_make_direct_map(FaissIndexIVF* index, + int new_maintain_direct_map) { + try { + reinterpret_cast(index)->make_direct_map( + static_cast(new_maintain_direct_map)); + } CATCH_AND_HANDLE +} + 
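The wrappers above expose `IndexIVFFlat` through plain functions. A hedged sketch of how a caller might assemble an IVF index from these bindings follows; it is not part of the commit and uses only the declarations shown in `IndexFlat_c.h`, `IndexIVFFlat_c.h` and `Index_c.h`. The helper name and parameters are placeholders.

```c
/* Illustrative only: an IVF-flat index over a flat L2 coarse quantizer. */
#include "Index_c.h"
#include "IndexFlat_c.h"
#include "IndexIVFFlat_c.h"

int build_ivf_flat(const float* xt, idx_t nt,     /* training vectors */
                   const float* xb, idx_t nb,     /* database vectors */
                   int d, size_t nlist,
                   FaissIndexIVFFlat** out) {
    FaissIndexFlatL2* quantizer = NULL;
    FaissIndexIVFFlat* index = NULL;
    int rc;

    if ((rc = faiss_IndexFlatL2_new_with(&quantizer, d))) return rc;
    if ((rc = faiss_IndexIVFFlat_new_with(&index, quantizer, d, nlist))) {
        faiss_Index_free(quantizer);
        return rc;
    }
    /* train the coarse quantizer, then add the database vectors */
    if ((rc = faiss_Index_train(index, nt, xt)) ||
        (rc = faiss_Index_add(index, nb, xb))) {
        faiss_Index_free(index);
        faiss_Index_free(quantizer);
        return rc;
    }
    *out = index;   /* caller keeps ownership of the quantizer unless own_fields is set */
    return 0;
}
```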
+double faiss_IndexIVF_imbalance_factor (const FaissIndexIVF* index) { + return reinterpret_cast(index)->invlists->imbalance_factor(); +} + +/// display some stats about the inverted lists +void faiss_IndexIVF_print_stats (const FaissIndexIVF* index) { + reinterpret_cast(index)->invlists->print_stats(); +} + +void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats) { + reinterpret_cast(stats)->reset(); +} diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h new file mode 100644 index 0000000000..b2176aac58 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h @@ -0,0 +1,135 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_INDEX_IVF_C_H +#define FAISS_INDEX_IVF_C_H + +#include "faiss_c.h" +#include "Index_c.h" +#include "Clustering_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Index based on a inverted file (IVF) + * + * In the inverted file, the quantizer (an Index instance) provides a + * quantization index for each vector to be added. The quantization + * index maps to a list (aka inverted list or posting list), where the + * id of the vector is then stored. + * + * At search time, the vector to be searched is also quantized, and + * only the list corresponding to the quantization index is + * searched. This speeds up the search by making it + * non-exhaustive. This can be relaxed using multi-probe search: a few + * (nprobe) quantization indices are selected and several inverted + * lists are visited. + * + * Sub-classes implement a post-filtering of the index that refines + * the distance estimation from the query to databse vectors. + */ +FAISS_DECLARE_CLASS_INHERITED(IndexIVF, Index) +FAISS_DECLARE_DESTRUCTOR(IndexIVF) +FAISS_DECLARE_INDEX_DOWNCAST(IndexIVF) + +/// number of possible key values +FAISS_DECLARE_GETTER(IndexIVF, size_t, nlist) +/// number of probes at query time +FAISS_DECLARE_GETTER(IndexIVF, size_t, nprobe) +/// quantizer that maps vectors to inverted lists +FAISS_DECLARE_GETTER(IndexIVF, FaissIndex*, quantizer) +/** + * = 0: use the quantizer as index in a kmeans training + * = 1: just pass on the training set to the train() of the quantizer + * = 2: kmeans training on a flat index + add the centroids to the quantizer + */ +FAISS_DECLARE_GETTER(IndexIVF, char, quantizer_trains_alone) + +/// whether object owns the quantizer +FAISS_DECLARE_GETTER(IndexIVF, int, own_fields) + +/** moves the entries from another dataset to self. On output, + * other is empty. add_id is added to all moved ids (for + * sequential ids, this would be this->ntotal */ +int faiss_IndexIVF_merge_from( + FaissIndexIVF* index, FaissIndexIVF* other, idx_t add_id); + +/** copy a subset of the entries index to the other index + * + * if subset_type == 0: copies ids in [a1, a2) + * if subset_type == 1: copies ids if id % a1 == a2 + * if subset_type == 2: copies inverted lists such that a1 + * elements are left before and a2 elements are after + */ +int faiss_IndexIVF_copy_subset_to( + const FaissIndexIVF* index, FaissIndexIVF* other, int subset_type, idx_t a1, + idx_t a2); + +/** search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the corresponding heaps with the query + * results. search() calls this. 
+ * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param distance + * output distances, size n * k + * @param labels output labels, size n * k + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + */ +int faiss_IndexIVF_search_preassigned (const FaissIndexIVF* index, + idx_t n, const float *x, idx_t k, const idx_t *assign, + const float *centroid_dis, float *distances, idx_t *labels, + int store_pairs); + +size_t faiss_IndexIVF_get_list_size(const FaissIndexIVF* index, + size_t list_no); + +/** intialize a direct map + * + * @param new_maintain_direct_map if true, create a direct map, + * else clear it + */ +int faiss_IndexIVF_make_direct_map(FaissIndexIVF* index, + int new_maintain_direct_map); + +/** Check the inverted lists' imbalance factor. + * + * 1= perfectly balanced, >1: imbalanced + */ +double faiss_IndexIVF_imbalance_factor (const FaissIndexIVF* index); + +/// display some stats about the inverted lists of the index +void faiss_IndexIVF_print_stats (const FaissIndexIVF* index); + + +typedef struct FaissIndexIVFStats { + size_t nq; // nb of queries run + size_t nlist; // nb of inverted lists scanned + size_t ndis; // nb of distancs computed +} FaissIndexIVFStats; + +void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats); + +inline void faiss_IndexIVFStats_init(FaissIndexIVFStats* stats) { + faiss_IndexIVFStats_reset(stats); +} + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.cpp new file mode 100644 index 0000000000..39a348f807 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.cpp @@ -0,0 +1,37 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "IndexLSH_c.h" +#include "IndexLSH.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::IndexLSH; + +DEFINE_DESTRUCTOR(IndexLSH) +DEFINE_INDEX_DOWNCAST(IndexLSH) + +DEFINE_GETTER(IndexLSH, int, nbits) +DEFINE_GETTER(IndexLSH, int, bytes_per_vec) +DEFINE_GETTER_PERMISSIVE(IndexLSH, int, rotate_data) +DEFINE_GETTER_PERMISSIVE(IndexLSH, int, train_thresholds) + +int faiss_IndexLSH_new(FaissIndexLSH** p_index, idx_t d, int nbits) { + try { + *p_index = reinterpret_cast(new IndexLSH(d, nbits)); + } CATCH_AND_HANDLE +} + +int faiss_IndexLSH_new_with_options(FaissIndexLSH** p_index, idx_t d, int nbits, int rotate_data, int train_thresholds) { + try { + *p_index = reinterpret_cast( + new IndexLSH(d, nbits, static_cast(rotate_data), static_cast(train_thresholds))); + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.h b/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.h new file mode 100644 index 0000000000..4a3dab418d --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexLSH_c.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
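The `IndexIVF` accessors declared above are mostly read-only. As a hedged illustration (not part of this change), they might be used to inspect an already-built IVF index, for example one produced by the builder sketched earlier:

```c
/* Illustrative only: inspect the inverted lists of a built IVF index. */
#include <stdio.h>
#include "Index_c.h"
#include "IndexIVF_c.h"
#include "error_c.h"

void inspect_ivf(FaissIndex* index) {
    /* the downcast checks that the underlying object really is an IndexIVF */
    FaissIndexIVF* ivf = faiss_IndexIVF_cast(index);
    if (!ivf) {
        printf("not an IVF index\n");
        return;
    }
    size_t nlist = faiss_IndexIVF_nlist(ivf);
    printf("nlist=%zu nprobe=%zu imbalance=%.3f\n",
           nlist,
           faiss_IndexIVF_nprobe(ivf),
           faiss_IndexIVF_imbalance_factor(ivf));
    for (size_t i = 0; i < nlist && i < 5; ++i) {
        printf("list %zu holds %zu vectors\n", i,
               faiss_IndexIVF_get_list_size(ivf, i));
    }
    /* build the id -> entry map needed by update/reconstruct operations */
    if (faiss_IndexIVF_make_direct_map(ivf, 1)) {
        printf("%s\n", faiss_get_last_error());
    }
}
```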
+// -*- c++ -*- + +#ifndef INDEX_LSH_C_H +#define INDEX_LSH_C_H + +#include "faiss_c.h" +#include "Index_c.h" +#include "Clustering_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** The sign of each vector component is put in a binary signature */ +FAISS_DECLARE_CLASS_INHERITED(IndexLSH, Index) +FAISS_DECLARE_DESTRUCTOR(IndexLSH) +FAISS_DECLARE_INDEX_DOWNCAST(IndexLSH) + +FAISS_DECLARE_GETTER(IndexLSH, int, nbits) +FAISS_DECLARE_GETTER(IndexLSH, int, bytes_per_vec) +FAISS_DECLARE_GETTER(IndexLSH, int, rotate_data) +FAISS_DECLARE_GETTER(IndexLSH, int, train_thresholds) + +int faiss_IndexLSH_new(FaissIndexLSH** p_index, idx_t d, int nbits); + +int faiss_IndexLSH_new_with_options(FaissIndexLSH** p_index, idx_t d, int nbits, int rotate_data, int train_thresholds); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/IndexShards_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexShards_c.cpp new file mode 100644 index 0000000000..e66aeb7ed0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexShards_c.cpp @@ -0,0 +1,44 @@ +#include "IndexShards_c.h" +#include "IndexShards.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::IndexShards; + +DEFINE_GETTER(IndexShards, int, own_fields) +DEFINE_SETTER(IndexShards, int, own_fields) + +DEFINE_GETTER(IndexShards, int, successive_ids) +DEFINE_SETTER(IndexShards, int, successive_ids) + +int faiss_IndexShards_new(FaissIndexShards** p_index, idx_t d) { + try { + auto out = new IndexShards(d); + *p_index = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +int faiss_IndexShards_new_with_options(FaissIndexShards** p_index, idx_t d, int threaded, int successive_ids) { + try { + auto out = new IndexShards(d, static_cast(threaded), static_cast(successive_ids)); + *p_index = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +int faiss_IndexShards_add_shard(FaissIndexShards* index, FaissIndex* shard) { + try { + reinterpret_cast(index)->add_shard( + reinterpret_cast(shard)); + } CATCH_AND_HANDLE +} + +int faiss_IndexShards_sync_with_shard_indexes(FaissIndexShards* index) { + try { + reinterpret_cast(index)->sync_with_shard_indexes(); + } CATCH_AND_HANDLE +} + +FaissIndex* faiss_IndexShards_at(FaissIndexShards* index, int i) { + auto shard = reinterpret_cast(index)->at(i); + return reinterpret_cast(shard); +} diff --git a/core/src/index/thirdparty/faiss/c_api/IndexShards_c.h b/core/src/index/thirdparty/faiss/c_api/IndexShards_c.h new file mode 100644 index 0000000000..7e6a30b2a9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexShards_c.h @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
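`IndexLSH` follows the same pattern: the constructor is type-specific, everything else goes through the base `Index` functions. A small hedged usage sketch (not part of the commit), assuming only the declarations in `IndexLSH_c.h` and `Index_c.h`:

```c
/* Illustrative only: binary-signature (LSH) index over d-dimensional vectors. */
#include <stdio.h>
#include "Index_c.h"
#include "IndexLSH_c.h"
#include "error_c.h"

int lsh_demo(const float* xb, idx_t nb, int d) {
    FaissIndexLSH* lsh = NULL;
    int rc = faiss_IndexLSH_new(&lsh, d, 256);   /* 256-bit signatures */
    if (rc) { printf("%s\n", faiss_get_last_error()); return rc; }

    /* with default options train() has little to do; add() encodes the signatures */
    if ((rc = faiss_Index_train(lsh, nb, xb)) ||
        (rc = faiss_Index_add(lsh, nb, xb))) {
        printf("%s\n", faiss_get_last_error());
    } else {
        printf("nbits=%d bytes_per_vec=%d\n",
               faiss_IndexLSH_nbits(lsh), faiss_IndexLSH_bytes_per_vec(lsh));
    }
    faiss_Index_free(lsh);
    return rc;
}
```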
+// -*- c++ -*- + +#ifndef INDEXSHARDS_C_H +#define INDEXSHARDS_C_H + +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Index that concatenates the results from several sub-indexes + */ +FAISS_DECLARE_CLASS_INHERITED(IndexShards, Index) + +FAISS_DECLARE_GETTER_SETTER(IndexShards, int, own_fields) +FAISS_DECLARE_GETTER_SETTER(IndexShards, int, successive_ids) + +int faiss_IndexShards_new(FaissIndexShards** p_index, idx_t d); + +int faiss_IndexShards_new_with_options(FaissIndexShards** p_index, idx_t d, int threaded, int successive_ids); + +int faiss_IndexShards_add_shard(FaissIndexShards* index, FaissIndex* shard); + +/// update metric_type and ntotal +int faiss_IndexShards_sync_with_shard_indexes(FaissIndexShards* index); + +FaissIndex* faiss_IndexShards_at(FaissIndexShards* index, int i); + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/Index_c.cpp b/core/src/index/thirdparty/faiss/c_api/Index_c.cpp new file mode 100644 index 0000000000..87085fd192 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/Index_c.cpp @@ -0,0 +1,106 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "Index_c.h" +#include "Index.h" +#include "macros_impl.h" + +extern "C" { + +DEFINE_DESTRUCTOR(Index) + +DEFINE_GETTER(Index, int, d) + +DEFINE_GETTER(Index, int, is_trained) + +DEFINE_GETTER(Index, idx_t, ntotal) + +DEFINE_GETTER(Index, FaissMetricType, metric_type) + +int faiss_Index_train(FaissIndex* index, idx_t n, const float* x) { + try { + reinterpret_cast(index)->train(n, x); + } CATCH_AND_HANDLE +} + +int faiss_Index_add(FaissIndex* index, idx_t n, const float* x) { + try { + reinterpret_cast(index)->add(n, x); + } CATCH_AND_HANDLE +} + +int faiss_Index_add_with_ids(FaissIndex* index, idx_t n, const float* x, const idx_t* xids) { + try { + reinterpret_cast(index)->add_with_ids(n, x, xids); + } CATCH_AND_HANDLE +} + +int faiss_Index_search(const FaissIndex* index, idx_t n, const float* x, idx_t k, + float* distances, idx_t* labels) { + try { + reinterpret_cast(index)->search(n, x, k, distances, labels); + } CATCH_AND_HANDLE +} + +int faiss_Index_range_search(const FaissIndex* index, idx_t n, const float* x, float radius, + FaissRangeSearchResult* result) { + try { + reinterpret_cast(index)->range_search( + n, x, radius, reinterpret_cast(result)); + } CATCH_AND_HANDLE +} + +int faiss_Index_assign(FaissIndex* index, idx_t n, const float * x, idx_t * labels, idx_t k) { + try { + reinterpret_cast(index)->assign(n, x, labels, k); + } CATCH_AND_HANDLE +} + +int faiss_Index_reset(FaissIndex* index) { + try { + reinterpret_cast(index)->reset(); + } CATCH_AND_HANDLE +} + +int faiss_Index_remove_ids(FaissIndex* index, const FaissIDSelector* sel, size_t* n_removed) { + try { + size_t n {reinterpret_cast(index)->remove_ids( + *reinterpret_cast(sel))}; + if (n_removed) { + *n_removed = n; + } + } CATCH_AND_HANDLE +} + +int faiss_Index_reconstruct(const FaissIndex* index, idx_t key, float* recons) { + try { + reinterpret_cast(index)->reconstruct(key, recons); + } CATCH_AND_HANDLE +} + +int faiss_Index_reconstruct_n (const FaissIndex* index, idx_t i0, idx_t ni, float* recons) { + try { + reinterpret_cast(index)->reconstruct_n(i0, ni, recons); + } CATCH_AND_HANDLE +} + +int 
faiss_Index_compute_residual(const FaissIndex* index, const float* x, float* residual, idx_t key) { + try { + reinterpret_cast(index)->compute_residual(x, residual, key); + } CATCH_AND_HANDLE +} + +int faiss_Index_display(const FaissIndex* index) { + try { + reinterpret_cast(index)->display(); + } CATCH_AND_HANDLE +} + +} diff --git a/core/src/index/thirdparty/faiss/c_api/Index_c.h b/core/src/index/thirdparty/faiss/c_api/Index_c.h new file mode 100644 index 0000000000..5e143211e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/Index_c.h @@ -0,0 +1,164 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c -*- + +#ifndef FAISS_INDEX_C_H +#define FAISS_INDEX_C_H + +#include +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// forward declaration required here +FAISS_DECLARE_CLASS(RangeSearchResult) + +//typedef struct FaissRangeSearchResult_H FaissRangeSearchResult; +typedef struct FaissIDSelector_H FaissIDSelector; + +/// Some algorithms support both an inner product version and a L2 search version. +typedef enum FaissMetricType { + METRIC_INNER_PRODUCT = 0, + METRIC_L2 = 1, +} FaissMetricType; + +/// Opaque type for referencing to an index object +FAISS_DECLARE_CLASS(Index) +FAISS_DECLARE_DESTRUCTOR(Index) + +/// Getter for d +FAISS_DECLARE_GETTER(Index, int, d) + +/// Getter for is_trained +FAISS_DECLARE_GETTER(Index, int, is_trained) + +/// Getter for ntotal +FAISS_DECLARE_GETTER(Index, idx_t, ntotal) + +/// Getter for metric_type +FAISS_DECLARE_GETTER(Index, FaissMetricType, metric_type) + +/** Perform training on a representative set of vectors + * + * @param index opaque pointer to index object + * @param n nb of training vectors + * @param x training vecors, size n * d + */ +int faiss_Index_train(FaissIndex* index, idx_t n, const float* x); + +/** Add n vectors of dimension d to the index. + * + * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1 + * This function slices the input vectors in chuncks smaller than + * blocksize_add and calls add_core. + * @param index opaque pointer to index object + * @param x input matrix, size n * d + */ +int faiss_Index_add(FaissIndex* index, idx_t n, const float* x); + +/** Same as add, but stores xids instead of sequential ids. + * + * The default implementation fails with an assertion, as it is + * not supported by all indexes. + * + * @param index opaque pointer to index object + * @param xids if non-null, ids to store for the vectors (size n) + */ +int faiss_Index_add_with_ids(FaissIndex* index, idx_t n, const float* x, const idx_t* xids); + +/** query n vectors of dimension d to the index. + * + * return at most k vectors. If there are not enough results for a + * query, the result array is padded with -1s. + * + * @param index opaque pointer to index object + * @param x input vectors to search, size n * d + * @param labels output labels of the NNs, size n*k + * @param distances output pairwise distances, size n*k + */ +int faiss_Index_search(const FaissIndex* index, idx_t n, const float* x, idx_t k, + float* distances, idx_t* labels); + +/** query n vectors of dimension d to the index. + * + * return all vectors with distance < radius. Note that many + * indexes do not implement the range_search (only the k-NN search + * is mandatory). 
+ * + * @param index opaque pointer to index object + * @param x input vectors to search, size n * d + * @param radius search radius + * @param result result table + */ +int faiss_Index_range_search(const FaissIndex* index, idx_t n, const float* x, + float radius, FaissRangeSearchResult* result); + +/** return the indexes of the k vectors closest to the query x. + * + * This function is identical as search but only return labels of neighbors. + * @param index opaque pointer to index object + * @param x input vectors to search, size n * d + * @param labels output labels of the NNs, size n*k + */ +int faiss_Index_assign(FaissIndex* index, idx_t n, const float * x, idx_t * labels, idx_t k); + +/** removes all elements from the database. + * @param index opaque pointer to index object + */ +int faiss_Index_reset(FaissIndex* index); + +/** removes IDs from the index. Not supported by all indexes + * @param index opaque pointer to index object + * @param nremove output for the number of IDs removed + */ +int faiss_Index_remove_ids(FaissIndex* index, const FaissIDSelector* sel, size_t* n_removed); + +/** Reconstruct a stored vector (or an approximation if lossy coding) + * + * this function may not be defined for some indexes + * @param index opaque pointer to index object + * @param key id of the vector to reconstruct + * @param recons reconstucted vector (size d) + */ +int faiss_Index_reconstruct(const FaissIndex* index, idx_t key, float* recons); + +/** Reconstruct vectors i0 to i0 + ni - 1 + * + * this function may not be defined for some indexes + * @param index opaque pointer to index object + * @param recons reconstucted vector (size ni * d) + */ +int faiss_Index_reconstruct_n (const FaissIndex* index, idx_t i0, idx_t ni, float* recons); + +/** Computes a residual vector after indexing encoding. + * + * The residual vector is the difference between a vector and the + * reconstruction that can be decoded from its representation in + * the index. The residual can be used for multiple-stage indexing + * methods, like IndexIVF's methods. + * + * @param index opaque pointer to index object + * @param x input vector, size d + * @param residual output residual vector, size d + * @param key encoded index, as returned by search and assign + */ +int faiss_Index_compute_residual(const FaissIndex* index, const float* x, float* residual, idx_t key); + +/** Display the actual class name and some more info + * @param index opaque pointer to index object + */ +int faiss_Index_display(const FaissIndex* index); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/Makefile b/core/src/index/thirdparty/faiss/c_api/Makefile new file mode 100644 index 0000000000..04c84e68d2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/Makefile @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
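Beyond add/search, the base `Index` wrapper exposes reconstruction and residual computation. A hedged sketch (not part of the change) using only the functions declared in `Index_c.h`, with a flat index, which reconstructs stored vectors exactly:

```c
/* Illustrative only: reconstruct a stored vector and compute a residual. */
#include <stdio.h>
#include "AutoTune_c.h"
#include "Index_c.h"
#include "error_c.h"

int main(void) {
    const int d = 8;
    float xb[8 * 4];
    for (int i = 0; i < 8 * 4; ++i) xb[i] = (float)i;

    FaissIndex* index = NULL;
    if (faiss_index_factory(&index, d, "Flat", METRIC_L2) ||
        faiss_Index_add(index, 4, xb)) {
        printf("%s\n", faiss_get_last_error());
        return 1;
    }

    float recons[8], residual[8];
    /* key 2 is the third stored vector; for a flat index the residual of
     * that same vector is exactly zero */
    if (faiss_Index_reconstruct(index, 2, recons) ||
        faiss_Index_compute_residual(index, xb + 2 * d, residual, 2)) {
        printf("%s\n", faiss_get_last_error());
        return 1;
    }
    printf("recons[0]=%.1f residual[0]=%.1f\n", recons[0], residual[0]);
    faiss_Index_free(index);
    return 0;
}
```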
+ +.SUFFIXES: .cpp .o + +# C API + +include ../makefile.inc +DEBUGFLAG=-DNDEBUG # no debugging + +LIBNAME=libfaiss +CLIBNAME=libfaiss_c +LIBCOBJ=error_impl.o Index_c.o IndexFlat_c.o Clustering_c.o AutoTune_c.o \ + AuxIndexStructures_c.o IndexIVF_c.o IndexIVFFlat_c.o IndexLSH_c.o \ + index_io_c.o MetaIndexes_c.o IndexShards_c.o +CFLAGS=-fPIC -m64 -Wno-sign-compare -g -O3 -Wall -Wextra + +# Build static and shared object files by default +all: $(CLIBNAME).a $(CLIBNAME).$(SHAREDEXT) + +# Build static object file containing the wrapper implementation only. +# Consumers are required to link with libfaiss.a and libstdc++. +$(CLIBNAME).a: $(LIBCOBJ) + ar r $@ $^ + +# Build dynamic library (independent object) +$(CLIBNAME).$(SHAREDEXT): $(LIBCOBJ) ../$(LIBNAME).a + $(CXX) $(LDFLAGS) $(SHAREDFLAGS) -o $@ \ + -Wl,--whole-archive $^ -Wl,--no-whole-archive $(LIBS) -static-libstdc++ + +bin/example_c: example_c.c $(CLIBNAME).$(SHAREDEXT) + $(CC) $(CFLAGS) -std=c99 -I. -I.. -L. -o $@ example_c.c \ + $(LDFLAGS) -lm -lfaiss_c + +clean: + rm -f $(CLIBNAME).a $(CLIBNAME).$(SHAREDEXT)* *.o bin/example_c + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ + +# Dependencies + +error_impl.o: CXXFLAGS += -I.. $(DEBUGFLAG) +error_impl.o: error_impl.cpp error_c.h error_impl.h macros_impl.h + +index_io_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +index_io_c.o: index_io_c.cpp error_impl.cpp ../index_io.h macros_impl.h + +Index_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +Index_c.o: Index_c.cpp Index_c.h ../Index.h macros_impl.h + +IndexFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexFlat_c.o: IndexFlat_c.cpp IndexFlat_c.h ../IndexFlat.h macros_impl.h + +IndexIVF_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexIVF_c.o: IndexIVF_c.cpp IndexIVF_c.h ../IndexIVF.h macros_impl.h + +IndexIVFFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexIVFFlat_c.o: IndexIVFFlat_c.cpp IndexIVFFlat_c.h ../IndexIVFFlat.h macros_impl.h + +IndexLSH_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexLSH_c.o: IndexLSH_c.cpp IndexLSH_c.h ../IndexLSH.h macros_impl.h + +IndexShards_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexShards_c.o: IndexShards_c.cpp IndexShards_c.h ../Index.h ../IndexShards.h macros_impl.h + +Clustering_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +Clustering_c.o: Clustering_c.cpp Clustering_c.h ../Clustering.h macros_impl.h + +AutoTune_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +AutoTune_c.o: AutoTune_c.cpp AutoTune_c.h ../AutoTune.h macros_impl.h + +AuxIndexStructures_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +AuxIndexStructures_c.o: AuxIndexStructures_c.cpp AuxIndexStructures_c.h ../AuxIndexStructures.h macros_impl.h + +MetaIndexes_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +MetaIndexes_c.o: MetaIndexes_c.cpp MetaIndexes_c.h ../MetaIndexes.h macros_impl.h diff --git a/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.cpp b/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.cpp new file mode 100644 index 0000000000..72abd9e793 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.cpp @@ -0,0 +1,49 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
+// -*- c++ -*- + +#include "MetaIndexes_c.h" +#include "MetaIndexes.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::IndexIDMap; +using faiss::IndexIDMap2; + +DEFINE_GETTER(IndexIDMap, int, own_fields) +DEFINE_SETTER(IndexIDMap, int, own_fields) + +int faiss_IndexIDMap_new(FaissIndexIDMap** p_index, FaissIndex* index) { + try { + auto out = new IndexIDMap(reinterpret_cast(index)); + *p_index = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +void faiss_IndexIDMap_id_map(FaissIndexIDMap* index, idx_t** p_id_map, size_t* p_size) { + auto idx = reinterpret_cast(index); + if (p_id_map) + *p_id_map = idx->id_map.data(); + if (p_size) + *p_size = idx->id_map.size(); +} + +int faiss_IndexIDMap2_new(FaissIndexIDMap2** p_index, FaissIndex* index) { + try { + auto out = new IndexIDMap2(reinterpret_cast(index)); + *p_index = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +int faiss_IndexIDMap2_construct_rev_map(FaissIndexIDMap2* index) { + try { + reinterpret_cast(index)->construct_rev_map(); + } CATCH_AND_HANDLE +} + diff --git a/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.h b/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.h new file mode 100644 index 0000000000..940394f92f --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/MetaIndexes_c.h @@ -0,0 +1,49 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#ifndef METAINDEXES_C_H +#define METAINDEXES_C_H + +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Index that translates search results to ids */ +FAISS_DECLARE_CLASS_INHERITED(IndexIDMap, Index) + +FAISS_DECLARE_GETTER_SETTER(IndexIDMap, int, own_fields) + +int faiss_IndexIDMap_new(FaissIndexIDMap** p_index, FaissIndex* index); + +/** get a pointer to the index map's internal ID vector (the `id_map` field). The + * outputs of this function become invalid after any operation that can modify the index. + * + * @param index opaque pointer to index object + * @param p_id_map output, the pointer to the beginning of `id_map`. + * @param p_size output, the current length of `id_map`. + */ +void faiss_IndexIDMap_id_map(FaissIndexIDMap* index, idx_t** p_id_map, size_t* p_size); + +/** same as IndexIDMap but also provides an efficient reconstruction + implementation via a 2-way index */ +FAISS_DECLARE_CLASS_INHERITED(IndexIDMap2, IndexIDMap) + +int faiss_IndexIDMap2_new(FaissIndexIDMap2** p_index, FaissIndex* index); + +/// make the rev_map from scratch +int faiss_IndexIDMap2_construct_rev_map(FaissIndexIDMap2* index); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/error_c.h b/core/src/index/thirdparty/faiss/c_api/error_c.h new file mode 100644 index 0000000000..5aa5664feb --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/error_c.h @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_ERROR_C_H +#define FAISS_ERROR_C_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// An error code which depends on the exception thrown from the previous +/// operation. 
See `faiss_get_last_error` to retrieve the error message. +typedef enum FaissErrorCode { + /// No error + OK = 0, + /// Any exception other than Faiss or standard C++ library exceptions + UNKNOWN_EXCEPT = -1, + /// Faiss library exception + FAISS_EXCEPT = -2, + /// Standard C++ library exception + STD_EXCEPT = -4 +} FaissErrorCode; + +/** + * Get the error message of the last failed operation performed by Faiss. + * The given pointer is only invalid until another Faiss function is + * called. + */ +const char* faiss_get_last_error(); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/error_impl.cpp b/core/src/index/thirdparty/faiss/c_api/error_impl.cpp new file mode 100644 index 0000000000..25793eb0e8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/error_impl.cpp @@ -0,0 +1,27 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "error_c.h" +#include "error_impl.h" +#include "FaissException.h" +#include + +thread_local std::exception_ptr faiss_last_exception; + +const char* faiss_get_last_error() { + if (faiss_last_exception) { + try { + std::rethrow_exception(faiss_last_exception); + } catch (std::exception& e) { + return e.what(); + } + } + return nullptr; +} diff --git a/core/src/index/thirdparty/faiss/c_api/error_impl.h b/core/src/index/thirdparty/faiss/c_api/error_impl.h new file mode 100644 index 0000000000..b44254ad94 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/error_impl.h @@ -0,0 +1,16 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include + +/** global variable for holding the last exception thrown by + * calls to Faiss functions through the C API + */ +extern thread_local std::exception_ptr faiss_last_exception; diff --git a/core/src/index/thirdparty/faiss/c_api/example_c.c b/core/src/index/thirdparty/faiss/c_api/example_c.c new file mode 100644 index 0000000000..597c2920ee --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/example_c.c @@ -0,0 +1,96 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
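`IndexIDMap` is the wrapper that lets callers keep arbitrary 64-bit ids, since flat indexes reject `add_with_ids`. A hedged sketch (not part of the commit), assuming the declarations in `MetaIndexes_c.h` and the base index functions shown above; the helper name is a placeholder:

```c
/* Illustrative only: wrap a flat index in an IndexIDMap to use custom ids. */
#include <stdio.h>
#include "Index_c.h"
#include "IndexFlat_c.h"
#include "MetaIndexes_c.h"
#include "error_c.h"

int add_with_user_ids(const float* xb, const idx_t* ids, idx_t nb, int d,
                      FaissIndexIDMap** out) {
    FaissIndexFlatL2* flat = NULL;
    FaissIndexIDMap* idmap = NULL;
    int rc;

    if ((rc = faiss_IndexFlatL2_new_with(&flat, d))) return rc;
    if ((rc = faiss_IndexIDMap_new(&idmap, flat))) {
        faiss_Index_free(flat);
        return rc;
    }
    /* let the wrapper delete the flat index when it is freed */
    faiss_IndexIDMap_set_own_fields(idmap, 1);

    if ((rc = faiss_Index_add_with_ids(idmap, nb, xb, ids))) {
        printf("%s\n", faiss_get_last_error());
        faiss_Index_free(idmap);
        return rc;
    }
    *out = idmap;
    return 0;
}
```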
+// -*- c -*- + +#include +#include +#include + +#include "error_c.h" +#include "index_io_c.h" +#include "Index_c.h" +#include "IndexFlat_c.h" +#include "AutoTune_c.h" + +#define FAISS_TRY(C) \ + { \ + if (C) { \ + fprintf(stderr, "%s", faiss_get_last_error()); \ + exit(-1); \ + } \ + } + +double drand() { + return (double)rand() / (double)RAND_MAX; +} + +int main() { + time_t seed = time(NULL); + srand(seed); + printf("Generating some data...\n"); + int d = 128; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + float *xb = malloc(d * nb * sizeof(float)); + float *xq = malloc(d * nq * sizeof(float)); + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) xb[d * i + j] = drand(); + xb[d * i] += i / 1000.; + } + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) xq[d * i + j] = drand(); + xq[d * i] += i / 1000.; + } + + printf("Building an index...\n"); + + FaissIndex* index = NULL; + FAISS_TRY(faiss_index_factory(&index, d, "Flat", METRIC_L2)); // use factory to create index + printf("is_trained = %s\n", faiss_Index_is_trained(index) ? "true" : "false"); + FAISS_TRY(faiss_Index_add(index, nb, xb)); // add vectors to the index + printf("ntotal = %ld\n", faiss_Index_ntotal(index)); + + printf("Searching...\n"); + int k = 5; + + { // sanity check: search 5 first vectors of xb + idx_t *I = malloc(k * 5 * sizeof(idx_t)); + float *D = malloc(k * 5 * sizeof(float)); + FAISS_TRY(faiss_Index_search(index, 5, xb, k, D, I)); + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) printf("%5ld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + } + { // search xq + idx_t *I = malloc(k * nq * sizeof(idx_t)); + float *D = malloc(k * nq * sizeof(float)); + FAISS_TRY(faiss_Index_search(index, 5, xb, k, D, I)); + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) printf("%5ld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + } + + printf("Saving index to disk...\n"); + FAISS_TRY(faiss_write_index_fname(index, "example.index")); + + printf("Freeing index...\n"); + faiss_Index_free(index); + printf("Done.\n"); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/c_api/faiss_c.h b/core/src/index/thirdparty/faiss/c_api/faiss_c.h new file mode 100644 index 0000000000..2357f71327 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/faiss_c.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +/// Macros and typedefs for C wrapper API declarations + +#ifndef FAISS_C_H +#define FAISS_C_H + +#include + +typedef int64_t faiss_idx_t; ///< all indices are this type +typedef faiss_idx_t idx_t; +typedef float faiss_component_t; ///< all vector components are this type +typedef float faiss_distance_t; ///< all distances between vectors are this type + +/// Declare an opaque type for a class type `clazz`. +#define FAISS_DECLARE_CLASS(clazz) \ + typedef struct Faiss ## clazz ## _H Faiss ## clazz; + +/// Declare an opaque type for a class type `clazz`, while +/// actually aliasing it to an existing parent class type `parent`. 
+#define FAISS_DECLARE_CLASS_INHERITED(clazz, parent) \ + typedef struct Faiss ## parent ## _H Faiss ## clazz; + +/// Declare a dynamic downcast operation from a base `FaissIndex*` pointer +/// type to a more specific index type. The function returns the same pointer +/// if the downcast is valid, and `NULL` otherwise. +#define FAISS_DECLARE_INDEX_DOWNCAST(clazz) \ + Faiss ## clazz * faiss_ ## clazz ## _cast (FaissIndex*); + +/// Declare a getter for the field `name` in class `clazz`, +/// of return type `ty` +#define FAISS_DECLARE_GETTER(clazz, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *); + +/// Declare a setter for the field `name` in class `clazz`, +/// in which the user provides a value of type `ty` +#define FAISS_DECLARE_SETTER(clazz, ty, name) \ + void faiss_ ## clazz ## _set_ ## name (Faiss ## clazz *, ty); + +/// Declare a getter and setter for the field `name` in class `clazz`. +#define FAISS_DECLARE_GETTER_SETTER(clazz, ty, name) \ + FAISS_DECLARE_GETTER(clazz, ty, name) \ + FAISS_DECLARE_SETTER(clazz, ty, name) + +/// Declare a destructor function which frees an object of +/// type `clazz`. +#define FAISS_DECLARE_DESTRUCTOR(clazz) \ + void faiss_ ## clazz ## _free (Faiss ## clazz *obj); + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.cpp b/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.cpp new file mode 100644 index 0000000000..7336d5d7d3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.cpp @@ -0,0 +1,96 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
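These macros generate the whole naming scheme of the C API. As an editorial illustration (paraphrased, not generated output), here is roughly what the declarations for the base `Index` class expand to, matching the prototypes already seen in `Index_c.h`:

```c
#include <stdint.h>
typedef int64_t idx_t;   /* as defined in faiss_c.h */

/* FAISS_DECLARE_CLASS(Index) expands approximately to: */
typedef struct FaissIndex_H FaissIndex;

/* FAISS_DECLARE_DESTRUCTOR(Index): */
void faiss_Index_free(FaissIndex* obj);

/* FAISS_DECLARE_GETTER(Index, idx_t, ntotal): */
idx_t faiss_Index_ntotal(const FaissIndex*);

/* FAISS_DECLARE_CLASS_INHERITED(IndexFlat, Index) aliases the child to the
 * parent's opaque struct, so a FaissIndexFlat* can be passed wherever a
 * FaissIndex* is expected: */
typedef struct FaissIndex_H FaissIndexFlat;

/* FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat): */
FaissIndexFlat* faiss_IndexFlat_cast(FaissIndex*);
```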
+// -*- c++ -*- + +#include "GpuAutoTune_c.h" +#include "GpuClonerOptions_c.h" +#include "macros_impl.h" +#include "Index.h" +#include "gpu/GpuAutoTune.h" +#include "gpu/GpuClonerOptions.h" +#include + +using faiss::Index; +using faiss::gpu::GpuResources; +using faiss::gpu::GpuClonerOptions; +using faiss::gpu::GpuMultipleClonerOptions; + +int faiss_index_gpu_to_cpu(const FaissIndex* gpu_index, FaissIndex** p_out) { + try { + auto cpu_index = faiss::gpu::index_gpu_to_cpu( + reinterpret_cast(gpu_index) + ); + *p_out = reinterpret_cast(cpu_index); + } CATCH_AND_HANDLE +} + +/// converts any CPU index that can be converted to GPU +int faiss_index_cpu_to_gpu(FaissGpuResources* resources, int device, const FaissIndex *index, FaissGpuIndex** p_out) { + try { + auto res = reinterpret_cast(resources); + auto gpu_index = faiss::gpu::index_cpu_to_gpu( + res, device, reinterpret_cast(index) + ); + *p_out = reinterpret_cast(gpu_index); + } CATCH_AND_HANDLE +} + +int faiss_index_cpu_to_gpu_with_options( + FaissGpuResources* resources, int device, + const FaissIndex *index, const FaissGpuClonerOptions* options, + FaissGpuIndex** p_out) +{ + try { + auto res = reinterpret_cast(resources); + auto gpu_index = faiss::gpu::index_cpu_to_gpu( + res, device, reinterpret_cast(index), + reinterpret_cast(options)); + *p_out = reinterpret_cast(gpu_index); + } CATCH_AND_HANDLE +} + +int faiss_index_cpu_to_gpu_multiple( + FaissGpuResources* const* resources_vec, + const int* devices, size_t devices_size, + const FaissIndex* index, FaissGpuIndex** p_out) +{ + try { + std::vector res(devices_size); + for (auto i = 0u; i < devices_size; ++i) { + res[i] = reinterpret_cast(resources_vec[i]); + } + + std::vector dev(devices, devices + devices_size); + + auto gpu_index = faiss::gpu::index_cpu_to_gpu_multiple( + res, dev, reinterpret_cast(index)); + *p_out = reinterpret_cast(gpu_index); + } CATCH_AND_HANDLE +} + +int faiss_index_cpu_to_gpu_multiple_with_options( + FaissGpuResources** resources_vec, size_t resources_vec_size, + int* devices, size_t devices_size, + const FaissIndex* index, const FaissGpuMultipleClonerOptions* options, + FaissGpuIndex** p_out) +{ + try { + std::vector res(resources_vec_size); + for (auto i = 0u; i < resources_vec_size; ++i) { + res[i] = reinterpret_cast(resources_vec[i]); + } + + std::vector dev(devices, devices + devices_size); + + auto gpu_index = faiss::gpu::index_cpu_to_gpu_multiple( + res, dev, reinterpret_cast(index), + reinterpret_cast(options)); + *p_out = reinterpret_cast(gpu_index); + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.h new file mode 100644 index 0000000000..5dbd15c977 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuAutoTune_c.h @@ -0,0 +1,56 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
+// -*- c -*- + +#ifndef FAISS_GPU_AUTO_TUNE_C_H +#define FAISS_GPU_AUTO_TUNE_C_H + +#include +#include "faiss_c.h" +#include "GpuClonerOptions_c.h" +#include "GpuResources_c.h" +#include "GpuIndex_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// converts any GPU index inside gpu_index to a CPU index +int faiss_index_gpu_to_cpu(const FaissIndex* gpu_index, FaissIndex** p_out); + +/// converts any CPU index that can be converted to GPU +int faiss_index_cpu_to_gpu( + FaissGpuResources* resources, int device, + const FaissIndex *index, FaissGpuIndex** p_out); + +/// converts any CPU index that can be converted to GPU +int faiss_index_cpu_to_gpu_with_options( + FaissGpuResources* resources, int device, + const FaissIndex *index, const FaissGpuClonerOptions* options, + FaissGpuIndex** p_out); + +/// converts any CPU index that can be converted to GPU +int faiss_index_cpu_to_gpu_multiple( + FaissGpuResources* const* resources_vec, const int* devices, size_t devices_size, + const FaissIndex* index, FaissGpuIndex** p_out); + +/// converts any CPU index that can be converted to GPU +int faiss_index_cpu_to_gpu_multiple_with_options( + FaissGpuResources* const* resources_vec, const int* devices, size_t devices_size, + const FaissIndex* index, const FaissGpuMultipleClonerOptions* options, + FaissGpuIndex** p_out); + +/// parameter space and setters for GPU indexes +FAISS_DECLARE_CLASS_INHERITED(GpuParameterSpace, ParameterSpace) + +#ifdef __cplusplus +} +#endif +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.cpp b/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.cpp new file mode 100644 index 0000000000..c61fc5e34c --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.cpp @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
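The converters declared above also cover multi-GPU deployment. The following is a hedged sketch (not part of the change) of cloning one CPU index across two devices; it assumes the `gpu/` headers are reachable with that prefix and that `FaissStandardGpuResources` objects are created as shown in the C API notes earlier.

```c
/* Illustrative only: replicate a CPU index onto GPUs 0 and 1. */
#include <stdio.h>
#include "Index_c.h"
#include "gpu/GpuAutoTune_c.h"
#include "gpu/StandardGpuResources_c.h"
#include "error_c.h"

int clone_to_two_gpus(const FaissIndex* cpu_index, FaissGpuIndex** out) {
    FaissStandardGpuResources* res[2] = {NULL, NULL};
    int devices[2] = {0, 1};
    int rc = 0;

    for (int i = 0; i < 2 && rc == 0; ++i)
        rc = faiss_StandardGpuResources_new(&res[i]);

    if (rc == 0) {
        /* FaissStandardGpuResources aliases the FaissGpuResources opaque
         * struct, so the array can be passed as FaissGpuResources* const* */
        rc = faiss_index_cpu_to_gpu_multiple(
            (FaissGpuResources* const*)res, devices, 2, cpu_index, out);
    }
    if (rc) printf("%s\n", faiss_get_last_error());
    /* the resources objects must stay alive as long as the GPU index is used */
    return rc;
}
```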
+// -*- c++ -*- + +#include "GpuClonerOptions_c.h" +#include "gpu/GpuClonerOptions.h" +#include "macros_impl.h" + +using faiss::gpu::IndicesOptions; +using faiss::gpu::GpuClonerOptions; +using faiss::gpu::GpuMultipleClonerOptions; + +int faiss_GpuClonerOptions_new(FaissGpuClonerOptions** p) { + try { + *p = reinterpret_cast(new GpuClonerOptions()); + } CATCH_AND_HANDLE +} + +int faiss_GpuMultipleClonerOptions_new(FaissGpuMultipleClonerOptions** p) { + try { + *p = reinterpret_cast(new GpuMultipleClonerOptions()); + } CATCH_AND_HANDLE +} + +DEFINE_DESTRUCTOR(GpuClonerOptions) +DEFINE_DESTRUCTOR(GpuMultipleClonerOptions) + +DEFINE_GETTER(GpuClonerOptions, FaissIndicesOptions, indicesOptions) +DEFINE_GETTER(GpuClonerOptions, int, useFloat16CoarseQuantizer) +DEFINE_GETTER(GpuClonerOptions, int, useFloat16) +DEFINE_GETTER(GpuClonerOptions, int, usePrecomputed) +DEFINE_GETTER(GpuClonerOptions, long, reserveVecs) +DEFINE_GETTER(GpuClonerOptions, int, storeTransposed) +DEFINE_GETTER(GpuClonerOptions, int, verbose) +DEFINE_GETTER(GpuMultipleClonerOptions, int, shard) +DEFINE_GETTER(GpuMultipleClonerOptions, int, shard_type) + +DEFINE_SETTER_STATIC(GpuClonerOptions, IndicesOptions, FaissIndicesOptions, indicesOptions) +DEFINE_SETTER_STATIC(GpuClonerOptions, bool, int, useFloat16CoarseQuantizer) +DEFINE_SETTER_STATIC(GpuClonerOptions, bool, int, useFloat16) +DEFINE_SETTER_STATIC(GpuClonerOptions, bool, int, usePrecomputed) +DEFINE_SETTER(GpuClonerOptions, long, reserveVecs) +DEFINE_SETTER_STATIC(GpuClonerOptions, bool, int, storeTransposed) +DEFINE_SETTER_STATIC(GpuClonerOptions, bool, int, verbose) +DEFINE_SETTER_STATIC(GpuMultipleClonerOptions, bool, int, shard) +DEFINE_SETTER(GpuMultipleClonerOptions, int, shard_type) diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.h new file mode 100644 index 0000000000..94ff403e7a --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuClonerOptions_c.h @@ -0,0 +1,68 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_GPU_CLONER_OPTIONS_C_H +#define FAISS_GPU_CLONER_OPTIONS_C_H + +#include "faiss_c.h" +#include "GpuIndicesOptions_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +FAISS_DECLARE_CLASS(GpuClonerOptions) + +FAISS_DECLARE_DESTRUCTOR(GpuClonerOptions) + +/// Default constructor for GpuClonerOptions +int faiss_GpuClonerOptions_new(FaissGpuClonerOptions**); + +/// how should indices be stored on index types that support indices +/// (anything but GpuIndexFlat*)? +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, FaissIndicesOptions, indicesOptions) + +/// (boolean) is the coarse quantizer in float16? +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, int, useFloat16CoarseQuantizer) + +/// (boolean) for GpuIndexIVFFlat, is storage in float16? +/// for GpuIndexIVFPQ, are intermediate calculations in float16? +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, int, useFloat16) + +/// (boolean) use precomputed tables? +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, int, usePrecomputed) + +/// reserve vectors in the invfiles? +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, long, reserveVecs) + +/// (boolean) For GpuIndexFlat, store data in transposed layout? 
+FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, int, storeTransposed) + +/// (boolean) Set verbose options on the index +FAISS_DECLARE_GETTER_SETTER(GpuClonerOptions, int, verbose) + +FAISS_DECLARE_CLASS_INHERITED(GpuMultipleClonerOptions, GpuClonerOptions) + +FAISS_DECLARE_DESTRUCTOR(GpuMultipleClonerOptions) + +/// Default constructor for GpuMultipleClonerOptions +int faiss_GpuMultipleClonerOptions_new(FaissGpuMultipleClonerOptions**); + +/// (boolean) Whether to shard the index across GPUs, versus replication +/// across GPUs +FAISS_DECLARE_GETTER_SETTER(GpuMultipleClonerOptions, int, shard) + +/// IndexIVF::copy_subset_to subset type +FAISS_DECLARE_GETTER_SETTER(GpuMultipleClonerOptions, int, shard_type) + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.cpp b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.cpp new file mode 100644 index 0000000000..bdef82766e --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.cpp @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "gpu/GpuIndex.h" +#include "GpuIndex_c.h" +#include "macros_impl.h" + +using faiss::gpu::GpuIndexConfig; + +DEFINE_GETTER(GpuIndexConfig, int, device) diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.h new file mode 100644 index 0000000000..664c76101f --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndex_c.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_GPU_INDEX_C_H +#define FAISS_GPU_INDEX_C_H + +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +FAISS_DECLARE_CLASS(GpuIndexConfig) + +FAISS_DECLARE_GETTER(GpuIndexConfig, int, device) + +FAISS_DECLARE_CLASS_INHERITED(GpuIndex, Index) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndicesOptions_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndicesOptions_c.h new file mode 100644 index 0000000000..6a49773bc6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuIndicesOptions_c.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_GPU_INDICES_OPTIONS_C_H +#define FAISS_GPU_INDICES_OPTIONS_C_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// How user vector index data is stored on the GPU +typedef enum FaissIndicesOptions { + /// The user indices are only stored on the CPU; the GPU returns + /// (inverted list, offset) to the CPU which is then translated to + /// the real user index. + INDICES_CPU = 0, + /// The indices are not stored at all, on either the CPU or + /// GPU. Only (inverted list, offset) is returned to the user as the + /// index. 
+ INDICES_IVF = 1, + /// Indices are stored as 32 bit integers on the GPU, but returned + /// as 64 bit integers + INDICES_32_BIT = 2, + /// Indices are stored as 64 bit integers on the GPU + INDICES_64_BIT = 3, +} FaissIndicesOptions; + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.cpp b/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.cpp new file mode 100644 index 0000000000..3f6525125d --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.cpp @@ -0,0 +1,86 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include "gpu/GpuResources_c.h" +#include "gpu/GpuResources.h" +#include "macros_impl.h" + +using faiss::gpu::GpuResources; + +DEFINE_DESTRUCTOR(GpuResources) + +int faiss_GpuResources_initializeForDevice(FaissGpuResources* res, int device) { + try { + reinterpret_cast(res)->initializeForDevice(device); + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getBlasHandle(FaissGpuResources* res, int device, cublasHandle_t* out) { + try { + auto o = reinterpret_cast(res)->getBlasHandle(device); + *out = o; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getDefaultStream(FaissGpuResources* res, int device, cudaStream_t* out) { + try { + auto o = reinterpret_cast(res)->getDefaultStream(device); + *out = o; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getPinnedMemory(FaissGpuResources* res, void** p_buffer, size_t* p_size) { + try { + auto o = reinterpret_cast(res)->getPinnedMemory(); + *p_buffer = o.first; + *p_size = o.second; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getAsyncCopyStream(FaissGpuResources* res, int device, cudaStream_t* out) { + try { + auto o = reinterpret_cast(res)->getAsyncCopyStream(device); + *out = o; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getBlasHandleCurrentDevice(FaissGpuResources* res, cublasHandle_t* out) { + try { + auto o = reinterpret_cast(res)->getBlasHandleCurrentDevice(); + *out = o; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getDefaultStreamCurrentDevice(FaissGpuResources* res, cudaStream_t* out) { + try { + auto o = reinterpret_cast(res)->getDefaultStreamCurrentDevice(); + *out = o; + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_syncDefaultStream(FaissGpuResources* res, int device) { + try { + reinterpret_cast(res)->syncDefaultStream(device); + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_syncDefaultStreamCurrentDevice(FaissGpuResources* res) { + try { + reinterpret_cast(res)->syncDefaultStreamCurrentDevice(); + } CATCH_AND_HANDLE +} + +int faiss_GpuResources_getAsyncCopyStreamCurrentDevice(FaissGpuResources* res, cudaStream_t* out) { + try { + auto o = reinterpret_cast(res)->getAsyncCopyStreamCurrentDevice(); + *out = o; + } CATCH_AND_HANDLE +} + diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.h new file mode 100644 index 0000000000..bb9cefde36 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/GpuResources_c.h @@ -0,0 +1,66 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
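The cloner options object mirrors `GpuClonerOptions` field by field through the int getters and setters declared above. A hedged sketch (not part of the commit) of converting an index with float16 storage enabled, reusing a single-GPU resources object obtained as in the earlier notes:

```c
/* Illustrative only: clone a CPU index to GPU 0 with float16 storage enabled. */
#include <stdio.h>
#include "Index_c.h"
#include "gpu/GpuClonerOptions_c.h"
#include "gpu/GpuAutoTune_c.h"
#include "error_c.h"

int clone_with_float16(FaissGpuResources* res, const FaissIndex* cpu_index,
                       FaissGpuIndex** out) {
    FaissGpuClonerOptions* opts = NULL;
    int rc = faiss_GpuClonerOptions_new(&opts);
    if (rc) return rc;

    /* boolean options are set as 0/1 through plain int setters */
    faiss_GpuClonerOptions_set_useFloat16(opts, 1);
    faiss_GpuClonerOptions_set_verbose(opts, 1);

    rc = faiss_index_cpu_to_gpu_with_options(res, 0, cpu_index, opts, out);
    if (rc) printf("%s\n", faiss_get_last_error());

    faiss_GpuClonerOptions_free(opts);   /* options are only read during cloning */
    return rc;
}
```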
+// -*- c -*- + +#ifndef FAISS_GPU_RESOURCES_C_H +#define FAISS_GPU_RESOURCES_C_H + +#include +#include +#include "faiss_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// Base class of GPU-side resource provider; hides provision of +/// cuBLAS handles, CUDA streams and a temporary memory manager +FAISS_DECLARE_CLASS(GpuResources) + +FAISS_DECLARE_DESTRUCTOR(GpuResources) + +/// Call to pre-allocate resources for a particular device. If this is +/// not called, then resources will be allocated at the first time +/// of demand +int faiss_GpuResources_initializeForDevice(FaissGpuResources*, int); + +/// Returns the cuBLAS handle that we use for the given device +int faiss_GpuResources_getBlasHandle(FaissGpuResources*, int, cublasHandle_t*); + +/// Returns the stream that we order all computation on for the +/// given device +int faiss_GpuResources_getDefaultStream(FaissGpuResources*, int, cudaStream_t*); + +/// Returns the available CPU pinned memory buffer +int faiss_GpuResources_getPinnedMemory(FaissGpuResources*, void**, size_t*); + +/// Returns the stream on which we perform async CPU <-> GPU copies +int faiss_GpuResources_getAsyncCopyStream(FaissGpuResources*, int, cudaStream_t*); + +/// Calls getBlasHandle with the current device +int faiss_GpuResources_getBlasHandleCurrentDevice(FaissGpuResources*, cublasHandle_t*); + +/// Calls getDefaultStream with the current device +int faiss_GpuResources_getDefaultStreamCurrentDevice(FaissGpuResources*, cudaStream_t*); + +/// Synchronizes the CPU with respect to the default stream for the +/// given device +// equivalent to cudaDeviceSynchronize(getDefaultStream(device)) +int faiss_GpuResources_syncDefaultStream(FaissGpuResources*, int); + +/// Calls syncDefaultStream for the current device +int faiss_GpuResources_syncDefaultStreamCurrentDevice(FaissGpuResources*); + +/// Calls getAsyncCopyStream for the current device +int faiss_GpuResources_getAsyncCopyStreamCurrentDevice(FaissGpuResources*, cudaStream_t*); + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/Makefile b/core/src/index/thirdparty/faiss/c_api/gpu/Makefile new file mode 100644 index 0000000000..ab1f707cee --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/Makefile @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +.SUFFIXES: .cpp .o + +# C API with GPU support + +include ../../makefile.inc +DEBUGFLAG=-DNDEBUG # no debugging + +LIBNAME=libgpufaiss +CLIBNAME=libgpufaiss_c +LIBGPUCOBJ=GpuAutoTune_c.o GpuClonerOptions_c.o GpuIndex_c.o GpuResources_c.o \ + StandardGpuResources_c.o +LIBCOBJ=../libfaiss_c.a +CFLAGS=-fPIC -m64 -Wno-sign-compare -g -O3 -Wall -Wextra +CUDACFLAGS=-I$(CUDA_ROOT)/include + +# Build shared object file by default +all: $(CLIBNAME).$(SHAREDEXT) + +# Build static object file containing the wrapper implementation only. +# Consumers are required to link with the C++ standard library and remaining +# portions of this library: libfaiss_c.a, libfaiss.a, libgpufaiss.a, and libstdc++. 
+$(CLIBNAME).a: $(LIBGPUCOBJ) ../../gpu/$(LIBNAME).a + ar r $@ $^ + +# Build dynamic library +$(CLIBNAME).$(SHAREDEXT): $(LIBCOBJ) $(LIBGPUCOBJ) ../../libfaiss.a ../../gpu/$(LIBNAME).a + $(CXX) $(LDFLAGS) $(SHAREDFLAGS) $(CUDACFLAGS) -o $@ \ + -Wl,--whole-archive $(LIBCOBJ) ../../libfaiss.a \ + -Wl,--no-whole-archive -static-libstdc++ $(LIBGPUCOBJ) $(LIBS) ../../gpu/$(LIBNAME).a \ + $(NVCCLDFLAGS) $(NVCCLIBS) + +# Build GPU example +bin/example_gpu_c: example_gpu_c.c $(CLIBNAME).$(SHAREDEXT) + $(CC) $(CFLAGS) $(CUDACFLAGS) $(NVCCLIBS) -std=c99 -I. -I.. -o $@ example_gpu_c.c \ + -L. -lgpufaiss_c + +clean: + rm -f $(CLIBNAME).a $(CLIBNAME).$(SHAREDEXT)* *.o bin/example_gpu_c + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ + +# Dependencies + +GpuAutoTune_c.o: CXXFLAGS += -I.. -I../.. $(CUDACFLAGS) $(DEBUGFLAG) +GpuAutoTune_c.o: GpuAutoTune_c.cpp GpuAutoTune_c.h ../../gpu/GpuAutoTune.h ../Index_c.h ../macros_impl.h + +GpuClonerOptions_c.o: CXXFLAGS += -I.. -I../.. $(CUDACFLAGS) $(DEBUGFLAG) +GpuClonerOptions_c.o: GpuClonerOptions_c.cpp GpuClonerOptions_c.h GpuIndicesOptions_c.h ../../gpu/GpuClonerOptions.h ../macros_impl.h + +GpuIndex_c.o: CXXFLAGS += -I.. -I../.. $(CUDACFLAGS) $(DEBUGFLAG) +GpuIndex_c.o: GpuIndex_c.cpp GpuIndex_c.h ../../gpu/GpuIndex.h ../macros_impl.h + +GpuResources_c.o: CXXFLAGS += -I.. -I../.. $(CUDACFLAGS) $(DEBUGFLAG) +GpuResources_c.o: GpuResources_c.cpp GpuResources_c.h ../../gpu/GpuResources.h ../macros_impl.h + +StandardGpuResources_c.o: CXXFLAGS += -I.. -I../.. $(CUDACFLAGS) $(DEBUGFLAG) +StandardGpuResources_c.o: StandardGpuResources_c.cpp StandardGpuResources_c.h ../../gpu/StandardGpuResources.h ../macros_impl.h diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.cpp b/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.cpp new file mode 100644 index 0000000000..84afb027eb --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.cpp @@ -0,0 +1,54 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
+// -*- c++ -*-
+
+#include "gpu/StandardGpuResources_c.h"
+#include "gpu/StandardGpuResources.h"
+#include "macros_impl.h"
+
+using faiss::gpu::StandardGpuResources;
+
+DEFINE_DESTRUCTOR(StandardGpuResources)
+
+int faiss_StandardGpuResources_new(FaissStandardGpuResources** p_res) {
+    try {
+        auto p = new StandardGpuResources();
+        *p_res = reinterpret_cast<FaissStandardGpuResources*>(p);
+    } CATCH_AND_HANDLE
+}
+
+int faiss_StandardGpuResources_noTempMemory(FaissStandardGpuResources* res) {
+    try {
+        reinterpret_cast<StandardGpuResources*>(res)->noTempMemory();
+    } CATCH_AND_HANDLE
+}
+
+int faiss_StandardGpuResources_setTempMemory(FaissStandardGpuResources* res, size_t size) {
+    try {
+        reinterpret_cast<StandardGpuResources*>(res)->setTempMemory(size);
+    } CATCH_AND_HANDLE
+}
+
+int faiss_StandardGpuResources_setPinnedMemory(FaissStandardGpuResources* res, size_t size) {
+    try {
+        reinterpret_cast<StandardGpuResources*>(res)->setPinnedMemory(size);
+    } CATCH_AND_HANDLE
+}
+
+int faiss_StandardGpuResources_setDefaultStream(FaissStandardGpuResources* res, int device, cudaStream_t stream) {
+    try {
+        reinterpret_cast<StandardGpuResources*>(res)->setDefaultStream(device, stream);
+    } CATCH_AND_HANDLE
+}
+
+int faiss_StandardGpuResources_setDefaultNullStreamAllDevices(FaissStandardGpuResources* res) {
+    try {
+        reinterpret_cast<StandardGpuResources*>(res)->setDefaultNullStreamAllDevices();
+    } CATCH_AND_HANDLE
+}
diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.h b/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.h
new file mode 100644
index 0000000000..f9a3c854f0
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/c_api/gpu/StandardGpuResources_c.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c -*- + +#ifndef FAISS_STANDARD_GPURESOURCES_C_H +#define FAISS_STANDARD_GPURESOURCES_C_H + +#include +#include "faiss_c.h" +#include "gpu/GpuResources_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// Default implementation of GpuResources that allocates a cuBLAS +/// stream and 2 streams for use, as well as temporary memory +FAISS_DECLARE_CLASS_INHERITED(StandardGpuResources, GpuResources) + +FAISS_DECLARE_DESTRUCTOR(StandardGpuResources) + +/// Default constructor for StandardGpuResources +int faiss_StandardGpuResources_new(FaissStandardGpuResources**); + +/// Disable allocation of temporary memory; all temporary memory +/// requests will call cudaMalloc / cudaFree at the point of use +int faiss_StandardGpuResources_noTempMemory(FaissStandardGpuResources*); + +/// Specify that we wish to use a certain fixed size of memory on +/// all devices as temporary memory +int faiss_StandardGpuResources_setTempMemory(FaissStandardGpuResources*, size_t size); + +/// Set amount of pinned memory to allocate, for async GPU <-> CPU +/// transfers +int faiss_StandardGpuResources_setPinnedMemory(FaissStandardGpuResources*, size_t size); + +/// Called to change the stream for work ordering +int faiss_StandardGpuResources_setDefaultStream(FaissStandardGpuResources*, int device, cudaStream_t stream); + +/// Called to change the work ordering streams to the null stream +/// for all devices +int faiss_StandardGpuResources_setDefaultNullStreamAllDevices(FaissStandardGpuResources*); + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/example_gpu_c.c b/core/src/index/thirdparty/faiss/c_api/gpu/example_gpu_c.c new file mode 100644 index 0000000000..c2a10a2e30 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/example_gpu_c.c @@ -0,0 +1,106 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
+// -*- c -*- + +#include +#include +#include + +#include "error_c.h" +#include "Index_c.h" +#include "AutoTune_c.h" +#include "GpuAutoTune_c.h" +#include "StandardGpuResources_c.h" + +#define FAISS_TRY(C) \ + { \ + if (C) { \ + fprintf(stderr, "%s", faiss_get_last_error()); \ + exit(-1); \ + } \ + } + +double drand() { + return (double)rand() / (double)RAND_MAX; +} + +int main() { + time_t seed = time(NULL); + srand(seed); + printf("Generating some data...\n"); + int d = 128; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + float *xb = malloc(d * nb * sizeof(float)); + float *xq = malloc(d * nq * sizeof(float)); + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) xb[d * i + j] = drand(); + xb[d * i] += i / 1000.; + } + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) xq[d * i + j] = drand(); + xq[d * i] += i / 1000.; + } + + printf("Loading standard GPU resources...\n"); + FaissStandardGpuResources* gpu_res = NULL; + FAISS_TRY(faiss_StandardGpuResources_new(&gpu_res)); + + printf("Building an index...\n"); + FaissIndex* cpu_index = NULL; + FAISS_TRY(faiss_index_factory(&cpu_index, d, "Flat", METRIC_L2)); // use factory to create index + + printf("Moving index to the GPU...\n"); + FaissGpuIndex* index = NULL; + FaissGpuClonerOptions* options = NULL; + FAISS_TRY(faiss_GpuClonerOptions_new(&options)); + FAISS_TRY(faiss_index_cpu_to_gpu_with_options(gpu_res, 0, cpu_index, options, &index)); + + printf("is_trained = %s\n", faiss_Index_is_trained(index) ? "true" : "false"); + FAISS_TRY(faiss_Index_add(index, nb, xb)); // add vectors to the index + printf("ntotal = %ld\n", faiss_Index_ntotal(index)); + + printf("Searching...\n"); + int k = 5; + + { // sanity check: search 5 first vectors of xb + idx_t *I = malloc(k * 5 * sizeof(idx_t)); + float *D = malloc(k * 5 * sizeof(float)); + FAISS_TRY(faiss_Index_search(index, 5, xb, k, D, I)); + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) printf("%5ld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + } + { // search xq + idx_t *I = malloc(k * nq * sizeof(idx_t)); + float *D = malloc(k * nq * sizeof(float)); + FAISS_TRY(faiss_Index_search(index, 5, xb, k, D, I)); + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) printf("%5ld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + } + + printf("Freeing index...\n"); + faiss_Index_free(index); + printf("Freeing GPU resources...\n"); + faiss_GpuResources_free(gpu_res); + faiss_GpuClonerOptions_free(options); + printf("Done.\n"); + + return 0; +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/gpu/macros_impl.h b/core/src/index/thirdparty/faiss/c_api/gpu/macros_impl.h new file mode 100644 index 0000000000..3f6ea5844a --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/gpu/macros_impl.h @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. 
+// -*- c++ -*- + +#ifndef GPU_MACROS_IMPL_H +#define GPU_MACROS_IMPL_H +#include "../macros_impl.h" + +#undef DEFINE_GETTER +#define DEFINE_GETTER(clazz, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *obj) { \ + return static_cast< ty >( \ + reinterpret_cast< const faiss::gpu::clazz *>(obj)-> name \ + ); \ + } + +#undef DEFINE_SETTER +#define DEFINE_SETTER(clazz, ty, name) \ + void faiss_ ## clazz ## _set_ ## name (Faiss ## clazz *obj, ty val) { \ + reinterpret_cast< faiss::gpu::clazz *>(obj)-> name = val; \ + } + +#undef DEFINE_SETTER_STATIC +#define DEFINE_SETTER_STATIC(clazz, ty_to, ty_from, name) \ + void faiss_ ## clazz ## _set_ ## name (Faiss ## clazz *obj, ty_from val) { \ + reinterpret_cast< faiss::gpu::clazz *>(obj)-> name = \ + static_cast< ty_to >(val); \ + } + +#undef DEFINE_DESTRUCTOR +#define DEFINE_DESTRUCTOR(clazz) \ + void faiss_ ## clazz ## _free (Faiss ## clazz *obj) { \ + delete reinterpret_cast(obj); \ + } + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp b/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp new file mode 100644 index 0000000000..479045e1fb --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp @@ -0,0 +1,49 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- +// I/O code for indexes + +#include "index_io_c.h" +#include "index_io.h" +#include "macros_impl.h" + +using faiss::Index; + +int faiss_write_index(const FaissIndex *idx, FILE *f) { + try { + faiss::write_index(reinterpret_cast(idx), f); + } CATCH_AND_HANDLE +} + +int faiss_write_index_fname(const FaissIndex *idx, const char *fname) { + try { + faiss::write_index(reinterpret_cast(idx), fname); + } CATCH_AND_HANDLE +} + +int faiss_read_index(FILE *f, int io_flags, FaissIndex **p_out) { + try { + auto out = faiss::read_index(f, io_flags); + *p_out = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out) { + try { + auto out = faiss::read_index(fname, io_flags); + *p_out = reinterpret_cast(out); + } CATCH_AND_HANDLE +} + +int faiss_clone_index (const FaissIndex *idx, FaissIndex **p_out) { + try { + auto out = faiss::clone_index(reinterpret_cast(idx)); + *p_out = reinterpret_cast(out); + } CATCH_AND_HANDLE +} diff --git a/core/src/index/thirdparty/faiss/c_api/index_io_c.h b/core/src/index/thirdparty/faiss/c_api/index_io_c.h new file mode 100644 index 0000000000..f20ce2e644 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.h @@ -0,0 +1,55 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- +// I/O code for indexes + + +#ifndef FAISS_INDEX_IO_C_H +#define FAISS_INDEX_IO_C_H + +#include +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Write index to a file. + * This is equivalent to `faiss::write_index` when a file descriptor is provided. + */ +int faiss_write_index(const FaissIndex *idx, FILE *f); + +/** Write index to a file. + * This is equivalent to `faiss::write_index` when a file path is provided. 
+ */ +int faiss_write_index_fname(const FaissIndex *idx, const char *fname); + +#define FAISS_IO_FLAG_MMAP 1 +#define FAISS_IO_FLAG_READ_ONLY 2 + +/** Read index from a file. + * This is equivalent to `faiss:read_index` when a file descriptor is given. + */ +int faiss_read_index(FILE *f, int io_flags, FaissIndex **p_out); + +/** Read index from a file. + * This is equivalent to `faiss:read_index` when a file path is given. + */ +int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out); + +/* cloning functions */ + +/** Clone an index. This is equivalent to `faiss::clone_index` */ +int faiss_clone_index (const FaissIndex *, FaissIndex ** p_out); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/macros_impl.h b/core/src/index/thirdparty/faiss/c_api/macros_impl.h new file mode 100644 index 0000000000..af07938018 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/macros_impl.h @@ -0,0 +1,110 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +/// Utility macros for the C wrapper implementation. + +#ifndef MACROS_IMPL_H +#define MACROS_IMPL_H + +#include "faiss_c.h" +#include "FaissException.h" +#include "error_impl.h" +#include +#include + +#ifdef NDEBUG +#define CATCH_AND_HANDLE \ + catch (faiss::FaissException& e) { \ + faiss_last_exception = \ + std::make_exception_ptr(e); \ + return -2; \ + } catch (std::exception& e) { \ + faiss_last_exception = \ + std::make_exception_ptr(e); \ + return -4; \ + } catch (...) { \ + faiss_last_exception = \ + std::make_exception_ptr( \ + std::runtime_error("Unknown error")); \ + return -1; \ + } return 0; +#else +#define CATCH_AND_HANDLE \ + catch (faiss::FaissException& e) { \ + std::cerr << e.what() << '\n'; \ + faiss_last_exception = \ + std::make_exception_ptr(e); \ + return -2; \ + } catch (std::exception& e) { \ + std::cerr << e.what() << '\n'; \ + faiss_last_exception = \ + std::make_exception_ptr(e); \ + return -4; \ + } catch (...) 
{ \ + std::cerr << "Unrecognized exception!\n"; \ + faiss_last_exception = \ + std::make_exception_ptr( \ + std::runtime_error("Unknown error")); \ + return -1; \ + } return 0; +#endif + +#define DEFINE_GETTER(clazz, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *obj) { \ + return static_cast< ty >( \ + reinterpret_cast< const faiss::clazz *>(obj)-> name \ + ); \ + } + +#define DEFINE_GETTER_SUBCLASS(clazz, parent, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *obj) { \ + return static_cast< ty >( \ + reinterpret_cast(obj)-> name \ + ); \ + } + +#define DEFINE_GETTER_PERMISSIVE(clazz, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *obj) { \ + return ( ty ) ( \ + reinterpret_cast(obj)-> name \ + ); \ + } + +#define DEFINE_GETTER_SUBCLASS_PERMISSIVE(clazz, parent, ty, name) \ + ty faiss_ ## clazz ## _ ## name (const Faiss ## clazz *obj) { \ + return ( ty ) ( \ + reinterpret_cast(obj)-> name \ + ); \ + } + +#define DEFINE_SETTER(clazz, ty, name) \ + void faiss_ ## clazz ## _set_ ## name (Faiss ## clazz *obj, ty val) { \ + reinterpret_cast< faiss::clazz *>(obj)-> name = val; \ + } + +#define DEFINE_SETTER_STATIC(clazz, ty_to, ty_from, name) \ + void faiss_ ## clazz ## _set_ ## name (Faiss ## clazz *obj, ty_from val) { \ + reinterpret_cast< faiss::clazz *>(obj)-> name = \ + static_cast< ty_to >(val); \ + } + +#define DEFINE_DESTRUCTOR(clazz) \ + void faiss_ ## clazz ## _free (Faiss ## clazz *obj) { \ + delete reinterpret_cast(obj); \ + } + +#define DEFINE_INDEX_DOWNCAST(clazz) \ + Faiss ## clazz * faiss_ ## clazz ## _cast (FaissIndex* index) { \ + return reinterpret_cast( \ + dynamic_cast< faiss::clazz *>( \ + reinterpret_cast(index))); \ + } + +#endif diff --git a/core/src/index/thirdparty/faiss/ci/jenkins/Jenkinsfile b/core/src/index/thirdparty/faiss/ci/jenkins/Jenkinsfile new file mode 100644 index 0000000000..347e53a693 --- /dev/null +++ b/core/src/index/thirdparty/faiss/ci/jenkins/Jenkinsfile @@ -0,0 +1,579 @@ +pipeline { + agent none + + options { + timestamps() + } + + parameters{ + choice choices: ['Release', 'Debug'], description: 'Build Type', name: 'BUILD_TYPE' + choice choices: ['False', 'True'], description: 'Whether it is origin Faiss', name: 'IS_ORIGIN_FAISS' + string defaultValue: 'registry.zilliz.com', description: 'DOCKER REGISTRY URL', name: 'DOKCER_REGISTRY_URL', trim: true + string defaultValue: 'a54e38ef-c424-4ea9-9224-b25fc20e3924', description: 'DOCKER CREDENTIALS ID', name: 'DOCKER_CREDENTIALS_ID', trim: true + string defaultValue: 'http://192.168.1.201/artifactory/milvus', description: 'JFROG ARTFACTORY URL', name: 'JFROG_ARTFACTORY_URL', trim: true + string defaultValue: '76fd48ab-2b8e-4eed-834d-2eefd23bb3a6', description: 'JFROG CREDENTIALS ID', name: 'JFROG_CREDENTIALS_ID', trim: true + } + + environment { + PROJECT_NAME = "milvus" + LOWER_BUILD_TYPE = params.BUILD_TYPE.toLowerCase() + SEMVER = "0.6.0" + JOBNAMES = env.JOB_NAME.split('/') + PIPELINE_NAME = "${JOBNAMES[0]}" + FAISS_ROOT_PATH="/usr/local/faiss" + NATIVE_FAISS_VERSION="1.6.0" + } + + stages { + stage("Ubuntu 18.04 x86_64") { + environment { + OS_NAME = "ubuntu18.04" + CPU_ARCH = "amd64" + } + + parallel { + stage("GPU Version") { + environment { + BINRARY_VERSION = "gpu" + PACKAGE_VERSION = VersionNumber([ + versionNumberString : '${SEMVER}-gpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}' + ]); + DOCKER_VERSION = "${SEMVER}-gpu-${OS_NAME}-${LOWER_BUILD_TYPE}" + } + + stages { + stage("Run 
Build") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-build" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + name: milvus-gpu-build-env + labels: + app: milvus + componet: gpu-build-env +spec: + containers: + - name: milvus-gpu-build-env + image: registry.zilliz.com/milvus/milvus-gpu-build-env:v0.6.0-ubuntu18.04 + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: BUILD_ENV_IMAGE_ID + value: "da9023b0f858f072672f86483a869aa87e90a5140864f89e5a012ec766d96dea" + command: + - cat + tty: true + resources: + limits: + memory: "24Gi" + cpu: "8.0" + nvidia.com/gpu: 1 + requests: + memory: "16Gi" + cpu: "4.0" + - name: milvus-mysql + image: mysql:5.6 + env: + - name: MYSQL_ROOT_PASSWORD + value: 123456 + ports: + - containerPort: 3306 + name: mysql + """ + } + } + + stages { + stage('Prepare') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage('Build') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/build.groovy" + } + } + } + } + stage('Code Coverage') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/coverage.groovy" + } + } + } + } + stage('Upload Package') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + dir ("milvus") { + load "ci/jenkins/step/package.groovy" + } + } + } + } + } + } + } + + stage("Publish docker images") { + agent { + kubernetes { + label "${BINRARY_VERSION}-publish" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: publish + componet: docker +spec: + containers: + - name: publish-images + image: registry.zilliz.com/library/docker:v1.0.0 + securityContext: + privileged: true + command: + - cat + tty: true + volumeMounts: + - name: docker-sock + mountPath: /var/run/docker.sock + volumes: + - name: docker-sock + hostPath: + path: /var/run/docker.sock +""" + } + } + + stages { + stage('Prepare') { + steps { + container('publish-images') { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage('Publish') { + steps { + container('publish-images') { + script { + dir ("milvus") { + load "ci/jenkins/step/publishImages.groovy" + } + } + } + } + } + } + } + + stage("Deploy to Development") { + environment { + FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-") + HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase() + } + + agent { + kubernetes { + label "${env.BINRARY_VERSION}-dev-test" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: milvus + componet: test-env +spec: + containers: + - name: milvus-test-env + image: registry.zilliz.com/milvus/milvus-test-env:v0.1 + command: + - cat + tty: true + volumeMounts: + - name: kubeconf + mountPath: /root/.kube/ + readOnly: true + volumes: + - name: kubeconf + secret: + secretName: test-cluster-config +""" + } + 
} + + stages { + stage('Prepare') { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage("Deploy to Dev") { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/deploySingle2Dev.groovy" + } + } + } + } + } + + stage("Dev Test") { + steps { + container('milvus-test-env') { + script { + boolean isNightlyTest = isTimeTriggeredBuild() + if (isNightlyTest) { + dir ("milvus") { + load "ci/jenkins/step/singleDevNightlyTest.groovy" + } + } else { + dir ("milvus") { + load "ci/jenkins/step/singleDevTest.groovy" + } + } + } + } + } + } + + stage ("Cleanup Dev") { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + post { + unsuccessful { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + } + } + + stage("CPU Version") { + environment { + BINRARY_VERSION = "cpu" + PACKAGE_VERSION = VersionNumber([ + versionNumberString : '${SEMVER}-cpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}' + ]); + DOCKER_VERSION = "${SEMVER}-cpu-${OS_NAME}-${LOWER_BUILD_TYPE}" + } + + stages { + stage("Run Build") { + agent { + kubernetes { + label "${env.BINRARY_VERSION}-build" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + name: milvus-cpu-build-env + labels: + app: milvus + componet: cpu-build-env +spec: + containers: + - name: milvus-cpu-build-env + image: registry.zilliz.com/milvus/milvus-cpu-build-env:v0.6.0-ubuntu18.04 + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: BUILD_ENV_IMAGE_ID + value: "23476391bec80c64f10d44a6370c73c71f011a6b95114b10ff82a60e771e11c7" + command: + - cat + tty: true + resources: + limits: + memory: "24Gi" + cpu: "8.0" + requests: + memory: "16Gi" + cpu: "4.0" + - name: milvus-mysql + image: mysql:5.6 + env: + - name: MYSQL_ROOT_PASSWORD + value: 123456 + ports: + - containerPort: 3306 + name: mysql + """ + } + } + + stages { + stage('Prepare') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage('Build') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/build.groovy" + } + } + } + } + stage('Code Coverage') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + load "${env.WORKSPACE}/ci/jenkins/step/coverage.groovy" + } + } + } + } + stage('Upload Package') { + steps { + container("milvus-${env.BINRARY_VERSION}-build-env") { + script { + dir ("milvus") { + load "ci/jenkins/step/package.groovy" + } + } + } + } + } + } + } + + stage("Publish docker images") { + agent { + kubernetes { + label "${BINRARY_VERSION}-publish" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: publish + componet: docker +spec: + containers: + - name: publish-images + image: 
registry.zilliz.com/library/docker:v1.0.0 + securityContext: + privileged: true + command: + - cat + tty: true + volumeMounts: + - name: docker-sock + mountPath: /var/run/docker.sock + volumes: + - name: docker-sock + hostPath: + path: /var/run/docker.sock +""" + } + } + + stages { + stage('Prepare') { + steps { + container("publish-images") { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage('Publish') { + steps { + container('publish-images'){ + script { + dir ("milvus") { + load "ci/jenkins/step/publishImages.groovy" + } + } + } + } + } + } + } + + stage("Deploy to Development") { + environment { + FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-") + HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase() + } + + agent { + kubernetes { + label "${env.BINRARY_VERSION}-dev-test" + defaultContainer 'jnlp' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: milvus + componet: test-env +spec: + containers: + - name: milvus-test-env + image: registry.zilliz.com/milvus/milvus-test-env:v0.1 + command: + - cat + tty: true + volumeMounts: + - name: kubeconf + mountPath: /root/.kube/ + readOnly: true + volumes: + - name: kubeconf + secret: + secretName: test-cluster-config +""" + } + } + + stages { + stage('Prepare') { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + checkout([$class: 'GitSCM', branches: [[name: "${env.SEMVER}"]], userRemoteConfigs: [[url: "https://github.com/milvus-io/milvus.git", name: 'origin', refspec: "+refs/heads/${env.SEMVER}:refs/remotes/origin/${env.SEMVER}"]]]) + } + } + } + } + } + + stage("Deploy to Dev") { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/deploySingle2Dev.groovy" + } + } + } + } + } + + stage("Dev Test") { + steps { + container('milvus-test-env') { + script { + boolean isNightlyTest = isTimeTriggeredBuild() + if (isNightlyTest) { + dir ("milvus") { + load "ci/jenkins/step/singleDevNightlyTest.groovy" + } + } else { + dir ("milvus") { + load "ci/jenkins/step/singleDevTest.groovy" + } + } + } + } + } + } + + stage ("Cleanup Dev") { + steps { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + post { + unsuccessful { + container('milvus-test-env') { + script { + dir ("milvus") { + load "ci/jenkins/step/cleanupSingleDev.groovy" + } + } + } + } + } + } + } + } + } + } + } +} + +boolean isTimeTriggeredBuild() { + if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause').size() != 0) { + return true + } + return false +} diff --git a/core/src/index/thirdparty/faiss/ci/jenkins/step/build.groovy b/core/src/index/thirdparty/faiss/ci/jenkins/step/build.groovy new file mode 100644 index 0000000000..6013aea5ea --- /dev/null +++ b/core/src/index/thirdparty/faiss/ci/jenkins/step/build.groovy @@ -0,0 +1,35 @@ +timeout(time: 60, unit: 'MINUTES') { + dir ("ci/scripts") { + if ("${env.BINRARY_VERSION}" == "gpu") { + if ("${params.IS_ORIGIN_FAISS}" == "False") { + sh "./build.sh -o ${env.FAISS_ROOT_PATH} -i -g" + } else { + sh "wget https://github.com/facebookresearch/faiss/archive/v${env.NATIVE_FAISS_VERSION}.tar.gz && \ + tar zxvf v${env.NATIVE_FAISS_VERSION}.tar.gz" + 
sh "./build.sh -o ${env.FAISS_ROOT_PATH} -s ./faiss-${env.NATIVE_FAISS_VERSION} -i -g" + } + } else { + sh "wget https://github.com/facebookresearch/faiss/archive/v${env.NATIVE_FAISS_VERSION}.tar.gz && \ + tar zxvf v${env.NATIVE_FAISS_VERSION}.tar.gz" + sh "./build.sh -o ${env.FAISS_ROOT_PATH} -s ./faiss-${env.NATIVE_FAISS_VERSION} -i" + } + } + + dir ("milvus") { + dir ("ci/scripts") { + withCredentials([usernamePassword(credentialsId: "${params.JFROG_CREDENTIALS_ID}", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { + def checkResult = sh(script: "./check_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache", returnStatus: true) + if ("${env.BINRARY_VERSION}" == "gpu") { + if ("${params.IS_ORIGIN_FAISS}" == "False") { + sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -f ${env.FAISS_ROOT_PATH} -l -m -g -x -u -c" + } else { + sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -f ${env.FAISS_ROOT_PATH} -l -m -g -u -c" + } + } else { + sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -f ${env.FAISS_ROOT_PATH} -l -m -u -c" + } + sh "./update_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache -u ${USERNAME} -p ${PASSWORD}" + } + } + } +} diff --git a/core/src/index/thirdparty/faiss/ci/jenkins/step/coverage.groovy b/core/src/index/thirdparty/faiss/ci/jenkins/step/coverage.groovy new file mode 100644 index 0000000000..40d2d2f03a --- /dev/null +++ b/core/src/index/thirdparty/faiss/ci/jenkins/step/coverage.groovy @@ -0,0 +1,5 @@ +timeout(time: 30, unit: 'MINUTES') { + dir ("milvus/ci/scripts") { + sh "./coverage.sh -o /opt/milvus -u root -p 123456 -t \$POD_IP" + } +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/ci/scripts/build.sh b/core/src/index/thirdparty/faiss/ci/scripts/build.sh new file mode 100755 index 0000000000..f3bb2dcaca --- /dev/null +++ b/core/src/index/thirdparty/faiss/ci/scripts/build.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +set -e + +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" + SOURCE="$(readlink "$SOURCE")" + [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +SCRIPTS_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" + +FAISS_SOURCE_DIR="${SCRIPTS_DIR}/../.." 
+ +FAISS_WITH_MKL="False" +FAISS_GPU_VERSION="False" +FAISS_COMMON_CONFIGURE_ARGS="CXXFLAGS=\"-mavx2 -mf16c\" --without-python" +FAISS_CONFIGURE_ARGS="${FAISS_COMMON_CONFIGURE_ARGS}" +CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" +FAISS_CUDA_ARCH="-gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75" +MKL_ROOT_DIR="/opt/intel/compilers_and_libraries_2019.5.281/linux/mkl" + +while getopts "o:s:m:b:l:c:a:igh" arg +do + case $arg in + o) + FAISS_INSTALL_PREFIX=$OPTARG + ;; + s) + FAISS_SOURCE_DIR=$OPTARG + ;; + m) + MKL_ROOT_DIR=$OPTARG + ;; + b) + OPENBLAS_PREFIX=$OPTARG + ;; + l) + LAPACK_PREFIX=$OPTARG + ;; + c) + CUDA_TOOLKIT_ROOT_DIR=$OPTARG + ;; + a) + FAISS_CUDA_ARCH=$OPTARG + ;; + i) + FAISS_WITH_MKL="True" + ;; + g) + FAISS_GPU_VERSION="True" + ;; + h) # help + echo " + +parameter: +-o: faiss install prefix path +-s: faiss source directory +-m: mkl root directory +-b: openblas install prefix path +-l: lapack install prefix path +-c: CUDA toolkit root directory +-a: faiss CUDA compute architecture +-i: faiss with mkl +-g: faiss gpu version +-h: help + +usage: +./build.sh -o \${FAISS_INSTALL_PREFIX} -s \${FAISS_SOURCE_DIR} -m \${MKL_ROOT_DIR} -b \${OPENBLAS_PREFIX} -l \${LAPACK_PREFIX} -c \${CUDA_TOOLKIT_ROOT_DIR} -a \${FAISS_CUDA_ARCH} [-i] [-g] [-h] + " + exit 0 + ;; + ?) + echo "ERROR! unknown argument" + exit 1 + ;; + esac +done + +if [[ -n "${FAISS_INSTALL_PREFIX}" ]];then + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} --prefix=${FAISS_INSTALL_PREFIX}" +fi + +if [[ "${FAISS_GPU_VERSION}" == "True" ]];then + if [[ ! -n "${FAISS_CUDA_ARCH}" ]];then + echo "FAISS_CUDA_ARCH: \"${FAISS_CUDA_ARCH}\" is empty!" + exit 1 + fi + if [[ ! -d "${CUDA_TOOLKIT_ROOT_DIR}" ]];then + echo "CUDA_TOOLKIT_ROOT_DIR: \"${CUDA_TOOLKIT_ROOT_DIR}\" directory doesn't exist!" + exit 1 + fi + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} --with-cuda-arch='${FAISS_CUDA_ARCH}'" +else + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} --without-cuda" +fi + +if [[ "${FAISS_WITH_MKL}" == "True" ]];then + if [[ ! -d "${MKL_ROOT_DIR}" ]];then + echo "MKL_ROOT_DIR: \"${MKL_ROOT_DIR}\" directory doesn't exist!" + exit 1 + fi + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} CPPFLAGS='-DFINTEGER=long -DMKL_ILP64 -m64 -I${MKL_ROOT_DIR}/include' LDFLAGS='-L${MKL_ROOT_DIR}/lib/intel64'" +else + if [[ -n "${LAPACK_PREFIX}" ]];then + if [[ ! -d "${LAPACK_PREFIX}" ]];then + echo "LAPACK_PREFIX: \"${LAPACK_PREFIX}\" directory doesn't exist!" + exit 1 + fi + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} LDFLAGS=-L${LAPACK_PREFIX}/lib" + fi + if [[ -n "${OPENBLAS_PREFIX}" ]];then + if [[ ! -d "${OPENBLAS_PREFIX}" ]];then + echo "OPENBLAS_PREFIX: \"${OPENBLAS_PREFIX}\" directory doesn't exist!" + exit 1 + fi + FAISS_CONFIGURE_ARGS="${FAISS_CONFIGURE_ARGS} LDFLAGS=-L${OPENBLAS_PREFIX}/lib" + fi +fi + +cd ${FAISS_SOURCE_DIR} + +sh -c "./configure ${FAISS_CONFIGURE_ARGS}" + +# compile and build +make -j8 || exit 1 +make install || exit 1 diff --git a/core/src/index/thirdparty/faiss/clone_index.cpp b/core/src/index/thirdparty/faiss/clone_index.cpp new file mode 100644 index 0000000000..8258d3fa6f --- /dev/null +++ b/core/src/index/thirdparty/faiss/clone_index.cpp @@ -0,0 +1,150 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { + +/************************************************************* + * cloning functions + **************************************************************/ + + + +Index * clone_index (const Index *index) +{ + Cloner cl; + return cl.clone_Index (index); +} + +// assumes there is a copy constructor ready. Always try from most +// specific to most general. Most indexes don't have complicated +// structs, the default copy constructor often just works. +#define TRYCLONE(classname, obj) \ + if (const classname *clo = dynamic_cast(obj)) { \ + return new classname(*clo); \ + } else + +VectorTransform *Cloner::clone_VectorTransform (const VectorTransform *vt) +{ + TRYCLONE (RemapDimensionsTransform, vt) + TRYCLONE (OPQMatrix, vt) + TRYCLONE (PCAMatrix, vt) + TRYCLONE (ITQMatrix, vt) + TRYCLONE (RandomRotationMatrix, vt) + TRYCLONE (LinearTransform, vt) + { + FAISS_THROW_MSG("clone not supported for this type of VectorTransform"); + } + return nullptr; +} + +IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf) +{ + TRYCLONE (IndexIVFPQR, ivf) + TRYCLONE (IndexIVFPQ, ivf) + TRYCLONE (IndexIVFFlat, ivf) + TRYCLONE (IndexIVFScalarQuantizer, ivf) + TRYCLONE (IndexIVFSQHybrid, ivf) + { + FAISS_THROW_MSG("clone not supported for this type of IndexIVF"); + } + return nullptr; +} + +Index *Cloner::clone_Index (IndexComposition* index_composition) { + FAISS_THROW_MSG( "Not implemented"); +} + +Index *Cloner::clone_Index (const Index *index) +{ + TRYCLONE (IndexPQ, index) + TRYCLONE (IndexLSH, index) + TRYCLONE (IndexFlatL2, index) + TRYCLONE (IndexFlatIP, index) + TRYCLONE (IndexFlat, index) + TRYCLONE (IndexLattice, index) + TRYCLONE (IndexScalarQuantizer, index) + TRYCLONE (MultiIndexQuantizer, index) + if (const IndexIVF * ivf = dynamic_cast(index)) { + IndexIVF *res = clone_IndexIVF (ivf); + if (ivf->invlists == nullptr) { + res->invlists = nullptr; + } else if (auto *ails = dynamic_cast + (ivf->invlists)) { + res->invlists = new ArrayInvertedLists(*ails); + res->own_invlists = true; + } else if (auto *ails = dynamic_cast(ivf->invlists)) { + res->invlists = new ReadOnlyArrayInvertedLists(*ails); + res->own_invlists = true; + } else { + FAISS_THROW_MSG( "clone not supported for this type of inverted lists"); + } + res->own_fields = true; + res->quantizer = clone_Index (ivf->quantizer); + return res; + } else if (const IndexPreTransform * ipt = + dynamic_cast (index)) { + IndexPreTransform *res = new IndexPreTransform (); + res->d = ipt->d; + res->index = clone_Index (ipt->index); + for (int i = 0; i < ipt->chain.size(); i++) + res->chain.push_back (clone_VectorTransform (ipt->chain[i])); + res->own_fields = true; + return res; + } else if (const IndexIDMap *idmap = + dynamic_cast (index)) { + IndexIDMap *res = new IndexIDMap (*idmap); + res->own_fields = true; + res->index = clone_Index (idmap->index); + return res; + } else if (const IndexHNSW *ihnsw = + dynamic_cast (index)) { + IndexHNSW *res = new IndexHNSW (*ihnsw); + res->own_fields = true; + res->storage = clone_Index (ihnsw->storage); + return res; + } else if (const Index2Layer *i2l = + dynamic_cast (index)) { + Index2Layer *res = new Index2Layer (*i2l); + res->q1.own_fields = true; + res->q1.quantizer = clone_Index (i2l->q1.quantizer); + return res; + } else { + FAISS_THROW_MSG( "clone 
not supported for this type of Index"); + } + return nullptr; +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/clone_index.h b/core/src/index/thirdparty/faiss/clone_index.h new file mode 100644 index 0000000000..45990c93f7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/clone_index.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +// I/O code for indexes + +#pragma once + + + +namespace faiss { + +struct Index; +struct IndexIVF; +struct VectorTransform; + +namespace gpu { +struct GpuIndexFlat; +} + +/* cloning functions */ +Index *clone_index (const Index *); + +struct IndexComposition { + Index *index = nullptr; + gpu::GpuIndexFlat *quantizer = nullptr; + long mode = 0; // 0: all data, 1: copy quantizer, 2: copy data +}; + +/** Cloner class, useful to override classes with other cloning + * functions. The cloning function above just calls + * Cloner::clone_Index. */ +struct Cloner { + virtual VectorTransform *clone_VectorTransform (const VectorTransform *); + virtual Index *clone_Index (const Index *); + virtual Index *clone_Index (IndexComposition* index_composition); + virtual IndexIVF *clone_IndexIVF (const IndexIVF *); + virtual ~Cloner() {} +}; + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/conda/Dockerfile b/core/src/index/thirdparty/faiss/conda/Dockerfile new file mode 100644 index 0000000000..9184e8fea3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/Dockerfile @@ -0,0 +1,33 @@ +FROM soumith/conda-cuda:latest + +COPY ./ faiss +WORKDIR /faiss/conda + +ENV FAISS_BUILD_VERSION 1.5.3 +ENV FAISS_BUILD_NUMBER 0 +RUN conda build faiss --no-anaconda-upload -c pytorch +RUN CUDA_ROOT=/usr/local/cuda-8.0 \ + CUDA_ARCH="-gencode=arch=compute_35,code=compute_35 \ + -gencode=arch=compute_52,code=compute_52 \ + -gencode=arch=compute_60,code=compute_60 \ + -gencode=arch=compute_61,code=compute_61" \ + conda build faiss-gpu --variants '{ "cudatoolkit": "8.0" }' \ + --no-anaconda-upload -c pytorch --no-test +RUN CUDA_ROOT=/usr/local/cuda-9.0 \ + CUDA_ARCH="-gencode=arch=compute_35,code=compute_35 \ + -gencode=arch=compute_52,code=compute_52 \ + -gencode=arch=compute_60,code=compute_60 \ + -gencode=arch=compute_61,code=compute_61 \ + -gencode=arch=compute_70,code=compute_70" \ + conda build faiss-gpu --variants '{ "cudatoolkit": "9.0" }' \ + --no-anaconda-upload -c pytorch --no-test +RUN CUDA_ROOT=/usr/local/cuda-10.0 \ + CUDA_ARCH="-gencode=arch=compute_35,code=compute_35 \ + -gencode=arch=compute_52,code=compute_52 \ + -gencode=arch=compute_60,code=compute_60 \ + -gencode=arch=compute_61,code=compute_61 \ + -gencode=arch=compute_70,code=compute_70 \ + -gencode=arch=compute_72,code=compute_72 \ + -gencode=arch=compute_75,code=compute_75" \ + conda build faiss-gpu --variants '{ "cudatoolkit": "10.0" }' \ + --no-anaconda-upload -c pytorch --no-test diff --git a/core/src/index/thirdparty/faiss/conda/conda_build_config.yaml b/core/src/index/thirdparty/faiss/conda/conda_build_config.yaml new file mode 100644 index 0000000000..e9f0a51d26 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/conda_build_config.yaml @@ -0,0 +1,7 @@ +CONDA_BUILD_SYSROOT: + - /opt/MacOSX10.9.sdk # [osx] +python: + - 2.7 + - 3.5 + - 3.6 + - 3.7 diff --git a/core/src/index/thirdparty/faiss/conda/faiss-gpu/build.sh b/core/src/index/thirdparty/faiss/conda/faiss-gpu/build.sh new 
file mode 100644 index 0000000000..25326c90d9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss-gpu/build.sh @@ -0,0 +1,16 @@ +# Build avx2 version +CXXFLAGS="-mavx2 -mf16c" ./configure --with-cuda=$CUDA_ROOT --with-cuda-arch="$CUDA_ARCH" +make -j $CPU_COUNT +make -C python _swigfaiss_avx2.so +make clean + +# Build vanilla version (no avx) +./configure --with-cuda=$CUDA_ROOT --with-cuda-arch="$CUDA_ARCH" +make -j $CPU_COUNT +make -C python _swigfaiss.so + +make -C python build + +cd python + +$PYTHON setup.py install --single-version-externally-managed --record=record.txt diff --git a/core/src/index/thirdparty/faiss/conda/faiss-gpu/conda_build_config.yaml b/core/src/index/thirdparty/faiss/conda/faiss-gpu/conda_build_config.yaml new file mode 100644 index 0000000000..da98e5d414 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss-gpu/conda_build_config.yaml @@ -0,0 +1,11 @@ +cxx_compiler_version: + - 5.4 +cudatoolkit: + - 8.0 + - 9.0 + - 9.2 + - 10.0 + - 10.1 +pin_run_as_build: + cudatoolkit: + max_pin: x.x diff --git a/core/src/index/thirdparty/faiss/conda/faiss-gpu/meta.yaml b/core/src/index/thirdparty/faiss/conda/faiss-gpu/meta.yaml new file mode 100644 index 0000000000..886531bafc --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss-gpu/meta.yaml @@ -0,0 +1,41 @@ +package: + name: faiss-gpu + version: "{{ FAISS_BUILD_VERSION }}" + +source: + git_url: ../../ + +requirements: + build: + - {{ compiler('cxx') }} + - llvm-openmp # [osx] + - setuptools + - swig + + host: + - python {{ python }} + - intel-openmp # [osx] + - numpy 1.11.* + - mkl >=2018 + - cudatoolkit {{ cudatoolkit }} + + run: + - python {{ python }} + - intel-openmp # [osx] + - numpy >=1.11 + - mkl >=2018 + - blas=*=mkl + - {{ pin_compatible('cudatoolkit') }} + +build: + number: {{ FAISS_BUILD_NUMBER }} + script_env: + - CUDA_ROOT + - CUDA_ARCH + +about: + home: https://github.com/facebookresearch/faiss + license: MIT + license_family: MIT + license_file: LICENSE + summary: A library for efficient similarity search and clustering of dense vectors. 
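The conda recipes above only build and install the packages. As a quick sanity check of the GPU build, a minimal sketch (assuming a CUDA-capable device 0 is visible and the `faiss` module from this recipe is importable) is to clone a small flat index onto the GPU and query it; the search should be issued against the GPU-side index rather than the CPU index it was cloned from:

# Illustrative GPU smoke test; not part of the recipe itself.
import faiss
import numpy as np

d = 128
n = 100

rs = np.random.RandomState(1337)
x = rs.rand(n, d).astype(np.float32)

cpu_index = faiss.IndexFlatL2(d)
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # device 0 assumed

gpu_index.add(x)
# Query the GPU index; with exact L2 search each vector is its own nearest neighbor.
D, I = gpu_index.search(x, 10)
assert (I[:, 0] == np.arange(n)).all()

These are the same entry points the C example earlier in this patch exercises through the C API (faiss_StandardGpuResources_new and faiss_index_cpu_to_gpu_with_options).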
diff --git a/core/src/index/thirdparty/faiss/conda/faiss-gpu/run_test.py b/core/src/index/thirdparty/faiss/conda/faiss-gpu/run_test.py new file mode 100644 index 0000000000..68e0bbc3e3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss-gpu/run_test.py @@ -0,0 +1,16 @@ +import faiss +import numpy as np + +d = 128 +n = 100 + +rs = np.random.RandomState(1337) +x = rs.rand(n, d).astype(np.float32) + +index = faiss.IndexFlatL2(d) + +res = faiss.StandardGpuResources() +gpu_index = faiss.index_cpu_to_gpu(res, 0, index) +gpu_index.add(x) + +D, I = index.search(x, 10) diff --git a/core/src/index/thirdparty/faiss/conda/faiss/build.sh b/core/src/index/thirdparty/faiss/conda/faiss/build.sh new file mode 100644 index 0000000000..87ccb4cad0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss/build.sh @@ -0,0 +1,16 @@ +# Build avx2 version +CXXFLAGS="-mavx2 -mf16c" ./configure --without-cuda +make -j $CPU_COUNT +make -C python _swigfaiss_avx2.so +make clean + +# Build vanilla version (no avx) +./configure --without-cuda +make -j $CPU_COUNT +make -C python _swigfaiss.so + +make -C python build + +cd python + +$PYTHON setup.py install --single-version-externally-managed --record=record.txt diff --git a/core/src/index/thirdparty/faiss/conda/faiss/meta.yaml b/core/src/index/thirdparty/faiss/conda/faiss/meta.yaml new file mode 100644 index 0000000000..e765cf388d --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss/meta.yaml @@ -0,0 +1,36 @@ +package: + name: faiss-cpu + version: "{{ FAISS_BUILD_VERSION }}" + +source: + git_url: ../../ + +requirements: + build: + - {{ compiler('cxx') }} + - llvm-openmp # [osx] + - setuptools + - swig + + host: + - python {{ python }} + - intel-openmp # [osx] + - numpy 1.11.* + - mkl >=2018 + + run: + - python {{ python }} + - intel-openmp # [osx] + - numpy >=1.11 + - blas=*=mkl + - mkl >=2018 + +build: + number: {{ FAISS_BUILD_NUMBER }} + +about: + home: https://github.com/facebookresearch/faiss + license: MIT + license_family: MIT + license_file: LICENSE + summary: A library for efficient similarity search and clustering of dense vectors. diff --git a/core/src/index/thirdparty/faiss/conda/faiss/run_test.py b/core/src/index/thirdparty/faiss/conda/faiss/run_test.py new file mode 100644 index 0000000000..57e6d7d92c --- /dev/null +++ b/core/src/index/thirdparty/faiss/conda/faiss/run_test.py @@ -0,0 +1,14 @@ +import faiss +import numpy as np + +d = 128 +# NOTE: BLAS kicks in only when n > distance_compute_blas_threshold = 20 +n = 100 + +rs = np.random.RandomState(1337) +x = rs.rand(n, d).astype(np.float32) + +index = faiss.IndexFlatL2(d) +index.add(x) + +D, I = index.search(x, 10) diff --git a/core/src/index/thirdparty/faiss/configure b/core/src/index/thirdparty/faiss/configure new file mode 100755 index 0000000000..ed40daefd9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/configure @@ -0,0 +1,7998 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.69 for faiss 1.0. +# +# +# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +# +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +## -------------------- ## +## M4sh Initialization. 
## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. 
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. + if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. Disable this feature. + alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. +fi +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in #( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. 
+ as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." + else + $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, +$0: including any error possibly output before this +$0: message. Then install a modern shell, or manually run +$0: the script under such a shell if you do have one." + fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. +CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. 
:-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +test -n "$DJDIR" || exec 7<&0 &1 + +# Name of the host. +# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIBOBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= + +# Identity of this package. +PACKAGE_NAME='faiss' +PACKAGE_TARNAME='faiss' +PACKAGE_VERSION='1.0' +PACKAGE_STRING='faiss 1.0' +PACKAGE_BUGREPORT='' +PACKAGE_URL='' + +ac_unique_file="Index.h" +# Factoring default headers for most tests. 
+ac_includes_default="\ +#include +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#ifdef HAVE_SYS_STAT_H +# include +#endif +#ifdef STDC_HEADERS +# include +# include +#else +# ifdef HAVE_STDLIB_H +# include +# endif +#endif +#ifdef HAVE_STRING_H +# if !defined STDC_HEADERS && defined HAVE_MEMORY_H +# include +# endif +# include +#endif +#ifdef HAVE_STRINGS_H +# include +#endif +#ifdef HAVE_INTTYPES_H +# include +#endif +#ifdef HAVE_STDINT_H +# include +#endif +#ifdef HAVE_UNISTD_H +# include +#endif" + +ac_header_list= +ac_subst_vars='LTLIBOBJS +ARCH_CXXFLAGS +ARCH_CPUFLAGS +target_os +target_vendor +target_cpu +target +LAPACK_LIBS +OPENMP_LDFLAGS +BLAS_LIBS +host_os +host_vendor +host_cpu +host +build_os +build_vendor +build_cpu +build +OPENMP_CXXFLAGS +LIBOBJS +CUDA_ARCH +CUDA_PREFIX +NVCC_LIBS +NVCC_LDFLAGS +NVCC_CPPFLAGS +EGREP +GREP +CXXCPP +NVCC +SWIG +NUMPY_INCLUDE +PYTHON_CFLAGS +PYTHON +MKDIR_P +SET_MAKE +CPP +ac_ct_CC +CFLAGS +CC +HAVE_CXX11 +OBJEXT +EXEEXT +ac_ct_CXX +CPPFLAGS +LDFLAGS +CXXFLAGS +CXX +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_URL +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +with_python +with_swig +with_cuda +with_cuda_arch +enable_openmp +with_blas +with_lapack +' + ac_precious_vars='build_alias +host_alias +target_alias +CXX +CXXFLAGS +LDFLAGS +LIBS +CPPFLAGS +CCC +CC +CFLAGS +CPP +CXXCPP' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. 
+ + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. 
+ with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. 
+ with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + 
site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. + with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? 
"missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. +for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. + ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. 
+# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures faiss 1.0 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. + +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. + +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root [DATAROOTDIR/doc/faiss] + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF + +System types: + --build=BUILD configure for building on BUILD [guessed] + --host=HOST cross-compile to build programs to run on HOST [BUILD] + --target=TARGET configure for building compilers for TARGET [HOST] +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of faiss 1.0:";; + esac + cat <<\_ACEOF + +Optional Features: + --disable-option-checking ignore unrecognized --enable/--with options + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --disable-openmp do not use OpenMP + +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-python= use Python binary + --with-swig= use SWIG binary + --with-cuda= prefix of the CUDA installation + 
--with-cuda-arch= + device specific -gencode flags + --with-blas= use BLAS library + --with-lapack= use LAPACK library + +Some influential environment variables: + CXX C++ compiler command + CXXFLAGS C++ compiler flags + LDFLAGS linker flags, e.g. -L if you have libraries in a + nonstandard directory + LIBS libraries to pass to the linker, e.g. -l + CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if + you have headers in a nonstandard directory + CC C compiler command + CFLAGS C compiler flags + CPP C preprocessor + CXXCPP C++ preprocessor + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. + +Report bugs to the package provider. +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. + for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +faiss configure 1.0 +generated by GNU Autoconf 2.69 + +Copyright (C) 2012 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. + +Copyright (c) Facebook, Inc. and its affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. ## +## ------------------------ ## + +# ac_fn_cxx_try_compile LINENO +# ---------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_cxx_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_compile + +# ac_fn_c_try_compile LINENO +# -------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_compile + +# ac_fn_c_try_cpp LINENO +# ---------------------- +# Try to preprocess conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_cpp () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_cpp conftest.$ac_ext" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } > conftest.i && { + test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || + test ! -s conftest.err + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_cpp + +# ac_fn_cxx_try_cpp LINENO +# ------------------------ +# Try to preprocess conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_cxx_try_cpp () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_cpp conftest.$ac_ext" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } > conftest.i && { + test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || + test ! -s conftest.err + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_cpp + +# ac_fn_cxx_check_header_mongrel LINENO HEADER VAR INCLUDES +# --------------------------------------------------------- +# Tests whether HEADER exists, giving a warning if it cannot be compiled using +# the include files in INCLUDES and setting the cache variable VAR +# accordingly. +ac_fn_cxx_check_header_mongrel () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if eval \${$3+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +else + # Is the header compilable? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 +$as_echo_n "checking $2 usability... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_header_compiler=yes +else + ac_header_compiler=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 +$as_echo "$ac_header_compiler" >&6; } + +# Is the header present? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 +$as_echo_n "checking $2 presence... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <$2> +_ACEOF +if ac_fn_cxx_try_cpp "$LINENO"; then : + ac_header_preproc=yes +else + ac_header_preproc=no +fi +rm -f conftest.err conftest.i conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 +$as_echo "$ac_header_preproc" >&6; } + +# So? What about this header? +case $ac_header_compiler:$ac_header_preproc:$ac_cxx_preproc_warn_flag in #(( + yes:no: ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5 +$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; + no:yes:* ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 +$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" 
>&5 +$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 +$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 +$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; +esac + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=\$ac_header_compiler" +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_cxx_check_header_mongrel + +# ac_fn_cxx_try_run LINENO +# ------------------------ +# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes +# that executables *can* be run. +ac_fn_cxx_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + ac_retval=0 +else + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_run + +# ac_fn_cxx_check_header_compile LINENO HEADER VAR INCLUDES +# --------------------------------------------------------- +# Tests whether HEADER exists and can be compiled using the include files in +# INCLUDES, setting the cache variable VAR accordingly. +ac_fn_cxx_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_cxx_check_header_compile + +# ac_fn_cxx_try_link LINENO +# ------------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_cxx_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_link + +# ac_fn_cxx_check_type LINENO TYPE VAR INCLUDES +# --------------------------------------------- +# Tests whether TYPE exists after having included INCLUDES, setting cache +# variable VAR accordingly. +ac_fn_cxx_check_type () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=no" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof ($2)) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof (($2))) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + eval "$3=yes" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_cxx_check_type + +# ac_fn_c_find_intX_t LINENO BITS VAR +# ----------------------------------- +# Finds a signed integer type with width BITS, setting cache variable VAR +# accordingly. +ac_fn_c_find_intX_t () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for int$2_t" >&5 +$as_echo_n "checking for int$2_t... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=no" + # Order is important - never check a type that is potentially smaller + # than half of the expected target width. + for ac_type in int$2_t 'int' 'long int' \ + 'long long int' 'short int' 'signed char'; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$ac_includes_default + enum { N = $2 / 2 - 1 }; +int +main () +{ +static int test_array [1 - 2 * !(0 < ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1))]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_includes_default + enum { N = $2 / 2 - 1 }; +int +main () +{ +static int test_array [1 - 2 * !(($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1) + < ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 2))]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + case $ac_type in #( + int$2_t) : + eval "$3=yes" ;; #( + *) : + eval "$3=\$ac_type" ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if eval test \"x\$"$3"\" = x"no"; then : + +else + break +fi + done +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_find_intX_t + +# ac_fn_c_find_uintX_t LINENO BITS VAR +# ------------------------------------ +# Finds an unsigned integer type with width BITS, setting cache variable VAR +# accordingly. +ac_fn_c_find_uintX_t () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uint$2_t" >&5 +$as_echo_n "checking for uint$2_t... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=no" + # Order is important - never check a type that is potentially smaller + # than half of the expected target width. + for ac_type in uint$2_t 'unsigned int' 'unsigned long int' \ + 'unsigned long long int' 'unsigned short int' 'unsigned char'; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_includes_default +int +main () +{ +static int test_array [1 - 2 * !((($ac_type) -1 >> ($2 / 2 - 1)) >> ($2 / 2 - 1) == 3)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + case $ac_type in #( + uint$2_t) : + eval "$3=yes" ;; #( + *) : + eval "$3=\$ac_type" ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if eval test \"x\$"$3"\" = x"no"; then : + +else + break +fi + done +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_find_uintX_t + +# ac_fn_cxx_check_func LINENO FUNC VAR +# ------------------------------------ +# Tests whether FUNC exists, setting the cache variable VAR accordingly +ac_fn_cxx_check_func () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +/* Define $2 to an innocuous variant, in case declares $2. + For example, HP-UX 11i declares gettimeofday. */ +#define $2 innocuous_$2 + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $2 (); below. + Prefer to if __STDC__ is defined, since + exists even on freestanding compilers. 
*/ + +#ifdef __STDC__ +# include +#else +# include +#endif + +#undef $2 + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $2 (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined __stub_$2 || defined __stub___$2 +choke me +#endif + +int +main () +{ +return $2 (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_cxx_check_func +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by faiss $as_me 1.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. ## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. +ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. 
+ else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. ## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. ## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. 
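+# For illustration only (this block is not emitted by the script): after the
+# appends below, confdefs.h starts out roughly as
+#
+#   /* confdefs.h */
+#   #define PACKAGE_NAME "faiss"
+#   #define PACKAGE_TARNAME "faiss"
+#   #define PACKAGE_VERSION "1.0"
+#   #define PACKAGE_STRING "faiss 1.0"
+#   #define PACKAGE_BUGREPORT ""
+#   #define PACKAGE_URL ""
+#
+# Later checks append further #define lines, and each conftest.$ac_ext test
+# program is built by prepending this file to the test source.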
+ +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. + case $CONFIG_SITE in #(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +as_fn_append ac_header_list " stdlib.h" +as_fn_append ac_header_list " unistd.h" +as_fn_append ac_header_list " sys/param.h" +# Check that the precious variables saved in the cache have kept the same +# value. +ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. 
+ ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + +ac_aux_dir= +for ac_dir in build-aux "$srcdir"/build-aux; do + if test -f "$ac_dir/install-sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f "$ac_dir/install.sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + elif test -f "$ac_dir/shtool"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/shtool install -c" + break + fi +done +if test -z "$ac_aux_dir"; then + as_fn_error $? "cannot find install-sh, install.sh, or shtool in build-aux \"$srcdir\"/build-aux" "$LINENO" 5 +fi + +# These three variables are undocumented and unsupported, +# and are intended to be withdrawn in a future Autoconf release. +# They can cause serious problems if a builder's source tree is in a directory +# whose full name contains unusual characters. +ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. +ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. +ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. + + + + +: ${CXXFLAGS="-g -O3 -Wall -Wextra"} + +# Checks for programs. 
+ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +if test -z "$CXX"; then + if test -n "$CCC"; then + CXX=$CCC + else + if test -n "$ac_tool_prefix"; then + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CXX"; then + ac_cv_prog_CXX="$CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CXX=$ac_cv_prog_CXX +if test -n "$CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +$as_echo "$CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CXX" && break + done +fi +if test -z "$CXX"; then + ac_ct_CXX=$CXX + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CXX"; then + ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CXX=$ac_cv_prog_ac_ct_CXX +if test -n "$ac_ct_CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 +$as_echo "$ac_ct_CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CXX" && break +done + + if test "x$ac_ct_CXX" = x; then + CXX="g++" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CXX=$ac_ct_CXX + fi +fi + + fi +fi +# Provide some information about the compiler. 
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C++ compiler works" >&5 +$as_echo_n "checking whether the C++ compiler works... " >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { { ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. +for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. 
+ break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi +if test -z "$ac_file"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "C++ compiler cannot create executables +See \`config.log' for more details" "$LINENO" 5; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler default output file name" >&5 +$as_echo_n "checking for C++ compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +ac_exeext=$ac_cv_exeext + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... " >&6; } +if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest conftest$ac_cv_exeext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... " >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot run C++ compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details" "$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if ${ac_cv_objext+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of object files: cannot compile +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 +$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } +if ${ac_cv_cxx_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_cxx_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 +$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GXX=yes +else + GXX= +fi +ac_test_CXXFLAGS=${CXXFLAGS+set} +ac_save_CXXFLAGS=$CXXFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 +$as_echo_n "checking whether $CXX accepts -g... 
" >&6; } +if ${ac_cv_prog_cxx_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_cxx_werror_flag=$ac_cxx_werror_flag + ac_cxx_werror_flag=yes + ac_cv_prog_cxx_g=no + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +else + CXXFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + ac_cxx_werror_flag=$ac_save_cxx_werror_flag + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cxx_werror_flag=$ac_save_cxx_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 +$as_echo "$ac_cv_prog_cxx_g" >&6; } +if test "$ac_test_CXXFLAGS" = set; then + CXXFLAGS=$ac_save_CXXFLAGS +elif test $ac_cv_prog_cxx_g = yes; then + if test "$GXX" = yes; then + CXXFLAGS="-g -O2" + else + CXXFLAGS="-g" + fi +else + if test "$GXX" = yes; then + CXXFLAGS="-O2" + else + CXXFLAGS= + fi +fi +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + + ax_cxx_compile_alternatives="11 0x" ax_cxx_compile_cxx11_required=true + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + ac_success=no + + + + if test x$ac_success = xno; then + for alternative in ${ax_cxx_compile_alternatives}; do + for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do + cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh` + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5 +$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; } +if eval \${$cachevar+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_CXX="$CXX" + CXX="$CXX $switch" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + +// If the compiler admits that it is not ready for C++11, why torture it? +// Hopefully, this will speed up the test. 
+ +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 201103L + +#error "This is not a C++11 compiler" + +#else + +namespace cxx11 +{ + + namespace test_static_assert + { + + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + } + + namespace test_final_override + { + + struct Base + { + virtual void f() {} + }; + + struct Derived : public Base + { + virtual void f() override {} + }; + + } + + namespace test_double_right_angle_brackets + { + + template < typename T > + struct check {}; + + typedef check single_type; + typedef check> double_type; + typedef check>> triple_type; + typedef check>>> quadruple_type; + + } + + namespace test_decltype + { + + int + f() + { + int a = 1; + decltype(a) b = 2; + return a + b; + } + + } + + namespace test_type_deduction + { + + template < typename T1, typename T2 > + struct is_same + { + static const bool value = false; + }; + + template < typename T > + struct is_same + { + static const bool value = true; + }; + + template < typename T1, typename T2 > + auto + add(T1 a1, T2 a2) -> decltype(a1 + a2) + { + return a1 + a2; + } + + int + test(const int c, volatile int v) + { + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == false, ""); + auto ac = c; + auto av = v; + auto sumi = ac + av + 'x'; + auto sumf = ac + av + 1.0; + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == true, ""); + return (sumf > 0.0) ? sumi : add(c, v); + } + + } + + namespace test_noexcept + { + + int f() { return 0; } + int g() noexcept { return 0; } + + static_assert(noexcept(f()) == false, ""); + static_assert(noexcept(g()) == true, ""); + + } + + namespace test_constexpr + { + + template < typename CharT > + unsigned long constexpr + strlen_c_r(const CharT *const s, const unsigned long acc) noexcept + { + return *s ? 
strlen_c_r(s + 1, acc + 1) : acc; + } + + template < typename CharT > + unsigned long constexpr + strlen_c(const CharT *const s) noexcept + { + return strlen_c_r(s, 0UL); + } + + static_assert(strlen_c("") == 0UL, ""); + static_assert(strlen_c("1") == 1UL, ""); + static_assert(strlen_c("example") == 7UL, ""); + static_assert(strlen_c("another\0example") == 7UL, ""); + + } + + namespace test_rvalue_references + { + + template < int N > + struct answer + { + static constexpr int value = N; + }; + + answer<1> f(int&) { return answer<1>(); } + answer<2> f(const int&) { return answer<2>(); } + answer<3> f(int&&) { return answer<3>(); } + + void + test() + { + int i = 0; + const int c = 0; + static_assert(decltype(f(i))::value == 1, ""); + static_assert(decltype(f(c))::value == 2, ""); + static_assert(decltype(f(0))::value == 3, ""); + } + + } + + namespace test_uniform_initialization + { + + struct test + { + static const int zero {}; + static const int one {1}; + }; + + static_assert(test::zero == 0, ""); + static_assert(test::one == 1, ""); + + } + + namespace test_lambdas + { + + void + test1() + { + auto lambda1 = [](){}; + auto lambda2 = lambda1; + lambda1(); + lambda2(); + } + + int + test2() + { + auto a = [](int i, int j){ return i + j; }(1, 2); + auto b = []() -> int { return '0'; }(); + auto c = [=](){ return a + b; }(); + auto d = [&](){ return c; }(); + auto e = [a, &b](int x) mutable { + const auto identity = [](int y){ return y; }; + for (auto i = 0; i < a; ++i) + a += b--; + return x + identity(a + b); + }(0); + return a + b + c + d + e; + } + + int + test3() + { + const auto nullary = [](){ return 0; }; + const auto unary = [](int x){ return x; }; + using nullary_t = decltype(nullary); + using unary_t = decltype(unary); + const auto higher1st = [](nullary_t f){ return f(); }; + const auto higher2nd = [unary](nullary_t f1){ + return [unary, f1](unary_t f2){ return f2(unary(f1())); }; + }; + return higher1st(nullary) + higher2nd(nullary)(unary); + } + + } + + namespace test_variadic_templates + { + + template + struct sum; + + template + struct sum + { + static constexpr auto value = N0 + sum::value; + }; + + template <> + struct sum<> + { + static constexpr auto value = 0; + }; + + static_assert(sum<>::value == 0, ""); + static_assert(sum<1>::value == 1, ""); + static_assert(sum<23>::value == 23, ""); + static_assert(sum<1, 2>::value == 3, ""); + static_assert(sum<5, 5, 11>::value == 21, ""); + static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); + + } + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function + // because of this. + namespace test_template_alias_sfinae + { + + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) 
{} + + template + void func(member*) {} + + void test(); + + void test() { func(0); } + + } + +} // namespace cxx11 + +#endif // __cplusplus >= 201103L + + + +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + eval $cachevar=yes +else + eval $cachevar=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CXX="$ac_save_CXX" +fi +eval ac_res=\$$cachevar + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + if eval test x\$$cachevar = xyes; then + CXX="$CXX $switch" + if test -n "$CXXCPP" ; then + CXXCPP="$CXXCPP $switch" + fi + ac_success=yes + break + fi + done + if test x$ac_success = xyes; then + break + fi + done + fi + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + if test x$ax_cxx_compile_cxx11_required = xtrue; then + if test x$ac_success = xno; then + as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5 + fi + fi + if test x$ac_success = xno; then + HAVE_CXX11=0 + { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5 +$as_echo "$as_me: No compiler with C++11 support was found" >&6;} + else + HAVE_CXX11=1 + +$as_echo "#define HAVE_CXX11 1" >>confdefs.h + + fi + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "no acceptable C compiler found in \$PATH +See \`config.log' for more details" "$LINENO" 5; } + +# Provide some information about the compiler. 
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if ${ac_cv_c_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... " >&6; } +if ${ac_cv_prog_cc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +else + CFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if ${ac_cv_prog_cc_c89+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +struct stat; +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. 
*/ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c89=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then : + +fi + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +$as_echo_n "checking how to run the C preprocessor... " >&6; } +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then + if ${ac_cv_prog_CPP+:} false; then : + $as_echo_n "(cached) " >&6 +else + # Double quotes because CPP needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + do + ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#ifdef __STDC__ +# include <limits.h> +#else +# include <assert.h> +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <ac_nonexistent.h> +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CPP=$CPP + +fi + CPP=$ac_cv_prog_CPP +else + ac_cv_prog_CPP=$CPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +$as_echo "$CPP" >&6; } +ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since + # <limits.h> exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include <limits.h> +#else +# include <assert.h> +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <ac_nonexistent.h> +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "C preprocessor \"$CPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } +set x ${MAKE-make} +ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat >conftest.make <<\_ACEOF +SHELL = /bin/sh +all: + @echo '@@@%%%=$(MAKE)=@@@%%%' +_ACEOF +# GNU make sometimes prints "make[1]: Entering ...", which would confuse us.
+case `${MAKE-make} -f conftest.make 2>/dev/null` in + *@@@%%%=?*=@@@%%%*) + eval ac_cv_prog_make_${ac_make}_set=yes;; + *) + eval ac_cv_prog_make_${ac_make}_set=no;; +esac +rm -f conftest.make +fi +if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + SET_MAKE= +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SET_MAKE="MAKE=${MAKE-make}" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5 +$as_echo_n "checking for a thread-safe mkdir -p... " >&6; } +if test -z "$MKDIR_P"; then + if ${ac_cv_path_mkdir+:} false; then : + $as_echo_n "(cached) " >&6 +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in mkdir gmkdir; do + for ac_exec_ext in '' $ac_executable_extensions; do + as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue + case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #( + 'mkdir (GNU coreutils) '* | \ + 'mkdir (coreutils) '* | \ + 'mkdir (fileutils) '4.1*) + ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext + break 3;; + esac + done + done + done +IFS=$as_save_IFS + +fi + + test -d ./--version && rmdir ./--version + if test "${ac_cv_path_mkdir+set}" = set; then + MKDIR_P="$ac_cv_path_mkdir -p" + else + # As a last resort, use the slow shell script. Don't cache a + # value for MKDIR_P within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the value is a relative name. + MKDIR_P="$ac_install_sh -d" + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5 +$as_echo "$MKDIR_P" >&6; } + + + + + +# Check whether --with-python was given. +if test "${with_python+set}" = set; then : + withval=$with_python; +fi + +case $with_python in + "") PYTHON_BIN=python ;; + *) PYTHON_BIN="$with_python" +esac + +# Extract the first word of "$PYTHON_BIN", so it can be a program name with args. +set dummy $PYTHON_BIN; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_PYTHON+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$PYTHON"; then + ac_cv_prog_PYTHON="$PYTHON" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_PYTHON="$PYTHON_BIN" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +PYTHON=$ac_cv_prog_PYTHON +if test -n "$PYTHON"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PYTHON" >&5 +$as_echo "$PYTHON" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fa_python_bin=$PYTHON + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Python C flags" >&5 +$as_echo_n "checking for Python C flags... 
" >&6; } +fa_python_cflags=`$PYTHON -c " +import sysconfig +paths = ['-I' + sysconfig.get_path(p) for p in ['include', 'platinclude']] +print(' '.join(paths))"` +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $fa_python_cflags" >&5 +$as_echo "$fa_python_cflags" >&6; } +PYTHON_CFLAGS="$PYTHON_CFLAGS $fa_python_cflags" + + + + +if test x$PYTHON != x; then + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for numpy headers path" >&5 +$as_echo_n "checking for numpy headers path... " >&6; } + +fa_numpy_headers=`$PYTHON -c "import numpy; print(numpy.get_include())"` + +if test $? == 0; then + if test x$fa_numpy_headers != x; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $fa_numpy_headers" >&5 +$as_echo "$fa_numpy_headers" >&6; } + NUMPY_INCLUDE=$fa_numpy_headers + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5 +$as_echo "not found" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You won't be able to build the python interface." >&5 +$as_echo "$as_me: WARNING: You won't be able to build the python interface." >&2;} + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5 +$as_echo "not found" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You won't be able to build the python interface." >&5 +$as_echo "$as_me: WARNING: You won't be able to build the python interface." >&2;} +fi + +fi + + + + +# Check whether --with-swig was given. +if test "${with_swig+set}" = set; then : + withval=$with_swig; +fi + +case $with_swig in + "") # Extract the first word of "swig", so it can be a program name with args. +set dummy swig; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_SWIG+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$SWIG"; then + ac_cv_prog_SWIG="$SWIG" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_SWIG="swig" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +SWIG=$ac_cv_prog_SWIG +if test -n "$SWIG"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIG" >&5 +$as_echo "$SWIG" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +;; + *) SWIG="$with_swig" +esac + + + + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C++ preprocessor" >&5 +$as_echo_n "checking how to run the C++ preprocessor... " >&6; } +if test -z "$CXXCPP"; then + if ${ac_cv_prog_CXXCPP+:} false; then : + $as_echo_n "(cached) " >&6 +else + # Double quotes because CXXCPP needs to be expanded + for CXXCPP in "$CXX -E" "/lib/cpp" + do + ac_preproc_ok=false +for ac_cxx_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. 
"Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_cxx_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_cxx_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CXXCPP=$CXXCPP + +fi + CXXCPP=$ac_cv_prog_CXXCPP +else + ac_cv_prog_CXXCPP=$CXXCPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXXCPP" >&5 +$as_echo "$CXXCPP" >&6; } +ac_preproc_ok=false +for ac_cxx_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_cxx_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_cxx_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "C++ preprocessor \"$CXXCPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... " >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. 
+ # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... 
" >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_cxx_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +# On IRIX 5.3, sys/types and inttypes.h are conflicting. +for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_cxx_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + + + + +# Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; +fi + + +# Check whether --with-cuda-arch was given. +if test "${with_cuda_arch+set}" = set; then : + withval=$with_cuda_arch; +else + with_cuda_arch=default +fi + + +if test x$with_cuda != xno; then + if test x$with_cuda != x; then + cuda_prefix=$with_cuda + # Extract the first word of "nvcc", so it can be a program name with args. +set dummy nvcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_NVCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$NVCC"; then + ac_cv_prog_NVCC="$NVCC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $cuda_prefix/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_NVCC="$cuda_prefix/bin/nvcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +NVCC=$ac_cv_prog_NVCC +if test -n "$NVCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NVCC" >&5 +$as_echo "$NVCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + NVCC_CPPFLAGS="-I$cuda_prefix/include" + NVCC_LDFLAGS="-L$cuda_prefix/lib64" + else + for ac_prog in nvcc /usr/local/cuda/bin/nvcc +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_NVCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$NVCC"; then + ac_cv_prog_NVCC="$NVCC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_NVCC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +NVCC=$ac_cv_prog_NVCC +if test -n "$NVCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NVCC" >&5 +$as_echo "$NVCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$NVCC" && break +done + + if test "x$NVCC" == "x/usr/local/cuda/bin/nvcc"; then + cuda_prefix="/usr/local/cuda" + NVCC_CPPFLAGS="-I$cuda_prefix/include" + NVCC_LDFLAGS="-L$cuda_prefix/lib64" + else + cuda_prefix="" + NVCC_CPPFLAGS="" + NVCC_LDFLAGS="" + fi + fi + + if test "x$NVCC" == x; then + as_fn_error $? "Couldn't find nvcc" "$LINENO" 5 + fi + + if test "x$with_cuda_arch" == xdefault; then + with_cuda_arch="-gencode=arch=compute_35,code=compute_35 \\ +-gencode=arch=compute_52,code=compute_52 \\ +-gencode=arch=compute_60,code=compute_60 \\ +-gencode=arch=compute_61,code=compute_61 \\ +-gencode=arch=compute_70,code=compute_70 \\ +-gencode=arch=compute_75,code=compute_75" + fi + + fa_save_CPPFLAGS="$CPPFLAGS" + fa_save_LDFLAGS="$LDFLAGS" + fa_save_LIBS="$LIBS" + + CPPFLAGS="$NVCC_CPPFLAGS $CPPFLAGS" + LDFLAGS="$NVCC_LDFLAGS $LDFLAGS" + + ac_fn_cxx_check_header_mongrel "$LINENO" "cuda.h" "ac_cv_header_cuda_h" "$ac_includes_default" +if test "x$ac_cv_header_cuda_h" = xyes; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Couldn't find cuda.h +See \`config.log' for more details" "$LINENO" 5; } +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for cublasAlloc in -lcublas" >&5 +$as_echo_n "checking for cublasAlloc in -lcublas... " >&6; } +if ${ac_cv_lib_cublas_cublasAlloc+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcublas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cublasAlloc (); +int +main () +{ +return cublasAlloc (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_cublas_cublasAlloc=yes +else + ac_cv_lib_cublas_cublasAlloc=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_cublas_cublasAlloc" >&5 +$as_echo "$ac_cv_lib_cublas_cublasAlloc" >&6; } +if test "x$ac_cv_lib_cublas_cublasAlloc" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBCUBLAS 1 +_ACEOF + + LIBS="-lcublas $LIBS" + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Couldn't find libcublas +See \`config.log' for more details" "$LINENO" 5; } +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for cudaSetDevice in -lcudart" >&5 +$as_echo_n "checking for cudaSetDevice in -lcudart... " >&6; } +if ${ac_cv_lib_cudart_cudaSetDevice+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcudart $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cudaSetDevice (); +int +main () +{ +return cudaSetDevice (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_cudart_cudaSetDevice=yes +else + ac_cv_lib_cudart_cudaSetDevice=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_cudart_cudaSetDevice" >&5 +$as_echo "$ac_cv_lib_cudart_cudaSetDevice" >&6; } +if test "x$ac_cv_lib_cudart_cudaSetDevice" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBCUDART 1 +_ACEOF + + LIBS="-lcudart $LIBS" + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Couldn't find libcudart +See \`config.log' for more details" "$LINENO" 5; } +fi + + + NVCC_LIBS="$LIBS" + NVCC_CPPFLAGS="$CPPFLAGS" + NVCC_LDFLAGS="$LDFLAGS" + CPPFLAGS="$fa_save_CPPFLAGS" + LDFLAGS="$fa_save_LDFLAGS" + LIBS="$fa_save_LIBS" +fi + + + + + +CUDA_PREFIX=$cuda_prefix + +CUDA_ARCH=$with_cuda_arch + + + + +# Checks for header files. +for ac_header in float.h limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_cxx_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + +# Checks for typedefs, structures, and compiler characteristics. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for stdbool.h that conforms to C99" >&5 +$as_echo_n "checking for stdbool.h that conforms to C99... " >&6; } +if ${ac_cv_header_stdbool_h+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + #include <stdbool.h> + #ifndef bool + "error: bool is not defined" + #endif + #ifndef false + "error: false is not defined" + #endif + #if false + "error: false is not 0" + #endif + #ifndef true + "error: true is not defined" + #endif + #if true != 1 + "error: true is not 1" + #endif + #ifndef __bool_true_false_are_defined + "error: __bool_true_false_are_defined is not defined" + #endif + + struct s { _Bool s: 1; _Bool t; } s; + + char a[true == 1 ? 1 : -1]; + char b[false == 0 ? 1 : -1]; + char c[__bool_true_false_are_defined == 1 ? 1 : -1]; + char d[(bool) 0.5 == true ? 1 : -1]; + /* See body of main program for 'e'. */ + char f[(_Bool) 0.0 == false ? 1 : -1]; + char g[true]; + char h[sizeof (_Bool)]; + char i[sizeof s.t]; + enum { j = false, k = true, l = false * true, m = true * 256 }; + /* The following fails for + HP aC++/ANSI C B3910B A.05.55 [Dec 04 2003]. */ + _Bool n[m]; + char o[sizeof n == m * sizeof n[0] ? 1 : -1]; + char p[-1 - (_Bool) 0 < 0 && -1 - (bool) 0 < 0 ? 1 : -1]; + /* Catch a bug in an HP-UX C compiler. See + http://gcc.gnu.org/ml/gcc-patches/2003-12/msg02303.html + http://lists.gnu.org/archive/html/bug-coreutils/2005-11/msg00161.html + */ + _Bool q = true; + _Bool *pq = &q; + +int +main () +{ + + bool e = &s; + *pq |= q; + *pq |= ! q; + /* Refer to every declared value, to avoid compiler optimizations. */ + return (!a + !b + !c + !d + !e + !f + !g + !h + !i + !!j + !k + !!l + + !m + !n + !o + !p + !q + !pq); + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_header_stdbool_h=yes +else + ac_cv_header_stdbool_h=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdbool_h" >&5 +$as_echo "$ac_cv_header_stdbool_h" >&6; } + ac_fn_cxx_check_type "$LINENO" "_Bool" "ac_cv_type__Bool" "$ac_includes_default" +if test "x$ac_cv_type__Bool" = xyes; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE__BOOL 1 +_ACEOF + + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inline" >&5 +$as_echo_n "checking for inline... " >&6; } +if ${ac_cv_c_inline+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_inline=no +for ac_kw in inline __inline__ __inline; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifndef __cplusplus +typedef int foo_t; +static $ac_kw foo_t static_foo () {return 0; } +$ac_kw foo_t foo () {return 0; } +#endif + +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_c_inline=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_inline" != no && break +done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_inline" >&5 +$as_echo "$ac_cv_c_inline" >&6; } + +case $ac_cv_c_inline in + inline | yes) ;; + *) + case $ac_cv_c_inline in + no) ac_val=;; + *) ac_val=$ac_cv_c_inline;; + esac + cat >>confdefs.h <<_ACEOF +#ifndef __cplusplus +#define inline $ac_val +#endif +_ACEOF + ;; +esac + +ac_fn_c_find_intX_t "$LINENO" "32" "ac_cv_c_int32_t" +case $ac_cv_c_int32_t in #( + no|yes) ;; #( + *) + +cat >>confdefs.h <<_ACEOF +#define int32_t $ac_cv_c_int32_t +_ACEOF +;; +esac + +ac_fn_c_find_intX_t "$LINENO" "64" "ac_cv_c_int64_t" +case $ac_cv_c_int64_t in #( + no|yes) ;; #( + *) + +cat >>confdefs.h <<_ACEOF +#define int64_t $ac_cv_c_int64_t +_ACEOF +;; +esac + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C/C++ restrict keyword" >&5 +$as_echo_n "checking for C/C++ restrict keyword...
" >&6; } +if ${ac_cv_c_restrict+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_restrict=no + # The order here caters to the fact that C++ does not require restrict. + for ac_kw in __restrict __restrict__ _Restrict restrict; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +typedef int * int_ptr; + int foo (int_ptr $ac_kw ip) { + return ip[0]; + } +int +main () +{ +int s[1]; + int * $ac_kw t = s; + t[0] = 0; + return foo(t) + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_c_restrict=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_restrict" != no && break + done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_restrict" >&5 +$as_echo "$ac_cv_c_restrict" >&6; } + + case $ac_cv_c_restrict in + restrict) ;; + no) $as_echo "#define restrict /**/" >>confdefs.h + ;; + *) cat >>confdefs.h <<_ACEOF +#define restrict $ac_cv_c_restrict +_ACEOF + ;; + esac + +ac_fn_cxx_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" +if test "x$ac_cv_type_size_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +#define size_t unsigned int +_ACEOF + +fi + +ac_fn_c_find_uintX_t "$LINENO" "16" "ac_cv_c_uint16_t" +case $ac_cv_c_uint16_t in #( + no|yes) ;; #( + *) + + +cat >>confdefs.h <<_ACEOF +#define uint16_t $ac_cv_c_uint16_t +_ACEOF +;; + esac + +ac_fn_c_find_uintX_t "$LINENO" "32" "ac_cv_c_uint32_t" +case $ac_cv_c_uint32_t in #( + no|yes) ;; #( + *) + +$as_echo "#define _UINT32_T 1" >>confdefs.h + + +cat >>confdefs.h <<_ACEOF +#define uint32_t $ac_cv_c_uint32_t +_ACEOF +;; + esac + +ac_fn_c_find_uintX_t "$LINENO" "64" "ac_cv_c_uint64_t" +case $ac_cv_c_uint64_t in #( + no|yes) ;; #( + *) + +$as_echo "#define _UINT64_T 1" >>confdefs.h + + +cat >>confdefs.h <<_ACEOF +#define uint64_t $ac_cv_c_uint64_t +_ACEOF +;; + esac + +ac_fn_c_find_uintX_t "$LINENO" "8" "ac_cv_c_uint8_t" +case $ac_cv_c_uint8_t in #( + no|yes) ;; #( + *) + +$as_echo "#define _UINT8_T 1" >>confdefs.h + + +cat >>confdefs.h <<_ACEOF +#define uint8_t $ac_cv_c_uint8_t +_ACEOF +;; + esac + + +# Checks for library functions. +for ac_header in stdlib.h +do : + ac_fn_cxx_check_header_mongrel "$LINENO" "stdlib.h" "ac_cv_header_stdlib_h" "$ac_includes_default" +if test "x$ac_cv_header_stdlib_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_STDLIB_H 1 +_ACEOF + +fi + +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU libc compatible malloc" >&5 +$as_echo_n "checking for GNU libc compatible malloc... " >&6; } +if ${ac_cv_func_malloc_0_nonnull+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ac_cv_func_malloc_0_nonnull=no +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#if defined STDC_HEADERS || defined HAVE_STDLIB_H +# include +#else +char *malloc (); +#endif + +int +main () +{ +return ! 
malloc (0); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_run "$LINENO"; then : + ac_cv_func_malloc_0_nonnull=yes +else + ac_cv_func_malloc_0_nonnull=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_malloc_0_nonnull" >&5 +$as_echo "$ac_cv_func_malloc_0_nonnull" >&6; } +if test $ac_cv_func_malloc_0_nonnull = yes; then : + +$as_echo "#define HAVE_MALLOC 1" >>confdefs.h + +else + $as_echo "#define HAVE_MALLOC 0" >>confdefs.h + + case " $LIBOBJS " in + *" malloc.$ac_objext "* ) ;; + *) LIBOBJS="$LIBOBJS malloc.$ac_objext" + ;; +esac + + +$as_echo "#define malloc rpl_malloc" >>confdefs.h + +fi + + + + + + for ac_header in $ac_header_list +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_cxx_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + + + + + + + +for ac_func in getpagesize +do : + ac_fn_cxx_check_func "$LINENO" "getpagesize" "ac_cv_func_getpagesize" +if test "x$ac_cv_func_getpagesize" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_GETPAGESIZE 1 +_ACEOF + +fi +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working mmap" >&5 +$as_echo_n "checking for working mmap... " >&6; } +if ${ac_cv_func_mmap_fixed_mapped+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ac_cv_func_mmap_fixed_mapped=no +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_includes_default +/* malloc might have been renamed as rpl_malloc. */ +#undef malloc + +/* Thanks to Mike Haertel and Jim Avera for this test. + Here is a matrix of mmap possibilities: + mmap private not fixed + mmap private fixed at somewhere currently unmapped + mmap private fixed at somewhere already mapped + mmap shared not fixed + mmap shared fixed at somewhere currently unmapped + mmap shared fixed at somewhere already mapped + For private mappings, we should verify that changes cannot be read() + back from the file, nor mmap's back from the file at a different + address. (There have been systems where private was not correctly + implemented like the infamous i386 svr4.0, and systems where the + VM page cache was not coherent with the file system buffer cache + like early versions of FreeBSD and possibly contemporary NetBSD.) + For shared mappings, we should conversely verify that changes get + propagated back to all the places they're supposed to be. + + Grep wants private fixed already mapped. + The main things grep needs to know about mmap are: + * does it exist and is it safe to write into the mmap'd area + * how to use it (BSD variants) */ + +#include <fcntl.h> +#include <sys/mman.h> + +#if !defined STDC_HEADERS && !defined HAVE_STDLIB_H +char *malloc (); +#endif + +/* This mess was copied from the GNU getpagesize.h.
*/ +#ifndef HAVE_GETPAGESIZE +# ifdef _SC_PAGESIZE +# define getpagesize() sysconf(_SC_PAGESIZE) +# else /* no _SC_PAGESIZE */ +# ifdef HAVE_SYS_PARAM_H +# include <sys/param.h> +# ifdef EXEC_PAGESIZE +# define getpagesize() EXEC_PAGESIZE +# else /* no EXEC_PAGESIZE */ +# ifdef NBPG +# define getpagesize() NBPG * CLSIZE +# ifndef CLSIZE +# define CLSIZE 1 +# endif /* no CLSIZE */ +# else /* no NBPG */ +# ifdef NBPC +# define getpagesize() NBPC +# else /* no NBPC */ +# ifdef PAGESIZE +# define getpagesize() PAGESIZE +# endif /* PAGESIZE */ +# endif /* no NBPC */ +# endif /* no NBPG */ +# endif /* no EXEC_PAGESIZE */ +# else /* no HAVE_SYS_PARAM_H */ +# define getpagesize() 8192 /* punt totally */ +# endif /* no HAVE_SYS_PARAM_H */ +# endif /* no _SC_PAGESIZE */ + +#endif /* no HAVE_GETPAGESIZE */ + +int +main () +{ + char *data, *data2, *data3; + const char *cdata2; + int i, pagesize; + int fd, fd2; + + pagesize = getpagesize (); + + /* First, make a file with some known garbage in it. */ + data = (char *) malloc (pagesize); + if (!data) + return 1; + for (i = 0; i < pagesize; ++i) + *(data + i) = rand (); + umask (0); + fd = creat ("conftest.mmap", 0600); + if (fd < 0) + return 2; + if (write (fd, data, pagesize) != pagesize) + return 3; + close (fd); + + /* Next, check that the tail of a page is zero-filled. File must have + non-zero length, otherwise we risk SIGBUS for entire page. */ + fd2 = open ("conftest.txt", O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd2 < 0) + return 4; + cdata2 = ""; + if (write (fd2, cdata2, 1) != 1) + return 5; + data2 = (char *) mmap (0, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd2, 0L); + if (data2 == MAP_FAILED) + return 6; + for (i = 0; i < pagesize; ++i) + if (*(data2 + i)) + return 7; + close (fd2); + if (munmap (data2, pagesize)) + return 8; + + /* Next, try to mmap the file at a fixed address which already has + something else allocated at it. If we can, also make sure that + we see the same garbage. */ + fd = open ("conftest.mmap", O_RDWR); + if (fd < 0) + return 9; + if (data2 != mmap (data2, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FIXED, fd, 0L)) + return 10; + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data2 + i)) + return 11; + + /* Finally, make sure that changes to the mapped area do not + percolate back to the file as seen by read(). (This is a bug on + some variants of i386 svr4.0.)
*/ + for (i = 0; i < pagesize; ++i) + *(data2 + i) = *(data2 + i) + 1; + data3 = (char *) malloc (pagesize); + if (!data3) + return 12; + if (read (fd, data3, pagesize) != pagesize) + return 13; + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data3 + i)) + return 14; + close (fd); + return 0; +} +_ACEOF +if ac_fn_cxx_try_run "$LINENO"; then : + ac_cv_func_mmap_fixed_mapped=yes +else + ac_cv_func_mmap_fixed_mapped=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_mmap_fixed_mapped" >&5 +$as_echo "$ac_cv_func_mmap_fixed_mapped" >&6; } +if test $ac_cv_func_mmap_fixed_mapped = yes; then + +$as_echo "#define HAVE_MMAP 1" >>confdefs.h + +fi +rm -f conftest.mmap conftest.txt + +for ac_func in clock_gettime floor gettimeofday memmove memset munmap pow sqrt strerror strstr +do : + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_cxx_check_func "$LINENO" "$ac_func" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + + + + OPENMP_CXXFLAGS= + # Check whether --enable-openmp was given. +if test "${enable_openmp+set}" = set; then : + enableval=$enable_openmp; +fi + + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CXX option to support OpenMP" >&5 +$as_echo_n "checking for $CXX option to support OpenMP... " >&6; } +if ${ac_cv_prog_cxx_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifndef _OPENMP + choke me +#endif +#include <omp.h> +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_prog_cxx_openmp='none needed' +else + ac_cv_prog_cxx_openmp='unsupported' + for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \ + -Popenmp --openmp; do + ac_save_CXXFLAGS=$CXXFLAGS + CXXFLAGS="$CXXFLAGS $ac_option" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifndef _OPENMP + choke me +#endif +#include <omp.h> +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_prog_cxx_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + CXXFLAGS=$ac_save_CXXFLAGS + if test "$ac_cv_prog_cxx_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_openmp" >&5 +$as_echo "$ac_cv_prog_cxx_openmp" >&6; } + case $ac_cv_prog_cxx_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_CXXFLAGS=$ac_cv_prog_cxx_openmp ;; + esac + fi + + + +# Make sure we can run config.sub. +$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +$as_echo_n "checking build system type... " >&6; } +if ${ac_cv_build+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_build_alias=$build_alias +test "x$ac_build_alias" = x && + ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` +test "x$ac_build_alias" = x && + as_fn_error $?
"cannot guess build type; you must specify one" "$LINENO" 5 +ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +$as_echo "$ac_cv_build" >&6; } +case $ac_cv_build in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; +esac +build=$ac_cv_build +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_build +shift +build_cpu=$1 +build_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +build_os=$* +IFS=$ac_save_IFS +case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +$as_echo_n "checking host system type... " >&6; } +if ${ac_cv_host+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$host_alias" = x; then + ac_cv_host=$ac_cv_build +else + ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +$as_echo "$ac_cv_host" >&6; } +case $ac_cv_host in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;; +esac +host=$ac_cv_host +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_host +shift +host_cpu=$1 +host_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +host_os=$* +IFS=$ac_save_IFS +case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac + + + + +# AC_REQUIRE([AC_F77_LIBRARY_LDFLAGS]) + +ax_blas_ok=no + + +# Check whether --with-blas was given. +if test "${with_blas+set}" = set; then : + withval=$with_blas; +fi + +case $with_blas in + yes | "") ;; + no) ax_blas_ok=disable ;; + -* | */* | *.a | *.so | *.so.* | *.o) BLAS_LIBS="$with_blas" ;; + *) BLAS_LIBS="-l$with_blas" ;; +esac + +OPENMP_LDFLAGS="$OPENMP_CXXFLAGS" + +# Get fortran linker names of BLAS functions to check for. +# AC_F77_FUNC(sgemm) +# AC_F77_FUNC(dgemm) +sgemm=sgemm_ +dgemm=dgemm_ + +ax_blas_save_LIBS="$LIBS" +LIBS="$LIBS $FLIBS" + +# First, check BLAS_LIBS environment variable +if test $ax_blas_ok = no; then +if test "x$BLAS_LIBS" != x; then + save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in $BLAS_LIBS" >&5 +$as_echo_n "checking for $sgemm in $BLAS_LIBS... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ax_blas_ok=yes +else + BLAS_LIBS="" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_blas_ok" >&5 +$as_echo "$ax_blas_ok" >&6; } + LIBS="$save_LIBS" +fi +fi + +# BLAS linked to by default? (happens on some supercomputers) +if test $ax_blas_ok = no; then + save_LIBS="$LIBS"; LIBS="$LIBS" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $sgemm is being linked in already" >&5 +$as_echo_n "checking if $sgemm is being linked in already... 
" >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ax_blas_ok=yes +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_blas_ok" >&5 +$as_echo "$ax_blas_ok" >&6; } + LIBS="$save_LIBS" +fi + +# BLAS in Intel MKL library? +if test $ax_blas_ok = no; then + case $host_os in + darwin*) + as_ac_Lib=`$as_echo "ac_cv_lib_mkl_intel_lp64_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lmkl_intel_lp64" >&5 +$as_echo_n "checking for $sgemm in -lmkl_intel_lp64... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lmkl_intel_lp64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread"; OPENMP_LDFLAGS="" +fi + + ;; + *) + if test $host_cpu = x86_64; then + as_ac_Lib=`$as_echo "ac_cv_lib_mkl_intel_lp64_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lmkl_intel_lp64" >&5 +$as_echo_n "checking for $sgemm in -lmkl_intel_lp64... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lmkl_intel_lp64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl" +fi + + elif test $host_cpu = i686; then + as_ac_Lib=`$as_echo "ac_cv_lib_mkl_intel_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lmkl_intel" >&5 +$as_echo_n "checking for $sgemm in -lmkl_intel... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lmkl_intel -lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl" +fi + + fi + ;; + esac +fi +# Old versions of MKL +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_mkl_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lmkl" >&5 +$as_echo_n "checking for $sgemm in -lmkl... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lmkl -lguide -lpthread $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-lmkl -lguide -lpthread" +fi + +fi + +# BLAS in OpenBLAS library? (http://xianyi.github.com/OpenBLAS/) +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_openblas_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lopenblas" >&5 +$as_echo_n "checking for $sgemm in -lopenblas... 
" >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lopenblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes + BLAS_LIBS="-lopenblas" +fi + +fi + +# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) +if test $ax_blas_ok = no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ATL_xerbla in -latlas" >&5 +$as_echo_n "checking for ATL_xerbla in -latlas... " >&6; } +if ${ac_cv_lib_atlas_ATL_xerbla+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-latlas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char ATL_xerbla (); +int +main () +{ +return ATL_xerbla (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_atlas_ATL_xerbla=yes +else + ac_cv_lib_atlas_ATL_xerbla=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_atlas_ATL_xerbla" >&5 +$as_echo "$ac_cv_lib_atlas_ATL_xerbla" >&6; } +if test "x$ac_cv_lib_atlas_ATL_xerbla" = xyes; then : + as_ac_Lib=`$as_echo "ac_cv_lib_f77blas_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lf77blas" >&5 +$as_echo_n "checking for $sgemm in -lf77blas... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lf77blas -latlas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for cblas_dgemm in -lcblas" >&5 +$as_echo_n "checking for cblas_dgemm in -lcblas... 
" >&6; } +if ${ac_cv_lib_cblas_cblas_dgemm+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcblas -lf77blas -latlas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cblas_dgemm (); +int +main () +{ +return cblas_dgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_cblas_cblas_dgemm=yes +else + ac_cv_lib_cblas_cblas_dgemm=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_cblas_cblas_dgemm" >&5 +$as_echo "$ac_cv_lib_cblas_cblas_dgemm" >&6; } +if test "x$ac_cv_lib_cblas_cblas_dgemm" = xyes; then : + ax_blas_ok=yes + BLAS_LIBS="-lcblas -lf77blas -latlas" +fi + +fi + +fi + +fi + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_blas_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lblas" >&5 +$as_echo_n "checking for $sgemm in -lblas... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + as_ac_Lib=`$as_echo "ac_cv_lib_dgemm_$dgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $dgemm in -ldgemm" >&5 +$as_echo_n "checking for $dgemm in -ldgemm... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ldgemm -lblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $dgemm (); +int +main () +{ +return $dgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + as_ac_Lib=`$as_echo "ac_cv_lib_sgemm_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lsgemm" >&5 +$as_echo_n "checking for $sgemm in -lsgemm... 
" >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lsgemm -lblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes; BLAS_LIBS="-lsgemm -ldgemm -lblas" +fi + +fi + +fi + +fi + +# BLAS in Apple vecLib library? +if test $ax_blas_ok = no; then + save_LIBS="$LIBS"; LIBS="-framework vecLib $LIBS" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -framework vecLib" >&5 +$as_echo_n "checking for $sgemm in -framework vecLib... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ax_blas_ok=yes;BLAS_LIBS="-framework vecLib" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_blas_ok" >&5 +$as_echo "$ax_blas_ok" >&6; } + LIBS="$save_LIBS" +fi + +# BLAS in Alpha CXML library? +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_cxml_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lcxml" >&5 +$as_echo_n "checking for $sgemm in -lcxml... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcxml $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-lcxml" +fi + +fi + +# BLAS in Alpha DXML library? (now called CXML, see above) +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_dxml_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -ldxml" >&5 +$as_echo_n "checking for $sgemm in -ldxml... 
" >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ldxml $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes;BLAS_LIBS="-ldxml" +fi + +fi + +# BLAS in Sun Performance library? +if test $ax_blas_ok = no; then + if test "x$GCC" != xyes; then # only works with Sun CC + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for acosp in -lsunmath" >&5 +$as_echo_n "checking for acosp in -lsunmath... " >&6; } +if ${ac_cv_lib_sunmath_acosp+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lsunmath $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char acosp (); +int +main () +{ +return acosp (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_sunmath_acosp=yes +else + ac_cv_lib_sunmath_acosp=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_sunmath_acosp" >&5 +$as_echo "$ac_cv_lib_sunmath_acosp" >&6; } +if test "x$ac_cv_lib_sunmath_acosp" = xyes; then : + as_ac_Lib=`$as_echo "ac_cv_lib_sunperf_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lsunperf" >&5 +$as_echo_n "checking for $sgemm in -lsunperf... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lsunperf -lsunmath $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + BLAS_LIBS="-xlic_lib=sunperf -lsunmath" + ax_blas_ok=yes +fi + +fi + + fi +fi + +# BLAS in SCSL library? 
(SGI/Cray Scientific Library) +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_scs_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lscs" >&5 +$as_echo_n "checking for $sgemm in -lscs... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lscs $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes; BLAS_LIBS="-lscs" +fi + +fi + +# BLAS in SGIMATH library? +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_complib.sgimath_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lcomplib.sgimath" >&5 +$as_echo_n "checking for $sgemm in -lcomplib.sgimath... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcomplib.sgimath $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes; BLAS_LIBS="-lcomplib.sgimath" +fi + +fi + +# BLAS in IBM ESSL library? (requires generic BLAS lib, too) +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_blas_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lblas" >&5 +$as_echo_n "checking for $sgemm in -lblas... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + as_ac_Lib=`$as_echo "ac_cv_lib_essl_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lessl" >&5 +$as_echo_n "checking for $sgemm in -lessl... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lessl -lblas $FLIBS $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes; BLAS_LIBS="-lessl -lblas" +fi + +fi + +fi + +# Generic BLAS library? +if test $ax_blas_ok = no; then + as_ac_Lib=`$as_echo "ac_cv_lib_blas_$sgemm" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $sgemm in -lblas" >&5 +$as_echo_n "checking for $sgemm in -lblas... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lblas $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $sgemm (); +int +main () +{ +return $sgemm (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_blas_ok=yes; BLAS_LIBS="-lblas" +fi + +fi + + + + +LIBS="$ax_blas_save_LIBS" + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$ax_blas_ok" = xyes; then + +$as_echo "#define HAVE_BLAS 1" >>confdefs.h + + : +else + ax_blas_ok=no + +fi + +if test "x$ax_blas_ok" == "xno"; then + as_fn_error $? "An implementation of BLAS is required but none was found." "$LINENO" 5 +fi + + + +ax_lapack_ok=no + + +# Check whether --with-lapack was given. 
+if test "${with_lapack+set}" = set; then : + withval=$with_lapack; +fi + +case $with_lapack in + yes | "") ;; + no) ax_lapack_ok=disable ;; + -* | */* | *.a | *.so | *.so.* | *.o) LAPACK_LIBS="$with_lapack" ;; + *) LAPACK_LIBS="-l$with_lapack" ;; +esac + +# Get fortran linker name of LAPACK function to check for. +# AC_F77_FUNC(cheev) +cheev=cheev_ + +# We cannot use LAPACK if BLAS is not found +if test "x$ax_blas_ok" != xyes; then + ax_lapack_ok=noblas + LAPACK_LIBS="" +fi + +# First, check LAPACK_LIBS environment variable +if test "x$LAPACK_LIBS" != x; then + save_LIBS="$LIBS"; LIBS="$LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $cheev in $LAPACK_LIBS" >&5 +$as_echo_n "checking for $cheev in $LAPACK_LIBS... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $cheev (); +int +main () +{ +return $cheev (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ax_lapack_ok=yes +else + LAPACK_LIBS="" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_lapack_ok" >&5 +$as_echo "$ax_lapack_ok" >&6; } + LIBS="$save_LIBS" + if test $ax_lapack_ok = no; then + LAPACK_LIBS="" + fi +fi + +# LAPACK linked to by default? (is sometimes included in BLAS lib) +if test $ax_lapack_ok = no; then + save_LIBS="$LIBS"; LIBS="$LIBS $BLAS_LIBS $FLIBS" + as_ac_var=`$as_echo "ac_cv_func_$cheev" | $as_tr_sh` +ac_fn_cxx_check_func "$LINENO" "$cheev" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : + ax_lapack_ok=yes +fi + + LIBS="$save_LIBS" +fi + +# Generic LAPACK library? +for lapack in lapack lapack_rs6k; do + if test $ax_lapack_ok = no; then + save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS" + as_ac_Lib=`$as_echo "ac_cv_lib_$lapack''_$cheev" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $cheev in -l$lapack" >&5 +$as_echo_n "checking for $cheev in -l$lapack... " >&6; } +if eval \${$as_ac_Lib+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-l$lapack $FLIBS $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $cheev (); +int +main () +{ +return $cheev (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + eval "$as_ac_Lib=yes" +else + eval "$as_ac_Lib=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +eval ac_res=\$$as_ac_Lib + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_Lib"\" = x"yes"; then : + ax_lapack_ok=yes; LAPACK_LIBS="-l$lapack" +fi + + LIBS="$save_LIBS" + fi +done + + + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$ax_lapack_ok" = xyes; then + +$as_echo "#define HAVE_LAPACK 1" >>confdefs.h + + : +else + ax_lapack_ok=no + +fi + +if test "x$ax_lapack_ok" == "xno"; then + as_fn_error $? "An implementation of LAPACK is required but none was found." 
"$LINENO" 5 +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5 +$as_echo_n "checking target system type... " >&6; } +if ${ac_cv_target+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$target_alias" = x; then + ac_cv_target=$ac_cv_host +else + ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5 +$as_echo "$ac_cv_target" >&6; } +case $ac_cv_target in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical target" "$LINENO" 5;; +esac +target=$ac_cv_target +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_target +shift +target_cpu=$1 +target_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +target_os=$* +IFS=$ac_save_IFS +case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac + + +# The aliases save the names the user supplied, while $host etc. +# will get canonicalized. +test -n "$target_alias" && + test "$program_prefix$program_suffix$program_transform_name" = \ + NONENONEs,x,x, && + program_prefix=${target_alias}- + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for cpu arch" >&5 +$as_echo_n "checking for cpu arch... " >&6; } + + + + case $target in + amd64-* | x86_64-*) + ARCH_CPUFLAGS="-mpopcnt -msse4" + ARCH_CXXFLAGS="-m64" + ;; + aarch64*-*) + ARCH_CPUFLAGS="-march=armv8.2-a" + ;; + *) ;; + esac + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $target CPUFLAGS+=\"$ARCH_CPUFLAGS\" CXXFLAGS+=\"$ARCH_CXXFLAGS\"" >&5 +$as_echo "$target CPUFLAGS+=\"$ARCH_CPUFLAGS\" CXXFLAGS+=\"$ARCH_CXXFLAGS\"" >&6; } + + + + + + +ac_config_files="$ac_config_files makefile.inc" + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes: double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \. 
+ sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + if test "x$cache_file" != "x/dev/null"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + if test ! -f "$cache_file" || test -h "$cache_file"; then + cat confcache >"$cache_file" + else + case $cache_file in #( + */* | ?:*) + mv -f confcache "$cache_file"$$ && + mv -f "$cache_file"$$ "$cache_file" ;; #( + *) + mv -f confcache "$cache_file" ;; + esac + fi + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +# +# If the first sed substitution is executed (which looks for macros that +# take arguments), then branch to the quote section. Otherwise, +# look for a macro that doesn't take arguments. +ac_script=' +:mline +/\\$/{ + N + s,\\\n,, + b mline +} +t clear +:clear +s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g +t quote +s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g +t quote +b any +:quote +s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g +s/\[/\\&/g +s/\]/\\&/g +s/\$/$$/g +H +:any +${ + g + s/^\n// + s/\n/ /g + p +} +' +DEFS=`sed -n "$ac_script" confdefs.h` + + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. + as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. 
## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. 
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... 
but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by faiss $as_me 1.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... 
+ + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + +Configuration files: +$config_files + +Report bugs to the package provider." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +faiss config.status 1.0 +configured by $0, generated by GNU Autoconf 2.69, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='$ac_pwd' +srcdir='$srcdir' +MKDIR_P='$MKDIR_P' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h | --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." ;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "makefile.inc") CONFIG_FILES="$CONFIG_FILES makefile.inc" ;; + + *) as_fn_error $? 
"invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. +if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! 
" + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + + +eval set X " :F $CONFIG_FILES " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. 
$configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + + ac_MKDIR_P=$MKDIR_P + case $MKDIR_P in + [\\/$]* | ?:[\\/]* ) ;; + */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;; + esac +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. +ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. +# FIXME: do we really want to maintain this feature? 
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +s&@MKDIR_P@&$ac_MKDIR_P&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + + + + esac + +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. +# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. + $ac_cs_success || as_fn_exit 1 +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi + diff --git a/core/src/index/thirdparty/faiss/configure.ac b/core/src/index/thirdparty/faiss/configure.ac new file mode 100644 index 0000000000..31b587b86d --- /dev/null +++ b/core/src/index/thirdparty/faiss/configure.ac @@ -0,0 +1,70 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. + +AC_PREREQ([2.69]) +AC_INIT([faiss], [1.0]) +AC_COPYRIGHT([Copyright (c) Facebook, Inc. and its affiliates. 
+ +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree.]) +AC_CONFIG_SRCDIR([Index.h]) +AC_CONFIG_AUX_DIR([build-aux]) +AC_CONFIG_MACRO_DIR([acinclude]) + +: ${CXXFLAGS="-g -O3 -Wall -Wextra"} + +# Checks for programs. +AC_LANG(C++) +AC_PROG_CXX +AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory]) +AC_PROG_CPP +AC_PROG_MAKE_SET +AC_PROG_MKDIR_P + +FA_PYTHON + +if test x$PYTHON != x; then + FA_NUMPY +fi + +FA_PROG_SWIG + +FA_CHECK_CUDA + + +# Checks for header files. +AC_CHECK_HEADERS([float.h limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_CHECK_HEADER_STDBOOL +AC_C_INLINE +AC_TYPE_INT32_T +AC_TYPE_INT64_T +AC_C_RESTRICT +AC_TYPE_SIZE_T +AC_TYPE_UINT16_T +AC_TYPE_UINT32_T +AC_TYPE_UINT64_T +AC_TYPE_UINT8_T + +# Checks for library functions. +AC_FUNC_MALLOC +AC_FUNC_MMAP +AC_CHECK_FUNCS([clock_gettime floor gettimeofday memmove memset munmap pow sqrt strerror strstr]) + +AC_OPENMP + +AX_BLAS +if test "x$ax_blas_ok" == "xno"; then + AC_MSG_ERROR([An implementation of BLAS is required but none was found.]) +fi + +AX_LAPACK +if test "x$ax_lapack_ok" == "xno"; then + AC_MSG_ERROR([An implementation of LAPACK is required but none was found.]) +fi + +AX_CPU_ARCH + +AC_CONFIG_FILES([makefile.inc]) +AC_OUTPUT diff --git a/core/src/index/thirdparty/faiss/demos/Makefile b/core/src/index/thirdparty/faiss/demos/Makefile new file mode 100644 index 0000000000..9d871697a9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/Makefile @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include ../makefile.inc + +DEMOS_SRC=$(wildcard demo_*.cpp) +DEMOS=$(DEMOS_SRC:.cpp=) + + +all: $(DEMOS) + +clean: + rm -f $(DEMOS) + +%: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ $^ $(LDFLAGS) $(LIBS) -lfaiss + + +.PHONY: all clean diff --git a/core/src/index/thirdparty/faiss/demos/README.md b/core/src/index/thirdparty/faiss/demos/README.md new file mode 100644 index 0000000000..71a23f272e --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/README.md @@ -0,0 +1,28 @@ + + +Demos for a few Faiss functionalities +===================================== + + +demo_auto_tune.py +----------------- + +Demonstrates the auto-tuning functionality of Faiss + + +demo_ondisk_ivf.py +------------------ + +Shows how to construct a Faiss index that stores the inverted file +data on disk, eg. when it does not fit in RAM. The script works on a +small dataset (sift1M) for demonstration and proceeds in stages: + +0: train on the dataset + +1-4: build 4 indexes, each containing 1/4 of the dataset. This can be +done in parallel on several machines + +5: merge the 4 indexes into one that is written directly to disk +(needs not to fit in RAM) + +6: load and test the index diff --git a/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py b/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py new file mode 100644 index 0000000000..3eb6421019 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py @@ -0,0 +1,175 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
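A minimal usage sketch for the build files and demo scripts added above, under stated assumptions: the openblas/lapack names passed to configure are placeholders for whatever BLAS/LAPACK pair is installed (LAPACK_LIBS, and in the same way BLAS_LIBS, can instead be set in the environment, as the generated checks above show), libfaiss is assumed to be already built and visible to the linker, and the ANN_SIFT1M files are assumed to be extracted into a sift1M/ subdirectory of the working directory, which is where the demo scripts look for them.

    # Generate makefile.inc; configure aborts if no BLAS or no LAPACK can be linked.
    ./configure --with-blas=openblas --with-lapack=lapack

    # Build every demo_*.cpp through the pattern rule in demos/Makefile
    # (each demo links with -lfaiss, so the library itself must already be built).
    make -C demos

    # Run the on-disk IVF demo stage by stage, as described in demos/README.md.
    python2 demos/demo_ondisk_ivf.py 0        # stage 0: train on sift1M/sift_learn.fvecs
    for i in 1 2 3 4; do
        python2 demos/demo_ondisk_ivf.py $i   # stages 1-4: add a quarter of the base set each
    done
    python2 demos/demo_ondisk_ivf.py 5        # stage 5: merge the four blocks onto disk
    python2 demos/demo_ondisk_ivf.py 6        # stage 6: search the merged index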
+ +#!/usr/bin/env python2 + +from __future__ import print_function +import os +import time +import numpy as np + +try: + import matplotlib + matplotlib.use('Agg') + from matplotlib import pyplot + graphical_output = True +except ImportError: + graphical_output = False + +import faiss + +################################################################# +# Small I/O functions +################################################################# + +def ivecs_read(fname): + f = open(fname) + d, = np.fromfile(f, count = 1, dtype = 'int32') + sz = os.stat(fname).st_size + assert sz % (4 * (d + 1)) == 0 + n = sz / (4 * (d + 1)) + f.seek(0) + a = np.fromfile(f, count = n * (d +1), dtype = 'int32').reshape(n, d + 1) + return a[:, 1:].copy() + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +def plot_OperatingPoints(ops, nq, **kwargs): + ops = ops.optimal_pts + n = ops.size() * 2 - 1 + pyplot.plot([ops.at( i / 2).perf for i in range(n)], + [ops.at((i + 1) / 2).t / nq * 1000 for i in range(n)], + **kwargs) + + +################################################################# +# prepare common data for all indexes +################################################################# + + + +t0 = time.time() + +print("load data") + +xt = fvecs_read("sift1M/sift_learn.fvecs") +xb = fvecs_read("sift1M/sift_base.fvecs") +xq = fvecs_read("sift1M/sift_query.fvecs") + +d = xt.shape[1] + +print("load GT") + +gt = ivecs_read("sift1M/sift_groundtruth.ivecs") +gt = gt.astype('int64') +k = gt.shape[1] + +print("prepare criterion") + +# criterion = 1-recall at 1 +crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1) +crit.set_groundtruth(None, gt) +crit.nnn = k + +# indexes that are useful when there is no limitation on memory usage +unlimited_mem_keys = [ + "IMI2x10,Flat", "IMI2x11,Flat", + "IVF4096,Flat", "IVF16384,Flat", + "PCA64,IMI2x10,Flat"] + +# memory limited to 16 bytes / vector +keys_mem_16 = [ + 'IMI2x10,PQ16', 'IVF4096,PQ16', + 'IMI2x10,PQ8+8', 'OPQ16_64,IMI2x10,PQ16' + ] + +# limited to 32 bytes / vector +keys_mem_32 = [ + 'IMI2x10,PQ32', 'IVF4096,PQ32', 'IVF16384,PQ32', + 'IMI2x10,PQ16+16', + 'OPQ32,IVF4096,PQ32', 'IVF4096,PQ16+16', 'OPQ16,IMI2x10,PQ16+16' + ] + +# indexes that can run on the GPU +keys_gpu = [ + "PCA64,IVF4096,Flat", + "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat", + "IVF4096,PQ32"] + + +keys_to_test = unlimited_mem_keys +use_gpu = False + + +if use_gpu: + # if this fails, it means that the GPU version was not comp + assert faiss.StandardGpuResources, \ + "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed" + res = faiss.StandardGpuResources() + dev_no = 0 + +# remember results from other index types +op_per_key = [] + + +# keep track of optimal operating points seen so far +op = faiss.OperatingPoints() + + +for index_key in keys_to_test: + + print("============ key", index_key) + + # make the index described by the key + index = faiss.index_factory(d, index_key) + + + if use_gpu: + # transfer to GPU (may be partial) + index = faiss.index_cpu_to_gpu(res, dev_no, index) + params = faiss.GpuParameterSpace() + else: + params = faiss.ParameterSpace() + + params.initialize(index) + + print("[%.3f s] train & add" % (time.time() - t0)) + + index.train(xt) + index.add(xb) + + print("[%.3f s] explore op points" % (time.time() - t0)) + + # find operating points for this index + opi = params.explore(index, xq, crit) + + print("[%.3f s] result operating points:" % (time.time() - t0)) + opi.display() + + # update best operating points so far + 
op.merge_with(opi, index_key + " ") + + op_per_key.append((index_key, opi)) + + if graphical_output: + # graphical output (to tmp/ subdirectory) + + fig = pyplot.figure(figsize=(12, 9)) + pyplot.xlabel("1-recall at 1") + pyplot.ylabel("search time (ms/query, %d threads)" % faiss.omp_get_max_threads()) + pyplot.gca().set_yscale('log') + pyplot.grid() + for i2, opi2 in op_per_key: + plot_OperatingPoints(opi2, crit.nq, label = i2, marker = 'o') + # plot_OperatingPoints(op, crit.nq, label = 'best', marker = 'o', color = 'r') + pyplot.legend(loc=2) + fig.savefig('tmp/demo_auto_tune.png') + + +print("[%.3f s] final result:" % (time.time() - t0)) + +op.display() diff --git a/core/src/index/thirdparty/faiss/demos/demo_imi_flat.cpp b/core/src/index/thirdparty/faiss/demos/demo_imi_flat.cpp new file mode 100644 index 0000000000..b037817321 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_imi_flat.cpp @@ -0,0 +1,151 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + + +#include +#include +#include + +#include + + +#include +#include +#include +#include + +double elapsed () +{ + struct timeval tv; + gettimeofday (&tv, nullptr); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + + +int main () +{ + double t0 = elapsed(); + + // dimension of the vectors to index + int d = 128; + + // size of the database we plan to index + size_t nb = 1000 * 1000; + + // make a set of nt training vectors in the unit cube + // (could be the database) + size_t nt = 100 * 1000; + + //--------------------------------------------------------------- + // Define the core quantizer + // We choose a multiple inverted index for faster training with less data + // and because it usually offers best accuracy/speed trade-offs + // + // We here assume that its lifespan of this coarse quantizer will cover the + // lifespan of the inverted-file quantizer IndexIVFFlat below + // With dynamic allocation, one may give the responsability to free the + // quantizer to the inverted-file index (with attribute do_delete_quantizer) + // + // Note: a regular clustering algorithm would be defined as: + // faiss::IndexFlatL2 coarse_quantizer (d); + // + // Use nhash=2 subquantizers used to define the product coarse quantizer + // Number of bits: we will have 2^nbits_coarse centroids per subquantizer + // meaning (2^12)^nhash distinct inverted lists + size_t nhash = 2; + size_t nbits_subq = int (log2 (nb+1) / 2); // good choice in general + size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids + + faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq); + + printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)", + nhash, nbits_subq, ncentroids, nb); + + // the coarse quantizer should not be dealloced before the index + // 4 = nb of bytes per code (d must be a multiple of this) + // 8 = nb of bits per sub-code (almost always 8) + faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT + faiss::IndexIVFFlat index (&coarse_quantizer, d, ncentroids, metric); + index.quantizer_trains_alone = true; + + // define the number of probes. 
2048 is for high-dim, overkilled in practice + // Use 4-1024 depending on the trade-off speed accuracy that you want + index.nprobe = 2048; + + + { // training + printf ("[%.3f s] Generating %ld vectors in %dD for training\n", + elapsed() - t0, nt, d); + + std::vector trainvecs (nt * d); + for (size_t i = 0; i < nt * d; i++) { + trainvecs[i] = drand48(); + } + + printf ("[%.3f s] Training the index\n", elapsed() - t0); + index.verbose = true; + index.train (nt, trainvecs.data()); + } + + size_t nq; + std::vector queries; + + { // populating the database + printf ("[%.3f s] Building a dataset of %ld vectors to index\n", + elapsed() - t0, nb); + + std::vector database (nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + + printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0); + + index.add (nb, database.data()); + + // remember a few elements from the database as queries + int i0 = 1234; + int i1 = 1244; + + nq = i1 - i0; + queries.resize (nq * d); + for (int i = i0; i < i1; i++) { + for (int j = 0; j < d; j++) { + queries [(i - i0) * d + j] = database [i * d + j]; + } + } + } + + { // searching the database + int k = 5; + printf ("[%.3f s] Searching the %d nearest neighbors " + "of %ld vectors in the index\n", + elapsed() - t0, k, nq); + + std::vector nns (k * nq); + std::vector dis (k * nq); + + index.search (nq, queries.data(), k, dis.data(), nns.data()); + + printf ("[%.3f s] Query results (vector ids, then distances):\n", + elapsed() - t0); + + for (int i = 0; i < nq; i++) { + printf ("query %2d: ", i); + for (int j = 0; j < k; j++) { + printf ("%7ld ", nns[j + i * k]); + } + printf ("\n dis: "); + for (int j = 0; j < k; j++) { + printf ("%7g ", dis[j + i * k]); + } + printf ("\n"); + } + } + return 0; +} diff --git a/core/src/index/thirdparty/faiss/demos/demo_imi_pq.cpp b/core/src/index/thirdparty/faiss/demos/demo_imi_pq.cpp new file mode 100644 index 0000000000..ea6f998c6e --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_imi_pq.cpp @@ -0,0 +1,199 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + + +#include +#include +#include + +#include + + +#include +#include +#include +#include + +double elapsed () +{ + struct timeval tv; + gettimeofday (&tv, nullptr); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + + +int main () +{ + double t0 = elapsed(); + + // dimension of the vectors to index + int d = 64; + + // size of the database we plan to index + size_t nb = 1000 * 1000; + size_t add_bs = 10000; // # size of the blocks to add + + // make a set of nt training vectors in the unit cube + // (could be the database) + size_t nt = 100 * 1000; + + //--------------------------------------------------------------- + // Define the core quantizer + // We choose a multiple inverted index for faster training with less data + // and because it usually offers best accuracy/speed trade-offs + // + // We here assume that its lifespan of this coarse quantizer will cover the + // lifespan of the inverted-file quantizer IndexIVFFlat below + // With dynamic allocation, one may give the responsability to free the + // quantizer to the inverted-file index (with attribute do_delete_quantizer) + // + // Note: a regular clustering algorithm would be defined as: + // faiss::IndexFlatL2 coarse_quantizer (d); + // + // Use nhash=2 subquantizers used to define the product coarse quantizer + // Number of bits: we will have 2^nbits_coarse centroids per subquantizer + // meaning (2^12)^nhash distinct inverted lists + // + // The parameter bytes_per_code is determined by the memory + // constraint, the dataset will use nb * (bytes_per_code + 8) + // bytes. + // + // The parameter nbits_subq is determined by the size of the dataset to index. + // + size_t nhash = 2; + size_t nbits_subq = 9; + size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids + int bytes_per_code = 16; + + faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq); + + printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)", + nhash, nbits_subq, ncentroids, nb); + + // the coarse quantizer should not be dealloced before the index + // 4 = nb of bytes per code (d must be a multiple of this) + // 8 = nb of bits per sub-code (almost always 8) + faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT + faiss::IndexIVFPQ index (&coarse_quantizer, d, ncentroids, bytes_per_code, 8); + index.quantizer_trains_alone = true; + + // define the number of probes. 2048 is for high-dim, overkill in practice + // Use 4-1024 depending on the trade-off speed accuracy that you want + index.nprobe = 2048; + + + { // training. + + // The distribution of the training vectors should be the same + // as the database vectors. It could be a sub-sample of the + // database vectors, if sampling is not biased. Here we just + // randomly generate the vectors. 
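+        //
+        // (For reference, with the values chosen above: nhash = 2 and
+        // nbits_subq = 9 give ncentroids = 1 << 18 = 262,144 inverted lists,
+        // and the nb * (bytes_per_code + 8) estimate works out to
+        // 1,000,000 * (16 + 8) = 24,000,000 bytes for the encoded database.)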
+ + printf ("[%.3f s] Generating %ld vectors in %dD for training\n", + elapsed() - t0, nt, d); + + std::vector trainvecs (nt * d); + for (size_t i = 0; i < nt; i++) { + for (size_t j = 0; j < d; j++) { + trainvecs[i * d + j] = drand48(); + } + } + + printf ("[%.3f s] Training the index\n", elapsed() - t0); + index.verbose = true; + index.train (nt, trainvecs.data()); + } + + // the index can be re-loaded later with + // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex"); + faiss::write_index(&index, "/tmp/trained_index.faissindex"); + + size_t nq; + std::vector queries; + + { // populating the database + printf ("[%.3f s] Building a dataset of %ld vectors to index\n", + elapsed() - t0, nb); + + std::vector database (nb * d); + std::vector ids (nb); + for (size_t i = 0; i < nb; i++) { + for (size_t j = 0; j < d; j++) { + database[i * d + j] = drand48(); + } + ids[i] = 8760000000L + i; + } + + printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0); + + for (size_t begin = 0; begin < nb; begin += add_bs) { + size_t end = std::min (begin + add_bs, nb); + index.add_with_ids (end - begin, + database.data() + d * begin, + ids.data() + begin); + } + + // remember a few elements from the database as queries + int i0 = 1234; + int i1 = 1244; + + nq = i1 - i0; + queries.resize (nq * d); + for (int i = i0; i < i1; i++) { + for (int j = 0; j < d; j++) { + queries [(i - i0) * d + j] = database [i * d + j]; + } + } + } + + // A few notes on the internal format of the index: + // + // - the positing lists for PQ codes are index.codes, which is a + // std::vector < std::vector > + // if n is the length of posting list #i, codes[i] has length bytes_per_code * n + // + // - the corresponding ids are stored in index.ids + // + // - given a vector float *x, finding which k centroids are + // closest to it (ie to find the nearest neighbors) can be done with + // + // long *centroid_ids = new long[k]; + // float *distances = new float[k]; + // index.quantizer->search (1, x, k, dis, centroids_ids); + // + + faiss::write_index(&index, "/tmp/populated_index.faissindex"); + + { // searching the database + int k = 5; + printf ("[%.3f s] Searching the %d nearest neighbors " + "of %ld vectors in the index\n", + elapsed() - t0, k, nq); + + std::vector nns (k * nq); + std::vector dis (k * nq); + + index.search (nq, queries.data(), k, dis.data(), nns.data()); + + printf ("[%.3f s] Query results (vector ids, then distances):\n", + elapsed() - t0); + + for (int i = 0; i < nq; i++) { + printf ("query %2d: ", i); + for (int j = 0; j < k; j++) { + printf ("%7ld ", nns[j + i * k]); + } + printf ("\n dis: "); + for (int j = 0; j < k; j++) { + printf ("%7g ", dis[j + i * k]); + } + printf ("\n"); + } + } + return 0; +} diff --git a/core/src/index/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp b/core/src/index/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp new file mode 100644 index 0000000000..743395ec2f --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp @@ -0,0 +1,146 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + + +#include +#include +#include + +#include + + +#include +#include +#include + +double elapsed () +{ + struct timeval tv; + gettimeofday (&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + + +int main () +{ + + double t0 = elapsed(); + + // dimension of the vectors to index + int d = 128; + + // size of the database we plan to index + size_t nb = 200 * 1000; + + // make a set of nt training vectors in the unit cube + // (could be the database) + size_t nt = 100 * 1000; + + // make the index object and train it + faiss::IndexFlatL2 coarse_quantizer (d); + + // a reasonable number of centroids to index nb vectors + int ncentroids = int (4 * sqrt (nb)); + + // the coarse quantizer should not be dealloced before the index + // 4 = nb of bytes per code (d must be a multiple of this) + // 8 = nb of bits per sub-code (almost always 8) + faiss::IndexIVFPQ index (&coarse_quantizer, d, + ncentroids, 4, 8); + + + { // training + printf ("[%.3f s] Generating %ld vectors in %dD for training\n", + elapsed() - t0, nt, d); + + std::vector trainvecs (nt * d); + for (size_t i = 0; i < nt * d; i++) { + trainvecs[i] = drand48(); + } + + printf ("[%.3f s] Training the index\n", + elapsed() - t0); + index.verbose = true; + + index.train (nt, trainvecs.data()); + } + + { // I/O demo + const char *outfilename = "/tmp/index_trained.faissindex"; + printf ("[%.3f s] storing the pre-trained index to %s\n", + elapsed() - t0, outfilename); + + write_index (&index, outfilename); + } + + size_t nq; + std::vector queries; + + { // populating the database + printf ("[%.3f s] Building a dataset of %ld vectors to index\n", + elapsed() - t0, nb); + + std::vector database (nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + + printf ("[%.3f s] Adding the vectors to the index\n", + elapsed() - t0); + + index.add (nb, database.data()); + + printf ("[%.3f s] imbalance factor: %g\n", + elapsed() - t0, index.invlists->imbalance_factor ()); + + // remember a few elements from the database as queries + int i0 = 1234; + int i1 = 1243; + + nq = i1 - i0; + queries.resize (nq * d); + for (int i = i0; i < i1; i++) { + for (int j = 0; j < d; j++) { + queries [(i - i0) * d + j] = database [i * d + j]; + } + } + + } + + { // searching the database + int k = 5; + printf ("[%.3f s] Searching the %d nearest neighbors " + "of %ld vectors in the index\n", + elapsed() - t0, k, nq); + + std::vector nns (k * nq); + std::vector dis (k * nq); + + index.search (nq, queries.data(), k, dis.data(), nns.data()); + + printf ("[%.3f s] Query results (vector ids, then distances):\n", + elapsed() - t0); + + for (int i = 0; i < nq; i++) { + printf ("query %2d: ", i); + for (int j = 0; j < k; j++) { + printf ("%7ld ", nns[j + i * k]); + } + printf ("\n dis: "); + for (int j = 0; j < k; j++) { + printf ("%7g ", dis[j + i * k]); + } + printf ("\n"); + } + + printf ("note that the nearest neighbor is not at " + "distance 0 due to quantization errors\n"); + } + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/demos/demo_ondisk_ivf.py b/core/src/index/thirdparty/faiss/demos/demo_ondisk_ivf.py new file mode 100644 index 0000000000..c89acc8402 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_ondisk_ivf.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python2 + +import sys +import numpy as np +import faiss + + +################################################################# +# Small I/O functions +################################################################# + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +################################################################# +# Main program +################################################################# + +stage = int(sys.argv[1]) + +tmpdir = '/tmp/' + +if stage == 0: + # train the index + xt = fvecs_read("sift1M/sift_learn.fvecs") + index = faiss.index_factory(xt.shape[1], "IVF4096,Flat") + print("training index") + index.train(xt) + print("write " + tmpdir + "trained.index") + faiss.write_index(index, tmpdir + "trained.index") + + +if 1 <= stage <= 4: + # add 1/4 of the database to 4 independent indexes + bno = stage - 1 + xb = fvecs_read("sift1M/sift_base.fvecs") + i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4) + index = faiss.read_index(tmpdir + "trained.index") + print("adding vectors %d:%d" % (i0, i1)) + index.add_with_ids(xb[i0:i1], np.arange(i0, i1)) + print("write " + tmpdir + "block_%d.index" % bno) + faiss.write_index(index, tmpdir + "block_%d.index" % bno) + + +if stage == 5: + # merge the images into an on-disk index + # first load the inverted lists + ivfs = [] + for bno in range(4): + # the IO_FLAG_MMAP is to avoid actually loading the data thus + # the total size of the inverted lists can exceed the + # available RAM + print("read " + tmpdir + "block_%d.index" % bno) + index = faiss.read_index(tmpdir + "block_%d.index" % bno, + faiss.IO_FLAG_MMAP) + ivfs.append(index.invlists) + + # avoid that the invlists get deallocated with the index + index.own_invlists = False + + # construct the output index + index = faiss.read_index(tmpdir + "trained.index") + + # prepare the output inverted lists. They will be written + # to merged_index.ivfdata + invlists = faiss.OnDiskInvertedLists( + index.nlist, index.code_size, + tmpdir + "merged_index.ivfdata") + + # merge all the inverted lists + ivf_vector = faiss.InvertedListsPtrVector() + for ivf in ivfs: + ivf_vector.push_back(ivf) + + print("merge %d inverted lists " % ivf_vector.size()) + ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size()) + + # now replace the inverted lists in the output index + index.ntotal = ntotal + index.replace_invlists(invlists) + + print("write " + tmpdir + "populated.index") + faiss.write_index(index, tmpdir + "populated.index") + + +if stage == 6: + # perform a search from disk + print("read " + tmpdir + "populated.index") + index = faiss.read_index(tmpdir + "populated.index") + index.nprobe = 16 + + # load query vectors and ground-truth + xq = fvecs_read("sift1M/sift_query.fvecs") + gt = ivecs_read("sift1M/sift_groundtruth.ivecs") + + D, I = index.search(xq, 5) + + recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0]) + print("recall@1: %.3f" % recall_at_1) diff --git a/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp b/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp new file mode 100644 index 0000000000..8b6fe0f4f4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp @@ -0,0 +1,252 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + + +/** + * To run this demo, please download the ANN_SIFT1M dataset from + * + * http://corpus-texmex.irisa.fr/ + * + * and unzip it to the sudirectory sift1M. + **/ + +/***************************************************** + * I/O functions for fvecs and ivecs + *****************************************************/ + + +float * fvecs_read (const char *fname, + size_t *d_out, size_t *n_out) +{ + FILE *f = fopen(fname, "r"); + if(!f) { + fprintf(stderr, "could not open %s\n", fname); + perror(""); + abort(); + } + int d; + fread(&d, 1, sizeof(int), f); + assert((d > 0 && d < 1000000) || !"unreasonable dimension"); + fseek(f, 0, SEEK_SET); + struct stat st; + fstat(fileno(f), &st); + size_t sz = st.st_size; + assert(sz % ((d + 1) * 4) == 0 || !"weird file size"); + size_t n = sz / ((d + 1) * 4); + + *d_out = d; *n_out = n; + float *x = new float[n * (d + 1)]; + size_t nr = fread(x, sizeof(float), n * (d + 1), f); + assert(nr == n * (d + 1) || !"could not read whole file"); + + // shift array to remove row headers + for(size_t i = 0; i < n; i++) + memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x)); + + fclose(f); + return x; +} + +// not very clean, but works as long as sizeof(int) == sizeof(float) +int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out) +{ + return (int*)fvecs_read(fname, d_out, n_out); +} + +double elapsed () +{ + struct timeval tv; + gettimeofday (&tv, nullptr); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + + + +int main() +{ + double t0 = elapsed(); + + // this is typically the fastest one. 
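+    // "IVF4096,Flat" asks index_factory for an inverted-file index with
+    // 4096 coarse centroids and exact (flat) storage of the vectors in
+    // each list. As far as we understand, it is roughly equivalent to
+    // constructing the index by hand:
+    //
+    //     faiss::IndexFlatL2 *coarse = new faiss::IndexFlatL2(d);
+    //     faiss::IndexIVFFlat *ivf = new faiss::IndexIVFFlat(coarse, d, 4096);
+    //
+    // the factory string is used here so the index type can be switched by
+    // editing a single line.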
+ const char *index_key = "IVF4096,Flat"; + + // these ones have better memory usage + // const char *index_key = "Flat"; + // const char *index_key = "PQ32"; + // const char *index_key = "PCA80,Flat"; + // const char *index_key = "IVF4096,PQ8+16"; + // const char *index_key = "IVF4096,PQ32"; + // const char *index_key = "IMI2x8,PQ32"; + // const char *index_key = "IMI2x8,PQ8+16"; + // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16"; + + faiss::Index * index; + + size_t d; + + { + printf ("[%.3f s] Loading train set\n", elapsed() - t0); + + size_t nt; + float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt); + + printf ("[%.3f s] Preparing index \"%s\" d=%ld\n", + elapsed() - t0, index_key, d); + index = faiss::index_factory(d, index_key); + + printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt); + + index->train(nt, xt); + delete [] xt; + } + + + { + printf ("[%.3f s] Loading database\n", elapsed() - t0); + + size_t nb, d2; + float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb); + assert(d == d2 || !"dataset does not have same dimension as train set"); + + printf ("[%.3f s] Indexing database, size %ld*%ld\n", + elapsed() - t0, nb, d); + + index->add(nb, xb); + + delete [] xb; + } + + size_t nq; + float *xq; + + { + printf ("[%.3f s] Loading queries\n", elapsed() - t0); + + size_t d2; + xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq); + assert(d == d2 || !"query does not have same dimension as train set"); + + } + + size_t k; // nb of results per query in the GT + faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors + + { + printf ("[%.3f s] Loading ground truth for %ld queries\n", + elapsed() - t0, nq); + + // load ground-truth and convert int to long + size_t nq2; + int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2); + assert(nq2 == nq || !"incorrect nb of ground truth entries"); + + gt = new faiss::Index::idx_t[k * nq]; + for(int i = 0; i < k * nq; i++) { + gt[i] = gt_int[i]; + } + delete [] gt_int; + } + + // Result of the auto-tuning + std::string selected_params; + + { // run auto-tuning + + printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 " + "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq); + + faiss::OneRecallAtRCriterion crit(nq, 1); + crit.set_groundtruth (k, nullptr, gt); + crit.nnn = k; // by default, the criterion will request only 1 NN + + printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0); + + faiss::ParameterSpace params; + params.initialize(index); + + printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n", + elapsed() - t0, params.parameter_ranges.size(), + params.n_combinations()); + + faiss::OperatingPoints ops; + params.explore (index, nq, xq, crit, &ops); + + printf ("[%.3f s] Found the following operating points: \n", + elapsed() - t0); + + ops.display (); + + // keep the first parameter that obtains > 0.5 1-recall@1 + for (int i = 0; i < ops.optimal_pts.size(); i++) { + if (ops.optimal_pts[i].perf > 0.5) { + selected_params = ops.optimal_pts[i].key; + break; + } + } + assert (selected_params.size() >= 0 || + !"could not find good enough op point"); + } + + + { // Use the found configuration to perform a search + + faiss::ParameterSpace params; + + printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n", + elapsed() - t0, selected_params.c_str()); + + params.set_index_parameters (index, selected_params.c_str()); + + printf ("[%.3f s] Perform a search on %ld queries\n", + elapsed() - t0, nq); + + // output buffers + 
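+        // I will hold the nq * k neighbor ids and D the matching distances,
+        // row-major with k entries per query. The evaluation loop further
+        // down counts, for each query, whether the ground-truth nearest
+        // neighbor gt[i * k] appears among the first 1, 10 or 100 results
+        // (recall@1, recall@10, recall@100).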
faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k]; + float *D = new float[nq * k]; + + index->search(nq, xq, k, D, I); + + printf ("[%.3f s] Compute recalls\n", elapsed() - t0); + + // evaluate result by hand. + int n_1 = 0, n_10 = 0, n_100 = 0; + for(int i = 0; i < nq; i++) { + int gt_nn = gt[i * k]; + for(int j = 0; j < k; j++) { + if (I[i * k + j] == gt_nn) { + if(j < 1) n_1++; + if(j < 10) n_10++; + if(j < 100) n_100++; + } + } + } + printf("R@1 = %.4f\n", n_1 / float(nq)); + printf("R@10 = %.4f\n", n_10 / float(nq)); + printf("R@100 = %.4f\n", n_100 / float(nq)); + + } + + delete [] xq; + delete [] gt; + delete index; + return 0; +} diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux new file mode 100644 index 0000000000..409e99ccdb --- /dev/null +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux @@ -0,0 +1,140 @@ +# -*- makefile -*- +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# tested on CentOS 7, Ubuntu 16 and Ubuntu 14, see below to adjust flags to distribution. + + +CXX = g++ -std=c++11 +CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare +CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +LDFLAGS = -fPIC -fopenmp + +# common linux flags +SHAREDEXT = so +SHAREDFLAGS = -shared +MKDIR_P = mkdir -p + +prefix ?= /usr/local +exec_prefix ?= ${prefix} +libdir = ${exec_prefix}/lib +includedir = ${prefix}/include + +########################################################################## +# Uncomment one of the 4 BLAS/Lapack implementation options +# below. They are sorted # from fastest to slowest (in our +# experiments). +########################################################################## + +# +# 1. Intel MKL +# +# This is the fastest BLAS implementation we tested. Unfortunately it +# is not open-source and determining the correct linking flags is a +# nightmare. See +# +# https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor +# +# The latest tested version is MKL 2017.0.098 (2017 Initial Release) and can +# be downloaded here: +# +# https://registrationcenter.intel.com/en/forms/?productid=2558&licensetype=2 +# +# The following settings are working if MKL is installed on its default folder: +# +# MKLROOT = /opt/intel/compilers_and_libraries/linux/mkl/ +# +# LDFLAGS += -Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 +# LIBS += -lmkl_intel_ilp64 -lmkl_core -lmkl_gnu_thread -ldl -lpthread +# +# CPPFLAGS += -DFINTEGER=long +# +# You may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime. +# +# If at runtime you get the error: +# Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so +# you may set +# LD_PRELOAD=$MKLROOT/lib/intel64/libmkl_core.so:$MKLROOT/lib/intel64/libmkl_sequential.so +# at runtime as well. + +# +# 2. Openblas +# +# The library contains both BLAS and Lapack. About 30% slower than MKL. Please see +# https://github.com/facebookresearch/faiss/wiki/Troubleshooting#slow-brute-force-search-with-openblas +# to fix performance problemes with OpenBLAS + +# for Ubuntu 16: +# sudo apt-get install libopenblas-dev python-numpy python-dev + +# for Ubuntu 14: +# sudo apt-get install libopenblas-dev liblapack3 python-numpy python-dev + +CPPFLAGS += -DFINTEGER=int +LIBS += -lopenblas -llapack + +# 3. Atlas +# +# Automatically tuned linear algebra package. 
As the name indicates, +# it is tuned automatically for a give architecture, and in Linux +# distributions, it the architecture is typically indicated by the +# directory name, eg. atlas-sse3 = optimized for SSE3 architecture. +# +# BLASCFLAGS=-DFINTEGER=int +# BLASLDFLAGS=/usr/lib64/atlas-sse3/libptf77blas.so.3 /usr/lib64/atlas-sse3/liblapack.so +# +# 4. reference implementation +# +# This is just a compiled version of the reference BLAS +# implementation, that is not optimized at all. +# +# CPPFLAGS += -DFINTEGER=int +# LIBS += /usr/lib64/libblas.so.3 /usr/lib64/liblapack.so.3.2 +# + + +########################################################################## +# SWIG and Python flags +########################################################################## + +# SWIG executable. This should be at least version 3.x +SWIG = swig + +# The Python include directories for a given python executable can +# typically be found with +# +# python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" +# python -c "import numpy ; print numpy.get_include()" +# +# or, for Python 3, with +# +# python3 -c "import distutils.sysconfig; print(distutils.sysconfig.get_python_inc())" +# python3 -c "import numpy ; print(numpy.get_include())" +# + +PYTHONCFLAGS = -I/usr/include/python2.7/ -I/usr/lib64/python2.7/site-packages/numpy/core/include/ +PYTHONLIB = -lpython +PYTHON = /usr/bin/python + +########################################################################### +# Cuda GPU flags +########################################################################### + + + +# root of the cuda 8 installation +CUDAROOT = /usr/local/cuda-8.0 +NVCC = $(CUDAROOT)/bin/nvcc +NVCCLDFLAGS = -L$(CUDAROOT)/lib64 +NVCCLIBS = -lcudart -lcublas -lcuda +CUDACFLAGS = -I$(CUDAROOT)/include +NVCCFLAGS = -I $(CUDAROOT)/targets/x86_64-linux/include/ \ +-Xcompiler -fPIC \ +-Xcudafe --diag_suppress=unrecognized_attribute \ +-gencode arch=compute_35,code="compute_35" \ +-gencode arch=compute_52,code="compute_52" \ +-gencode arch=compute_60,code="compute_60" \ +-lineinfo \ +-ccbin $(CXX) -DFAISS_USE_FLOAT16 diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew new file mode 100644 index 0000000000..9152f6a1ac --- /dev/null +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew @@ -0,0 +1,99 @@ +# -*- makefile -*- +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# Tested on macOS Sierra (10.12.2) with llvm installed using Homebrew (https://brew.sh) +# brew install llvm +CXX = /usr/local/opt/llvm/bin/clang++ -std=c++11 +CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare -I/usr/local/opt/llvm/include +CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +LLVM_VERSION_PATH=$(shell ls -rt /usr/local/Cellar/llvm/ | tail -n1) +LDFLAGS = -fPIC -fopenmp -L/usr/local/opt/llvm/lib -L/usr/local/Cellar/llvm/${LLVM_VERSION_PATH}/lib + +# common mac flags +SHAREDEXT = dylib +SHAREDFLAGS = -dynamiclib +MKDIR_P = mkdir -p + +prefix ?= /usr/local +exec_prefix ?= ${prefix} +libdir = ${exec_prefix}/lib +includedir = ${prefix}/include + +########################################################################## +# Uncomment one of the 4 BLAS/Lapack implementation options +# below. They are sorted # from fastest to slowest (in our +# experiments). 
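+# In this file the option left enabled below is 3 (Apple's Accelerate
+# framework); to use MKL or OpenBLAS instead, comment out the two
+# Accelerate lines and uncomment the corresponding block.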
+########################################################################## + +# +# 1. Intel MKL +# +# This is the fastest BLAS implementation we tested. Unfortunately it +# is not open-source and determining the correct linking flags is a +# nightmare. See +# +# https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor +# +# The latest tested version is MKL 2017.0.098 (2017 Initial Release) and can +# be downloaded here: +# +# https://registrationcenter.intel.com/en/forms/?productid=2558&licensetype=2 +# +# The following settings are working if MKL is installed on its default folder: +# +# MKLROOT = /opt/intel/compilers_and_libraries/linux/mkl/ +# +# LDFLAGS += -Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 +# LIBS += -lmkl_intel_ilp64 -lmkl_core -lmkl_gnu_thread -ldl -lpthread +# +# CPPFLAGS += -DFINTEGER=long +# +# You may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime. + +# +# 2. Openblas +# +# The library contains both BLAS and Lapack. Install with brew install OpenBLAS +# +# CPPFLAGS += -DFINTEGER=int +# LIBS += /usr/local/opt/openblas/lib/libblas.dylib +# + +# +# 3. Apple's framework accelerate +# +# This has the advantage that it does not require to install anything, +# as it is provided by default on the mac. It is not very fast, though. +# + +CPPFLAGS += -DFINTEGER=int +LIBS += -framework Accelerate + + + +########################################################################## +# SWIG and Python flags +########################################################################## + +# SWIG executable. This should be at least version 3.x +# brew install swig + +SWIG = /usr/local/bin/swig + +# The Python include directories for the current python executable + +PYTHON_INC=$(shell python -c "import distutils.sysconfig; print(distutils.sysconfig.get_python_inc())") +NUMPY_INC=$(shell python -c "import numpy ; print(numpy.get_include())") +PYTHONCFLAGS=-I${PYTHON_INC} -I${NUMPY_INC} +PYTHONLIB=-lpython + +########################################################################## +# Faiss GPU +########################################################################## + +# As we don't have access to a Mac with nvidia GPUs installed, we +# could not validate the GPU compile of Faiss. diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port new file mode 100644 index 0000000000..1ed397bfcc --- /dev/null +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port @@ -0,0 +1,106 @@ +# -*- makefile -*- +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# tested on Mac OS X 10.12.2 Sierra with additional software installed via macports + + +# The system default clang does not support openmp +# You can install an openmp compatible g++ with macports: +# port install g++-mp-6 +CXX = /opt/local/bin/g++-mp-6 -std=c++11 +CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare +CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +LDFLAGS = -g -fPIC -fopenmp + +# common linux flags +SHAREDEXT = dylib +SHAREDFLAGS = -dynamiclib +MKDIR_P = mkdir -p + +prefix ?= /usr/local +exec_prefix ?= ${prefix} +libdir = ${exec_prefix}/lib +includedir = ${prefix}/include + +########################################################################## +# Uncomment one of the 4 BLAS/Lapack implementation options +# below. 
They are sorted # from fastest to slowest (in our +# experiments). +########################################################################## + +# +# 1. Intel MKL +# +# This is the fastest BLAS implementation we tested. Unfortunately it +# is not open-source and determining the correct linking flags is a +# nightmare. See +# +# https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor +# +# The latest tested version is MKL 2017.0.098 (2017 Initial Release) and can +# be downloaded here: +# +# https://registrationcenter.intel.com/en/forms/?productid=2558&licensetype=2 +# +# The following settings are working if MKL is installed on its default folder: +# +# MKLROOT = /opt/intel/compilers_and_libraries/linux/mkl/ +# +# LDFLAGS += -Wl,--no-as-needed -L$(MKLROOT)/lib/intel64 +# LIBS += -lmkl_intel_ilp64 -lmkl_core -lmkl_gnu_thread -ldl -lpthread +# +# CPPFLAGS += -DFINTEGER=long +# +# You may have to set the LD_LIBRARY_PATH=$MKLROOT/lib/intel64 at runtime. + +# +# 2. Openblas +# +# The library contains both BLAS and Lapack. Install with port install OpenBLAS +# +# CPPFLAGS += -DFINTEGER=int +# LIBS += /opt/local/lib/libopenblas.dylib +# + +# +# 3. Apple's framework accelerate +# +# This has the advantage that it does not require to install anything, +# as it is provided by default on the mac. It is not very fast, though. +# + +CPPFLAGS += -DFINTEGER=int +LIBS += -framework Accelerate + + + +########################################################################## +# SWIG and Python flags +########################################################################## + +# SWIG executable. This should be at least version 3.x +# port install swig swig-python + +SWIG = /opt/local/bin/swig + +# The Python include directories for the current python executable can +# typically be found with +# +# python -c "import distutils.sysconfig; print distutils.sysconfig.get_python_inc()" +# python -c "import numpy ; print numpy.get_include()" +# +# the paths below are for the system python (not the macports one) + +PYTHONCFLAGS=-I/System/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 \ +-I/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/include +PYTHONLIB=-lpython + +########################################################################## +# Faiss GPU +########################################################################## + +# As we don't have access to a Mac with nvidia GPUs installed, we +# could not validate the GPU compile of Faiss. diff --git a/core/src/index/thirdparty/faiss/faiss b/core/src/index/thirdparty/faiss/faiss new file mode 120000 index 0000000000..6a043149e8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/faiss @@ -0,0 +1 @@ +./ \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.cpp b/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.cpp new file mode 100644 index 0000000000..c734fdabb5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.cpp @@ -0,0 +1,95 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
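+ *
+ * GpuParameterSpace mirrors faiss::ParameterSpace for GPU indexes: it
+ * exposes nprobe (capped by the GPU k-selection limit and the number of
+ * inverted lists) and use_precomputed_table as auto-tunable parameters.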
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + + +using namespace ::faiss; + +/********************************************************** + * Parameters to auto-tune on GpuIndex'es + **********************************************************/ + +#define DC(classname) auto ix = dynamic_cast(index) + + +void GpuParameterSpace::initialize (const Index * index) +{ + if (DC (IndexPreTransform)) { + index = ix->index; + } + if (DC (IndexReplicas)) { + if (ix->count() == 0) return; + index = ix->at(0); + } + if (DC (IndexShards)) { + if (ix->count() == 0) return; + index = ix->at(0); + } + if (DC (GpuIndexIVF)) { + ParameterRange & pr = add_range("nprobe"); + for (int i = 0; i < 12; i++) { + size_t nprobe = 1 << i; + if (nprobe >= ix->getNumLists() || + nprobe > getMaxKSelection()) break; + pr.values.push_back (nprobe); + } + } + // not sure we should call the parent initializer +} + + + +#undef DC +// non-const version +#define DC(classname) auto *ix = dynamic_cast(index) + + + +void GpuParameterSpace::set_index_parameter ( + Index * index, const std::string & name, double val) const +{ + if (DC (IndexReplicas)) { + for (int i = 0; i < ix->count(); i++) + set_index_parameter (ix->at(i), name, val); + return; + } + if (name == "nprobe") { + if (DC (GpuIndexIVF)) { + ix->setNumProbes (int (val)); + return; + } + } + if (name == "use_precomputed_table") { + if (DC (GpuIndexIVFPQ)) { + ix->setPrecomputedCodes(bool (val)); + return; + } + } + + // maybe normal index parameters apply? + ParameterSpace::set_index_parameter (index, name, val); +} + + + + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.h b/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.h new file mode 100644 index 0000000000..1bcc9205d8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuAutoTune.h @@ -0,0 +1,27 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + + +/// parameter space and setters for GPU indexes +struct GpuParameterSpace: faiss::ParameterSpace { + /// initialize with reasonable parameters for the index + void initialize (const faiss::Index * index) override; + + /// set a combination of parameters on an index + void set_index_parameter ( + faiss::Index * index, const std::string & name, + double val) const override; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp new file mode 100644 index 0000000000..1610aaf1cd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp @@ -0,0 +1,475 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
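+ *
+ * ToCPUCloner copies a GPU index back to its CPU equivalent,
+ * ToGpuCloner copies a CPU index to a single GPU, and
+ * ToGpuClonerMultiple spreads a CPU index over several GPUs, either
+ * replicated (IndexReplicas) or sharded (IndexShards).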
+ */ + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + + +/********************************************************** + * Cloning to CPU + **********************************************************/ + +void ToCPUCloner::merge_index(Index *dst, Index *src, bool successive_ids) +{ + if (auto ifl = dynamic_cast(dst)) { + auto ifl2 = dynamic_cast(src); + FAISS_ASSERT(ifl2); + FAISS_ASSERT(successive_ids); + ifl->add(ifl2->ntotal, ifl2->xb.data()); + } else if(auto ifl = dynamic_cast(dst)) { + auto ifl2 = dynamic_cast(src); + FAISS_ASSERT(ifl2); + ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0); + } else if(auto ifl = dynamic_cast(dst)) { + auto ifl2 = dynamic_cast(src); + FAISS_ASSERT(ifl2); + ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0); + } else if(auto ifl = dynamic_cast(dst)) { + auto ifl2 = dynamic_cast(src); + FAISS_ASSERT(ifl2); + ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0); + } else { + FAISS_ASSERT(!"merging not implemented for this type of class"); + } +} + + +Index *ToCPUCloner::clone_Index(const Index *index) +{ + if(auto ifl = dynamic_cast(index)) { + IndexFlat *res = new IndexFlat(); + ifl->copyTo(res); + return res; + } else if(auto ifl = dynamic_cast(index)) { + IndexIVFFlat *res = new IndexIVFFlat(); + ifl->copyTo(res); + return res; + } else if(auto ifl = + dynamic_cast(index)) { + IndexIVFScalarQuantizer *res = new IndexIVFScalarQuantizer(); + ifl->copyTo(res); + return res; + } else if(auto ifl = + dynamic_cast(index)) { + IndexIVFSQHybrid *res = new IndexIVFSQHybrid(); + ifl->copyTo(res); + return res; + } else if(auto ipq = dynamic_cast(index)) { + IndexIVFPQ *res = new IndexIVFPQ(); + ipq->copyTo(res); + return res; + + // for IndexShards and IndexReplicas we assume that the + // objective is to make a single component out of them + // (inverse op of ToGpuClonerMultiple) + + } else if(auto ish = dynamic_cast(index)) { + int nshard = ish->count(); + FAISS_ASSERT(nshard > 0); + Index *res = clone_Index(ish->at(0)); + for(int i = 1; i < ish->count(); i++) { + Index *res_i = clone_Index(ish->at(i)); + merge_index(res, res_i, ish->successive_ids); + delete res_i; + } + return res; + } else if(auto ipr = dynamic_cast(index)) { + // just clone one of the replicas + FAISS_ASSERT(ipr->count() > 0); + return clone_Index(ipr->at(0)); + } else { + return Cloner::clone_Index(index); + } +} + +faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index) +{ + ToCPUCloner cl; + return cl.clone_Index(gpu_index); +} + + + + +/********************************************************** + * Cloning to 1 GPU + **********************************************************/ + +ToGpuCloner::ToGpuCloner(GpuResources *resources, int device, + const GpuClonerOptions &options): + GpuClonerOptions(options), resources(resources), device(device) +{} + +Index *ToGpuCloner::clone_Index (IndexComposition* index_composition) { + Index* index = index_composition->index; + + if(auto ifl = dynamic_cast(index)) { + gpu::GpuIndexFlat *&quantizer = index_composition->quantizer; + long mode = index_composition->mode; + + GpuIndexIVFSQHybridConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + + GpuIndexIVFSQHybrid *res = + new 
GpuIndexIVFSQHybrid(resources, + ifl->d, + ifl->nlist, + ifl->sq.qtype, + ifl->metric_type, + ifl->by_residual, + config); + if(reserveVecs > 0 && ifl->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + res->copyFrom(ifl, quantizer, mode); + return res; + } else { + return clone_Index(index); + } +} + +Index *ToGpuCloner::clone_Index(const Index *index) +{ + auto ivf_sqh = dynamic_cast(index); + if(ivf_sqh) { + auto ifl = ivf_sqh; + GpuIndexIVFSQHybridConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + + GpuIndexIVFSQHybrid *res = + new GpuIndexIVFSQHybrid(resources, + ifl->d, + ifl->nlist, + ifl->sq.qtype, + ifl->metric_type, + ifl->by_residual, + config); + if(reserveVecs > 0 && ifl->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + res->copyFrom(ifl); + return res; + } else if(auto ifl = dynamic_cast(index)) { + GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.storeTransposed = storeTransposed; + config.storeInCpu = storeInCpu; + + return new GpuIndexFlat(resources, ifl, config); + } else if(auto ifl = dynamic_cast(index)) { + GpuIndexIVFFlatConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + + GpuIndexIVFFlat *res = + new GpuIndexIVFFlat(resources, + ifl->d, + ifl->nlist, + ifl->metric_type, + config); + if(reserveVecs > 0 && ifl->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + res->copyFrom(ifl); + return res; + } else if(auto ifl = + dynamic_cast(index)) { + GpuIndexIVFScalarQuantizerConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + + GpuIndexIVFScalarQuantizer *res = + new GpuIndexIVFScalarQuantizer(resources, + ifl->d, + ifl->nlist, + ifl->sq.qtype, + ifl->metric_type, + ifl->by_residual, + config); + if(reserveVecs > 0 && ifl->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + res->copyFrom(ifl); + return res; + } else if(auto ipq = dynamic_cast(index)) { + if(verbose) + printf(" IndexIVFPQ size %ld -> GpuIndexIVFPQ " + "indicesOptions=%d " + "usePrecomputed=%d useFloat16=%d reserveVecs=%ld\n", + ipq->ntotal, indicesOptions, usePrecomputed, + useFloat16, reserveVecs); + GpuIndexIVFPQConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + config.useFloat16LookupTables = useFloat16; + config.usePrecomputedTables = usePrecomputed; + + GpuIndexIVFPQ *res = new GpuIndexIVFPQ(resources, ipq, config); + + if(reserveVecs > 0 && ipq->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + return res; + } else { + return Cloner::clone_Index(index); + } +} + + +faiss::Index * index_cpu_to_gpu( + GpuResources* resources, int device, + const faiss::Index *index, + const GpuClonerOptions *options) +{ + GpuClonerOptions defaults; + ToGpuCloner cl(resources, device, options ? 
*options : defaults); + return cl.clone_Index(index); +} + +faiss::Index * index_cpu_to_gpu( + GpuResources* resources, int device, + IndexComposition* index_composition, + const GpuClonerOptions *options) { + GpuClonerOptions defaults; + ToGpuCloner cl(resources, device, options ? *options : defaults); + return cl.clone_Index(index_composition); +} + + +/********************************************************** + * Cloning to multiple GPUs + **********************************************************/ + +ToGpuClonerMultiple::ToGpuClonerMultiple( + std::vector & resources, + std::vector& devices, + const GpuMultipleClonerOptions &options): + GpuMultipleClonerOptions(options) +{ + FAISS_ASSERT(resources.size() == devices.size()); + for(int i = 0; i < resources.size(); i++) { + sub_cloners.push_back(ToGpuCloner(resources[i], devices[i], options)); + } +} + + +ToGpuClonerMultiple::ToGpuClonerMultiple( + const std::vector & sub_cloners, + const GpuMultipleClonerOptions &options): + GpuMultipleClonerOptions(options), + sub_cloners(sub_cloners) +{} + + +void ToGpuClonerMultiple::copy_ivf_shard ( + const IndexIVF *index_ivf, IndexIVF *idx2, + long n, long i) +{ + if (shard_type == 2) { + long i0 = i * index_ivf->ntotal / n; + long i1 = (i + 1) * index_ivf->ntotal / n; + + if(verbose) + printf("IndexShards shard %ld indices %ld:%ld\n", + i, i0, i1); + index_ivf->copy_subset_to(*idx2, 2, i0, i1); + FAISS_ASSERT(idx2->ntotal == i1 - i0); + } else if (shard_type == 1) { + if(verbose) + printf("IndexShards shard %ld select modulo %ld = %ld\n", + i, n, i); + index_ivf->copy_subset_to(*idx2, 1, n, i); + } else { + FAISS_THROW_FMT ("shard_type %d not implemented", shard_type); + } + +} + +Index * ToGpuClonerMultiple::clone_Index_to_shards (const Index *index) +{ + long n = sub_cloners.size(); + + auto index_ivfpq = + dynamic_cast(index); + auto index_ivfflat = + dynamic_cast(index); + auto index_ivfsq = + dynamic_cast(index); + auto index_flat = + dynamic_cast(index); + FAISS_THROW_IF_NOT_MSG ( + index_ivfpq || index_ivfflat || index_flat || index_ivfsq, + "IndexShards implemented only for " + "IndexIVFFlat, IndexIVFScalarQuantizer, " + "IndexFlat and IndexIVFPQ"); + + std::vector shards(n); + + for(long i = 0; i < n; i++) { + // make a shallow copy + if(reserveVecs) + sub_cloners[i].reserveVecs = + (reserveVecs + n - 1) / n; + + if (index_ivfpq) { + faiss::IndexIVFPQ idx2( + index_ivfpq->quantizer, index_ivfpq->d, + index_ivfpq->nlist, index_ivfpq->code_size, + index_ivfpq->pq.nbits); + idx2.metric_type = index_ivfpq->metric_type; + idx2.pq = index_ivfpq->pq; + idx2.nprobe = index_ivfpq->nprobe; + idx2.use_precomputed_table = 0; + idx2.is_trained = index->is_trained; + copy_ivf_shard (index_ivfpq, &idx2, n, i); + shards[i] = sub_cloners[i].clone_Index(&idx2); + } else if (index_ivfflat) { + faiss::IndexIVFFlat idx2( + index_ivfflat->quantizer, index->d, + index_ivfflat->nlist, index_ivfflat->metric_type); + idx2.nprobe = index_ivfflat->nprobe; + copy_ivf_shard (index_ivfflat, &idx2, n, i); + shards[i] = sub_cloners[i].clone_Index(&idx2); + } else if (index_ivfsq) { + faiss::IndexIVFScalarQuantizer idx2( + index_ivfsq->quantizer, index->d, index_ivfsq->nlist, + index_ivfsq->sq.qtype, + index_ivfsq->metric_type, + index_ivfsq->by_residual); + idx2.nprobe = index_ivfsq->nprobe; + copy_ivf_shard (index_ivfsq, &idx2, n, i); + shards[i] = sub_cloners[i].clone_Index(&idx2); + } else if (index_flat) { + faiss::IndexFlat idx2 ( + index->d, index->metric_type); + shards[i] = sub_cloners[i].clone_Index(&idx2); + 
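+            // a flat index has no inverted lists to split, so each shard
+            // simply receives a contiguous slice of the stored vectors
+            // (added below)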
if (index->ntotal > 0) { + long i0 = index->ntotal * i / n; + long i1 = index->ntotal * (i + 1) / n; + shards[i]->add (i1 - i0, + index_flat->xb.data() + i0 * index->d); + } + } + } + + bool successive_ids = index_flat != nullptr; + faiss::IndexShards *res = + new faiss::IndexShards(index->d, true, + successive_ids); + + for (int i = 0; i < n; i++) { + res->add_shard(shards[i]); + } + res->own_fields = true; + FAISS_ASSERT(index->ntotal == res->ntotal); + return res; +} + +Index *ToGpuClonerMultiple::clone_Index(const Index *index) +{ + long n = sub_cloners.size(); + if (n == 1) + return sub_cloners[0].clone_Index(index); + + if(dynamic_cast(index) || + dynamic_cast(index) || + dynamic_cast(index) || + dynamic_cast(index)) { + if(!shard) { + IndexReplicas * res = new IndexReplicas(); + for(auto & sub_cloner: sub_cloners) { + res->addIndex(sub_cloner.clone_Index(index)); + } + res->own_fields = true; + return res; + } else { + return clone_Index_to_shards (index); + } + } else if(auto miq = dynamic_cast(index)) { + if (verbose) { + printf("cloning MultiIndexQuantizer: " + "will be valid only for search k=1\n"); + } + const ProductQuantizer & pq = miq->pq; + IndexSplitVectors *splitv = new IndexSplitVectors(pq.d, true); + splitv->own_fields = true; + + for (int m = 0; m < pq.M; m++) { + // which GPU(s) will be assigned to this sub-quantizer + + long i0 = m * n / pq.M; + long i1 = pq.M <= n ? (m + 1) * n / pq.M : i0 + 1; + std::vector sub_cloners_2; + sub_cloners_2.insert( + sub_cloners_2.begin(), sub_cloners.begin() + i0, + sub_cloners.begin() + i1); + ToGpuClonerMultiple cm(sub_cloners_2, *this); + IndexFlatL2 idxc (pq.dsub); + idxc.add (pq.ksub, pq.centroids.data() + m * pq.d * pq.ksub); + Index *idx2 = cm.clone_Index(&idxc); + splitv->add_sub_index(idx2); + } + return splitv; + } else { + return Cloner::clone_Index(index); + } +} + + + +faiss::Index * index_cpu_to_gpu_multiple( + std::vector & resources, + std::vector &devices, + const faiss::Index *index, + const GpuMultipleClonerOptions *options) +{ + GpuMultipleClonerOptions defaults; + ToGpuClonerMultiple cl(resources, devices, options ? *options : defaults); + return cl.clone_Index(index); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h new file mode 100644 index 0000000000..5c687cee20 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h @@ -0,0 +1,88 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + + +/// Cloner specialized for GPU -> CPU +struct ToCPUCloner: faiss::Cloner { + void merge_index(Index *dst, Index *src, bool successive_ids); + Index *clone_Index(const Index *index) override; +}; + + +/// Cloner specialized for CPU -> 1 GPU +struct ToGpuCloner: faiss::Cloner, GpuClonerOptions { + GpuResources *resources; + int device; + + ToGpuCloner(GpuResources *resources, int device, + const GpuClonerOptions &options); + + Index *clone_Index(const Index *index) override; + + Index *clone_Index (IndexComposition* index_composition) override; +}; + +/// Cloner specialized for CPU -> multiple GPUs +struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { + std::vector sub_cloners; + + ToGpuClonerMultiple(std::vector & resources, + std::vector& devices, + const GpuMultipleClonerOptions &options); + + ToGpuClonerMultiple(const std::vector & sub_cloners, + const GpuMultipleClonerOptions &options); + + void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2, + long n, long i); + + Index * clone_Index_to_shards (const Index *index); + + /// main function + Index *clone_Index(const Index *index) override; +}; + + + + +/// converts any GPU index inside gpu_index to a CPU index +faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index); + +/// converts any CPU index that can be converted to GPU +faiss::Index * index_cpu_to_gpu( + GpuResources* resources, int device, + const faiss::Index *index, + const GpuClonerOptions *options = nullptr); + +faiss::Index * index_cpu_to_gpu( + GpuResources* resources, int device, + IndexComposition* index_composition, + const GpuClonerOptions *options = nullptr); + +faiss::Index * index_cpu_to_gpu_multiple( + std::vector & resources, + std::vector &devices, + const faiss::Index *index, + const GpuMultipleClonerOptions *options = nullptr); + + + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp new file mode 100644 index 0000000000..a6abee6f3a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp @@ -0,0 +1,30 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +GpuClonerOptions::GpuClonerOptions() + : indicesOptions(INDICES_64_BIT), + useFloat16CoarseQuantizer(false), + useFloat16(false), + usePrecomputed(true), + reserveVecs(0), + storeTransposed(false), + storeInCpu(false), + allInGpu(false), + verbose(false) { +} + +GpuMultipleClonerOptions::GpuMultipleClonerOptions() + : shard(false), + shard_type(1) +{ +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.h b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.h new file mode 100644 index 0000000000..b56a33d8d7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace faiss { namespace gpu { + +/// set some options on how to copy to GPU +struct GpuClonerOptions { + GpuClonerOptions(); + + /// how should indices be stored on index types that support indices + /// (anything but GpuIndexFlat*)? + IndicesOptions indicesOptions; + + /// is the coarse quantizer in float16? + bool useFloat16CoarseQuantizer; + + /// for GpuIndexIVFFlat, is storage in float16? + /// for GpuIndexIVFPQ, are intermediate calculations in float16? + bool useFloat16; + + /// use precomputed tables? + bool usePrecomputed; + + /// reserve vectors in the invfiles? + long reserveVecs; + + /// For GpuIndexFlat, store data in transposed layout? + bool storeTransposed; + + bool storeInCpu; + + /// For IndexIVFScalarQuantizer + bool allInGpu; + + /// Set verbose options on the index + bool verbose; +}; + +struct GpuMultipleClonerOptions : public GpuClonerOptions { + GpuMultipleClonerOptions (); + + /// Whether to shard the index across GPUs, versus replication + /// across GPUs + bool shard; + + /// IndexIVF::copy_subset_to subset type + int shard_type; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu new file mode 100644 index 0000000000..6d7e67b89b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu @@ -0,0 +1,108 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +void bruteForceKnn(GpuResources* resources, + faiss::MetricType metric, + // A region of memory size numVectors x dims, with dims + // innermost + const float* vectors, + bool vectorsRowMajor, + int numVectors, + // A region of memory size numQueries x dims, with dims + // innermost + const float* queries, + bool queriesRowMajor, + int numQueries, + int dims, + int k, + // A region of memory size numQueries x k, with k + // innermost + float* outDistances, + // A region of memory size numQueries x k, with k + // innermost + faiss::Index::idx_t* outIndices) { + auto device = getCurrentDevice(); + auto stream = resources->getDefaultStreamCurrentDevice(); + auto& mem = resources->getMemoryManagerCurrentDevice(); + + auto tVectors = toDevice(resources, + device, + const_cast(vectors), + stream, + {vectorsRowMajor ? numVectors : dims, + vectorsRowMajor ? dims : numVectors}); + auto tQueries = toDevice(resources, + device, + const_cast(queries), + stream, + {queriesRowMajor ? numQueries : dims, + queriesRowMajor ? 
dims : numQueries}); + + auto tOutDistances = toDevice(resources, + device, + outDistances, + stream, + {numQueries, k}); + + // FlatIndex only supports an interface returning int indices, allocate + // temporary memory for it + DeviceTensor tOutIntIndices(mem, {numQueries, k}, stream); + + // Do the work + if (metric == faiss::MetricType::METRIC_L2) { + runL2Distance(resources, + tVectors, + vectorsRowMajor, + nullptr, // compute norms in temp memory + tQueries, + queriesRowMajor, + k, + tOutDistances, + tOutIntIndices); + } else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) { + runIPDistance(resources, + tVectors, + vectorsRowMajor, + tQueries, + queriesRowMajor, + k, + tOutDistances, + tOutIntIndices); + } else { + FAISS_THROW_MSG("metric should be METRIC_L2 or METRIC_INNER_PRODUCT"); + } + + // Convert and copy int indices out + auto tOutIndices = toDevice(resources, + device, + outIndices, + stream, + {numQueries, k}); + + // Convert int to idx_t + convertTensor(stream, + tOutIntIndices, + tOutIndices); + + // Copy back if necessary + fromDevice(tOutDistances, outDistances, stream); + fromDevice(tOutIndices, outIndices, stream); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.h b/core/src/index/thirdparty/faiss/gpu/GpuDistance.h new file mode 100644 index 0000000000..5002a91407 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +/// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest +/// neighbor searches on an externally-provided region of memory (e.g., from a +/// pytorch tensor). +/// The data (vectors, queries, outDistances, outIndices) can be resident on the +/// GPU or the CPU, but all calculations are performed on the GPU. If the result +/// buffers are on the CPU, results will be copied back when done. +/// +/// All GPU computation is performed on the current CUDA device, and ordered +/// with respect to resources->getDefaultStreamCurrentDevice(). +/// +/// For each vector in `queries`, searches all of `vectors` to find its k +/// nearest neighbors with respect to the given metric +void bruteForceKnn(GpuResources* resources, + faiss::MetricType metric, + // If vectorsRowMajor is true, this is + // numVectors x dims, with dims innermost; otherwise, + // dims x numVectors, with numVectors innermost + const float* vectors, + bool vectorsRowMajor, + int numVectors, + // If queriesRowMajor is true, this is + // numQueries x dims, with dims innermost; otherwise, + // dims x numQueries, with numQueries innermost + const float* queries, + bool queriesRowMajor, + int numQueries, + int dims, + int k, + // A region of memory size numQueries x k, with k + // innermost (row major) + float* outDistances, + // A region of memory size numQueries x k, with k + // innermost (row major) + faiss::Index::idx_t* outIndices); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuFaissAssert.h b/core/src/index/thirdparty/faiss/gpu/GpuFaissAssert.h new file mode 100644 index 0000000000..1931b916cc --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuFaissAssert.h @@ -0,0 +1,29 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#ifndef GPU_FAISS_ASSERT_INCLUDED +#define GPU_FAISS_ASSERT_INCLUDED + +#include +#include + +/// +/// Assertions +/// + +#ifdef __CUDA_ARCH__ +#define GPU_FAISS_ASSERT(X) assert(X) +#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X) +#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X) +#else +#define GPU_FAISS_ASSERT(X) FAISS_ASSERT(X) +#define GPU_FAISS_ASSERT_MSG(X, MSG) FAISS_ASSERT_MSG(X, MSG) +#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) FAISS_ASSERT_FMT(X, FMT, __VA_ARGS) +#endif // __CUDA_ARCH__ + +#endif diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu new file mode 100644 index 0000000000..0f8891fa99 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu @@ -0,0 +1,461 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Default CPU search size for which we use paged copies +constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024; + +/// Size above which we page copies from the CPU to GPU (non-paged +/// memory usage) +constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024; + +// Default size for which we page add or search +constexpr size_t kAddPageSize = (size_t) 256 * 1024 * 1024; + +// Or, maximum number of vectors to consider per page of add or search +constexpr size_t kAddVecSize = (size_t) 512 * 1024; + +// Use a smaller search size, as precomputed code usage on IVFPQ +// requires substantial amounts of memory +// FIXME: parameterize based on algorithm need +constexpr size_t kSearchVecSize = (size_t) 32 * 1024; + +GpuIndex::GpuIndex(GpuResources* resources, + int dims, + faiss::MetricType metric, + GpuIndexConfig config) : + Index(dims, metric), + resources_(resources), + device_(config.device), + memorySpace_(config.memorySpace), + minPagedSize_(kMinPageSize) { + FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(), + "Invalid GPU device %d", device_); + + FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions"); + +#ifdef FAISS_UNIFIED_MEM + FAISS_THROW_IF_NOT_FMT( + memorySpace_ == MemorySpace::Device || + (memorySpace_ == MemorySpace::Unified && + getFullUnifiedMemSupport(device_)), + "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)", + config.device); +#else + FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device, + "Must compile with CUDA 8+ for Unified Memory support"); +#endif + + FAISS_THROW_IF_NOT_MSG(isMetricSupported(metric), + "Unsupported metric type on GPU"); + + FAISS_ASSERT(resources_); + resources_->initializeForDevice(device_); +} + +void +GpuIndex::setMinPagingSize(size_t size) { + minPagedSize_ = size; +} + +size_t +GpuIndex::getMinPagingSize() const { + return minPagedSize_; +} + +void +GpuIndex::add(Index::idx_t n, const float* x) { + // Pass to add_with_ids + add_with_ids(n, x, nullptr); +} + +void +GpuIndex::add_with_ids(Index::idx_t n, + const float* x, + const Index::idx_t* ids) { + FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); + + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %d indices", + 
std::numeric_limits::max()); + + if (n == 0) { + // nothing to add + return; + } + + std::vector generatedIds; + + // Generate IDs if we need them + if (!ids && addImplRequiresIDs_()) { + generatedIds = std::vector(n); + + for (Index::idx_t i = 0; i < n; ++i) { + generatedIds[i] = this->ntotal + i; + } + } + + DeviceScope scope(device_); + addPaged_((int) n, x, ids ? ids : generatedIds.data()); +} + +void +GpuIndex::addPaged_(int n, + const float* x, + const Index::idx_t* ids) { + if (n > 0) { + size_t totalSize = (size_t) n * this->d * sizeof(float); + + if (totalSize > kAddPageSize || n > kAddVecSize) { + // How many vectors fit into kAddPageSize? + size_t maxNumVecsForPageSize = + kAddPageSize / ((size_t) this->d * sizeof(float)); + + // Always add at least 1 vector, if we have huge vectors + maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1); + + size_t tileSize = std::min((size_t) n, maxNumVecsForPageSize); + tileSize = std::min(tileSize, kSearchVecSize); + + for (size_t i = 0; i < (size_t) n; i += tileSize) { + size_t curNum = std::min(tileSize, n - i); + + addPage_(curNum, + x + i * (size_t) this->d, + ids ? ids + i : nullptr); + } + } else { + addPage_(n, x, ids); + } + } +} + +void +GpuIndex::addPage_(int n, + const float* x, + const Index::idx_t* ids) { + // At this point, `x` can be resident on CPU or GPU, and `ids` may be resident + // on CPU, GPU or may be null. + // + // Before continuing, we guarantee that all data will be resident on the GPU. + auto stream = resources_->getDefaultStreamCurrentDevice(); + + auto vecs = toDevice(resources_, + device_, + const_cast(x), + stream, + {n, this->d}); + + if (ids) { + auto indices = toDevice(resources_, + device_, + const_cast(ids), + stream, + {n}); + + addImpl_(n, vecs.data(), ids ? indices.data() : nullptr); + } else { + addImpl_(n, vecs.data(), nullptr); + } +} + +void +GpuIndex::search(Index::idx_t n, + const float* x, + Index::idx_t k, + float* distances, + Index::idx_t* labels) const { + FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); + + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + // Maximum k-selection supported is based on the CUDA SDK + FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(), + "GPU index only supports k <= %d (requested %d)", + getMaxKSelection(), + (int) k); // select limitation + + if (n == 0 || k == 0) { + // nothing to search + return; + } + + DeviceScope scope(device_); + auto stream = resources_->getDefaultStream(device_); + + // We guarantee that the searchImpl_ will be called with device-resident + // pointers. + + // The input vectors may be too large for the GPU, but we still + // assume that the output distances and labels are not. + // Go ahead and make space for output distances and labels on the + // GPU. + // If we reach a point where all inputs are too big, we can add + // another level of tiling. + auto outDistances = + toDevice(resources_, device_, distances, stream, + {(int) n, (int) k}); + + auto outLabels = + toDevice(resources_, device_, labels, stream, + {(int) n, (int) k}); + + bool usePaged = false; + + if (getDeviceForAddress(x) == -1) { + // It is possible that the user is querying for a vector set size + // `x` that won't fit on the GPU. + // In this case, we will have to handle paging of the data from CPU + // -> GPU. 
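+    // Paging kicks in once the query data is at least minPagedSize_ bytes:
+    // searchFromCpuPaged_ then copies the queries to the GPU in chunks,
+    // overlapping the copies with kernel execution when pinned memory is
+    // available (see the triple-buffering scheme described further down).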
+ // Currently, we don't handle the case where the output data won't + // fit on the GPU (e.g., n * k is too large for the GPU memory). + size_t dataSize = (size_t) n * this->d * sizeof(float); + + if (dataSize >= minPagedSize_) { + searchFromCpuPaged_(n, x, k, + outDistances.data(), + outLabels.data()); + usePaged = true; + } + } + + if (!usePaged) { + searchNonPaged_(n, x, k, + outDistances.data(), + outLabels.data()); + } + + // Copy back if necessary + fromDevice(outDistances, distances, stream); + fromDevice(outLabels, labels, stream); +} + +void +GpuIndex::searchNonPaged_(int n, + const float* x, + int k, + float* outDistancesData, + Index::idx_t* outIndicesData) const { + auto stream = resources_->getDefaultStream(device_); + + // Make sure arguments are on the device we desire; use temporary + // memory allocations to move it if necessary + auto vecs = toDevice(resources_, + device_, + const_cast(x), + stream, + {n, (int) this->d}); + + searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData); +} + +void +GpuIndex::searchFromCpuPaged_(int n, + const float* x, + int k, + float* outDistancesData, + Index::idx_t* outIndicesData) const { + Tensor outDistances(outDistancesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); + + // Is pinned memory available? + auto pinnedAlloc = resources_->getPinnedMemory(); + int pageSizeInVecs = + (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d)); + + if (!pinnedAlloc.first || pageSizeInVecs < 1) { + // Just page without overlapping copy with compute + int batchSize = utils::nextHighestPowerOf2( + (int) ((size_t) kNonPinnedPageSize / + (sizeof(float) * this->d))); + + for (int cur = 0; cur < n; cur += batchSize) { + int num = std::min(batchSize, n - cur); + + auto outDistancesSlice = outDistances.narrowOutermost(cur, num); + auto outIndicesSlice = outIndices.narrowOutermost(cur, num); + + searchNonPaged_(num, + x + (size_t) cur * this->d, + k, + outDistancesSlice.data(), + outIndicesSlice.data()); + } + + return; + } + + // + // Pinned memory is available, so we can overlap copy with compute. + // We use two pinned memory buffers, and triple-buffer the + // procedure: + // + // 1 CPU copy -> pinned + // 2 pinned copy -> GPU + // 3 GPU compute + // + // 1 2 3 1 2 3 ... (pinned buf A) + // 1 2 3 1 2 ... (pinned buf B) + // 1 2 3 1 ... 
(pinned buf A) + // time -> + // + auto defaultStream = resources_->getDefaultStream(device_); + auto copyStream = resources_->getAsyncCopyStream(device_); + + FAISS_ASSERT((size_t) pageSizeInVecs * this->d <= + (size_t) std::numeric_limits::max()); + + float* bufPinnedA = (float*) pinnedAlloc.first; + float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d; + float* bufPinned[2] = {bufPinnedA, bufPinnedB}; + + // Reserve space on the GPU for the destination of the pinned buffer + // copy + DeviceTensor bufGpuA( + resources_->getMemoryManagerCurrentDevice(), + {(int) pageSizeInVecs, (int) this->d}, + defaultStream); + DeviceTensor bufGpuB( + resources_->getMemoryManagerCurrentDevice(), + {(int) pageSizeInVecs, (int) this->d}, + defaultStream); + DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; + + // Copy completion events for the pinned buffers + std::unique_ptr eventPinnedCopyDone[2]; + + // Execute completion events for the GPU buffers + std::unique_ptr eventGpuExecuteDone[2]; + + // All offsets are in terms of number of vectors; they remain within + // int bounds (as this function only handles max in vectors) + + // Current start offset for buffer 1 + int cur1 = 0; + int cur1BufIndex = 0; + + // Current start offset for buffer 2 + int cur2 = -1; + int cur2BufIndex = 0; + + // Current start offset for buffer 3 + int cur3 = -1; + int cur3BufIndex = 0; + + while (cur3 < n) { + // Start async pinned -> GPU copy first (buf 2) + if (cur2 != -1 && cur2 < n) { + // Copy pinned to GPU + int numToCopy = std::min(pageSizeInVecs, n - cur2); + + // Make sure any previous execution has completed before continuing + auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; + if (eventPrev.get()) { + eventPrev->streamWaitOnEvent(copyStream); + } + + CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(), + bufPinned[cur2BufIndex], + (size_t) numToCopy * this->d * sizeof(float), + cudaMemcpyHostToDevice, + copyStream)); + + // Mark a completion event in this stream + eventPinnedCopyDone[cur2BufIndex] = + std::move(std::unique_ptr(new CudaEvent(copyStream))); + + // We pick up from here + cur3 = cur2; + cur2 += numToCopy; + cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0; + } + + if (cur3 != -1 && cur3 < n) { + // Process on GPU + int numToProcess = std::min(pageSizeInVecs, n - cur3); + + // Make sure the previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; + FAISS_ASSERT(eventPrev.get()); + + eventPrev->streamWaitOnEvent(defaultStream); + + // Create tensor wrappers + // DeviceTensor input(bufGpus[cur3BufIndex]->data(), + // {numToProcess, this->d}); + auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess); + auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess); + + searchImpl_(numToProcess, + bufGpus[cur3BufIndex]->data(), + k, + outDistancesSlice.data(), + outIndicesSlice.data()); + + // Create completion event + eventGpuExecuteDone[cur3BufIndex] = + std::move(std::unique_ptr(new CudaEvent(defaultStream))); + + // We pick up from here + cur3BufIndex = (cur3BufIndex == 0) ? 
1 : 0; + cur3 += numToProcess; + } + + if (cur1 < n) { + // Copy CPU mem to CPU pinned + int numToCopy = std::min(pageSizeInVecs, n - cur1); + + // Make sure any previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; + if (eventPrev.get()) { + eventPrev->cpuWaitOnEvent(); + } + + memcpy(bufPinned[cur1BufIndex], + x + (size_t) cur1 * this->d, + (size_t) numToCopy * this->d * sizeof(float)); + + // We pick up from here + cur2 = cur1; + cur1 += numToCopy; + cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0; + } + } +} + +void +GpuIndex::compute_residual(const float* x, + float* residual, + Index::idx_t key) const { + FAISS_THROW_MSG("compute_residual not implemented for this type of index"); +} + +void +GpuIndex::compute_residual_n(Index::idx_t n, + const float* xs, + float* residuals, + const Index::idx_t* keys) const { + FAISS_THROW_MSG("compute_residual_n not implemented for this type of index"); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h new file mode 100644 index 0000000000..d029c44a2d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h @@ -0,0 +1,148 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +struct GpuIndexConfig { + inline GpuIndexConfig() + : device(0), + memorySpace(MemorySpace::Device) { + } + + /// GPU device on which the index is resident + int device; + + /// What memory space to use for primary storage. + /// On Pascal and above (CC 6+) architectures, allows GPUs to use + /// more memory than is available on the GPU. 
+ MemorySpace memorySpace; +}; + +class GpuIndex : public faiss::Index { + public: + GpuIndex(GpuResources* resources, + int dims, + faiss::MetricType metric, + GpuIndexConfig config); + + inline int getDevice() const { + return device_; + } + + inline GpuResources* getResources() { + return resources_; + } + + /// Set the minimum data size for searches (in MiB) for which we use + /// CPU -> GPU paging + void setMinPagingSize(size_t size); + + /// Returns the current minimum data size for paged searches + size_t getMinPagingSize() const; + + /// `x` can be resident on the CPU or any GPU; copies are performed + /// as needed + /// Handles paged adds if the add set is too large; calls addInternal_ + void add(faiss::Index::idx_t, const float* x) override; + + /// `x` and `ids` can be resident on the CPU or any GPU; copies are + /// performed as needed + /// Handles paged adds if the add set is too large; calls addInternal_ + void add_with_ids(Index::idx_t n, + const float* x, + const Index::idx_t* ids) override; + + /// `x`, `distances` and `labels` can be resident on the CPU or any + /// GPU; copies are performed as needed + void search(Index::idx_t n, + const float* x, + Index::idx_t k, + float* distances, + Index::idx_t* labels) const override; + + /// Overridden to force GPU indices to provide their own GPU-friendly + /// implementation + void compute_residual(const float* x, + float* residual, + Index::idx_t key) const override; + + /// Overridden to force GPU indices to provide their own GPU-friendly + /// implementation + void compute_residual_n(Index::idx_t n, + const float* xs, + float* residuals, + const Index::idx_t* keys) const override; + + protected: + /// Does addImpl_ require IDs? If so, and no IDs are provided, we will + /// generate them sequentially based on the order in which the IDs are added + virtual bool addImplRequiresIDs_() const = 0; + + /// Overridden to actually perform the add + /// All data is guaranteed to be resident on our device + virtual void addImpl_(int n, + const float* x, + const Index::idx_t* ids) = 0; + + /// Overridden to actually perform the search + /// All data is guaranteed to be resident on our device + virtual void searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const = 0; + +private: + /// Handles paged adds if the add set is too large, passes to + /// addImpl_ to actually perform the add for the current page + void addPaged_(int n, + const float* x, + const Index::idx_t* ids); + + /// Calls addImpl_ for a single page of GPU-resident data + void addPage_(int n, + const float* x, + const Index::idx_t* ids); + + /// Calls searchImpl_ for a single page of GPU-resident data + void searchNonPaged_(int n, + const float* x, + int k, + float* outDistancesData, + Index::idx_t* outIndicesData) const; + + /// Calls searchImpl_ for a single page of GPU-resident data, + /// handling paging of the data and copies from the CPU + void searchFromCpuPaged_(int n, + const float* x, + int k, + float* outDistancesData, + Index::idx_t* outIndicesData) const; + + protected: + /// Manages streams, cuBLAS handles and scratch memory for devices + GpuResources* resources_; + + /// The GPU device we are resident on + const int device_; + + /// The memory space of our primary storage on the GPU + const MemorySpace memorySpace_; + + /// Size above which we page copies from the CPU to GPU + size_t minPagedSize_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.cu 
b/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.cu new file mode 100644 index 0000000000..9d7e18c727 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.cu @@ -0,0 +1,289 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Default CPU search size for which we use paged copies +constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024; + +GpuIndexBinaryFlat::GpuIndexBinaryFlat(GpuResources* resources, + const faiss::IndexBinaryFlat* index, + GpuIndexBinaryFlatConfig config) + : IndexBinary(index->d), + resources_(resources), + config_(std::move(config)), + data_(nullptr) { + FAISS_THROW_IF_NOT_FMT(this->d % 8 == 0, + "vector dimension (number of bits) " + "must be divisible by 8 (passed %d)", + this->d); + + // Flat index doesn't need training + this->is_trained = true; + + copyFrom(index); +} + + +GpuIndexBinaryFlat::GpuIndexBinaryFlat(GpuResources* resources, + int dims, + GpuIndexBinaryFlatConfig config) + : IndexBinary(dims), + resources_(resources), + config_(std::move(config)), + data_(nullptr) { + FAISS_THROW_IF_NOT_FMT(this->d % 8 == 0, + "vector dimension (number of bits) " + "must be divisible by 8 (passed %d)", + this->d); + + // Flat index doesn't need training + this->is_trained = true; + + // Construct index + DeviceScope scope(config_.device); + data_ = new BinaryFlatIndex(resources, + this->d, + config_.memorySpace); +} + +GpuIndexBinaryFlat::~GpuIndexBinaryFlat() { + delete data_; +} + +void +GpuIndexBinaryFlat::copyFrom(const faiss::IndexBinaryFlat* index) { + DeviceScope scope(config_.device); + + this->d = index->d; + + // GPU code has 32 bit indices + FAISS_THROW_IF_NOT_FMT(index->ntotal <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices; " + "attempting to copy CPU index with %zu parameters", + (size_t) std::numeric_limits::max(), + (size_t) index->ntotal); + this->ntotal = index->ntotal; + + delete data_; + data_ = new BinaryFlatIndex(resources_, + this->d, + config_.memorySpace); + + // The index could be empty + if (index->ntotal > 0) { + data_->add(index->xb.data(), + index->ntotal, + resources_->getDefaultStream(config_.device)); + } +} + +void +GpuIndexBinaryFlat::copyTo(faiss::IndexBinaryFlat* index) const { + DeviceScope scope(config_.device); + + index->d = this->d; + index->ntotal = this->ntotal; + + FAISS_ASSERT(data_); + FAISS_ASSERT(data_->getSize() == this->ntotal); + index->xb.resize(this->ntotal * (this->d / 8)); + + if (this->ntotal > 0) { + fromDevice(data_->getVectorsRef(), + index->xb.data(), + resources_->getDefaultStream(config_.device)); + } +} + +void +GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, + const uint8_t* x) { + DeviceScope scope(config_.device); + + // To avoid multiple re-allocations, ensure we have enough storage + // available + data_->reserve(n, resources_->getDefaultStream(config_.device)); + + // Due to GPU indexing in int32, we can't store more than this + // number of vectors on a GPU + FAISS_THROW_IF_NOT_FMT(this->ntotal + n <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices", + (size_t) std::numeric_limits::max()); + + data_->add((const unsigned char*) x, + n, + resources_->getDefaultStream(config_.device)); + this->ntotal += n; +} + +void 
+GpuIndexBinaryFlat::reset() { + DeviceScope scope(config_.device); + + // Free the underlying memory + data_->reset(); + this->ntotal = 0; +} + +void +GpuIndexBinaryFlat::search(faiss::IndexBinary::idx_t n, + const uint8_t* x, + faiss::IndexBinary::idx_t k, + int32_t* distances, + faiss::IndexBinary::idx_t* labels) const { + if (n == 0) { + return; + } + + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices", + (size_t) std::numeric_limits::max()); + FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(), + "GPU only supports k <= %d (requested %d)", + getMaxKSelection(), + (int) k); // select limitation + + DeviceScope scope(config_.device); + auto stream = resources_->getDefaultStream(config_.device); + + // The input vectors may be too large for the GPU, but we still + // assume that the output distances and labels are not. + // Go ahead and make space for output distances and labels on the + // GPU. + // If we reach a point where all inputs are too big, we can add + // another level of tiling. + auto outDistances = toDevice(resources_, + config_.device, + distances, + stream, + {(int) n, (int) k}); + + // FlatIndex only supports an interface returning int indices + DeviceTensor outIntIndices( + resources_->getMemoryManagerCurrentDevice(), + {(int) n, (int) k}, stream); + + bool usePaged = false; + + if (getDeviceForAddress(x) == -1) { + // It is possible that the user is querying for a vector set size + // `x` that won't fit on the GPU. + // In this case, we will have to handle paging of the data from CPU + // -> GPU. + // Currently, we don't handle the case where the output data won't + // fit on the GPU (e.g., n * k is too large for the GPU memory). 
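// Illustrative sketch of the input layout this binary index expects: each
// vector is d bits packed into d / 8 bytes of uint8_t (hence the d % 8 == 0
// check in the constructors above), and search reports Hamming distances as
// int32_t. The bit order below is one possible packing chosen for the
// example; real data should follow the CPU IndexBinaryFlat convention.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::uint8_t> packBits(const std::vector<bool>& bits) {
  std::vector<std::uint8_t> packed(bits.size() / 8, 0);
  for (std::size_t i = 0; i < bits.size(); ++i) {
    if (bits[i]) {
      packed[i / 8] |= static_cast<std::uint8_t>(1u << (i % 8));
    }
  }
  return packed;
}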
+ size_t dataSize = (size_t) n * (this->d / 8) * sizeof(uint8_t); + + if (dataSize >= kMinPageSize) { + searchFromCpuPaged_(n, x, k, + outDistances.data(), + outIntIndices.data()); + usePaged = true; + } + } + + if (!usePaged) { + searchNonPaged_(n, x, k, + outDistances.data(), + outIntIndices.data()); + } + + // Convert and copy int indices out + auto outIndices = toDevice(resources_, + config_.device, + labels, + stream, + {(int) n, (int) k}); + + // Convert int to long + convertTensor(stream, + outIntIndices, + outIndices); + + // Copy back if necessary + fromDevice(outDistances, distances, stream); + fromDevice(outIndices, labels, stream); +} + +void +GpuIndexBinaryFlat::searchNonPaged_(int n, + const uint8_t* x, + int k, + int32_t* outDistancesData, + int* outIndicesData) const { + Tensor outDistances(outDistancesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); + + auto stream = resources_->getDefaultStream(config_.device); + + // Make sure arguments are on the device we desire; use temporary + // memory allocations to move it if necessary + auto vecs = toDevice(resources_, + config_.device, + const_cast(x), + stream, + {n, (int) (this->d / 8)}); + + data_->query(vecs, k, outDistances, outIndices); +} + +void +GpuIndexBinaryFlat::searchFromCpuPaged_(int n, + const uint8_t* x, + int k, + int32_t* outDistancesData, + int* outIndicesData) const { + Tensor outDistances(outDistancesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); + + auto vectorSize = sizeof(uint8_t) * (this->d / 8); + + // Just page without overlapping copy with compute (as GpuIndexFlat does) + int batchSize = utils::nextHighestPowerOf2( + (int) ((size_t) kMinPageSize / vectorSize)); + + for (int cur = 0; cur < n; cur += batchSize) { + int num = std::min(batchSize, n - cur); + + auto outDistancesSlice = outDistances.narrowOutermost(cur, num); + auto outIndicesSlice = outIndices.narrowOutermost(cur, num); + + searchNonPaged_(num, + x + (size_t) cur * (this->d / 8), + k, + outDistancesSlice.data(), + outIndicesSlice.data()); + } +} + +void +GpuIndexBinaryFlat::reconstruct(faiss::IndexBinary::idx_t key, + uint8_t* out) const { + DeviceScope scope(config_.device); + + FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds"); + auto stream = resources_->getDefaultStream(config_.device); + + auto& vecs = data_->getVectorsRef(); + auto vec = vecs[key]; + + fromDevice(vec.data(), out, vecs.getSize(1), stream); +} + +} } // namespace gpu diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.h new file mode 100644 index 0000000000..a4037896c4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexBinaryFlat.h @@ -0,0 +1,89 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace faiss { namespace gpu {
+
+class BinaryFlatIndex;
+class GpuResources;
+
+struct GpuIndexBinaryFlatConfig : public GpuIndexConfig {
+};
+
+/// A GPU version of IndexBinaryFlat for brute-force comparison of bit vectors
+/// via Hamming distance
+class GpuIndexBinaryFlat : public IndexBinary {
+ public:
+ /// Construct from a pre-existing faiss::IndexBinaryFlat instance, copying
+ /// data over to the given GPU
+ GpuIndexBinaryFlat(GpuResources* resources,
+ const faiss::IndexBinaryFlat* index,
+ GpuIndexBinaryFlatConfig config =
+ GpuIndexBinaryFlatConfig());
+
+ /// Construct an empty instance that can be added to
+ GpuIndexBinaryFlat(GpuResources* resources,
+ int dims,
+ GpuIndexBinaryFlatConfig config =
+ GpuIndexBinaryFlatConfig());
+
+ ~GpuIndexBinaryFlat() override;
+
+ /// Initialize ourselves from the given CPU index; will overwrite
+ /// all data in ourselves
+ void copyFrom(const faiss::IndexBinaryFlat* index);
+
+ /// Copy ourselves to the given CPU index; will overwrite all data
+ /// in the index instance
+ void copyTo(faiss::IndexBinaryFlat* index) const;
+
+ void add(faiss::IndexBinary::idx_t n,
+ const uint8_t* x) override;
+
+ void reset() override;
+
+ void search(faiss::IndexBinary::idx_t n,
+ const uint8_t* x,
+ faiss::IndexBinary::idx_t k,
+ int32_t* distances,
+ faiss::IndexBinary::idx_t* labels) const override;
+
+ void reconstruct(faiss::IndexBinary::idx_t key,
+ uint8_t* recons) const override;
+
+ protected:
+ /// Called from search when the input data is on the CPU;
+ /// potentially allows for pinned memory usage
+ void searchFromCpuPaged_(int n,
+ const uint8_t* x,
+ int k,
+ int32_t* outDistancesData,
+ int* outIndicesData) const;
+
+ void searchNonPaged_(int n,
+ const uint8_t* x,
+ int k,
+ int32_t* outDistancesData,
+ int* outIndicesData) const;
+
+ protected:
+ /// Manages streams, cuBLAS handles and scratch memory for devices
+ GpuResources* resources_;
+
+ /// Configuration options
+ GpuIndexBinaryFlatConfig config_;
+
+ /// Holds our GPU data containing the list of vectors; is managed via raw
+ /// pointer so as to allow non-CUDA compilers to see this header
+ BinaryFlatIndex* data_;
+};
+
+} } // namespace gpu
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu
new file mode 100644
index 0000000000..09c23363fc
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu
@@ -0,0 +1,399 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +GpuIndexFlat::GpuIndexFlat(GpuResources* resources, + const faiss::IndexFlat* index, + GpuIndexFlatConfig config) : + GpuIndex(resources, index->d, index->metric_type, config), + config_(std::move(config)), + data_(nullptr) { + verifySettings_(); + + // Flat index doesn't need training + this->is_trained = true; + + copyFrom(index); +} + +GpuIndexFlat::GpuIndexFlat(GpuResources* resources, + int dims, + faiss::MetricType metric, + GpuIndexFlatConfig config) : + GpuIndex(resources, dims, metric, config), + config_(std::move(config)), + data_(nullptr) { + verifySettings_(); + + // Flat index doesn't need training + this->is_trained = true; + + // Construct index + DeviceScope scope(device_); + data_ = new FlatIndex(resources, + dims, + metric == faiss::METRIC_L2, + config_.useFloat16, + config_.useFloat16Accumulator, + config_.storeTransposed, + memorySpace_); +} + +GpuIndexFlat::~GpuIndexFlat() { + delete data_; +} + +void +GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { + DeviceScope scope(device_); + + this->d = index->d; + this->metric_type = index->metric_type; + + // GPU code has 32 bit indices + FAISS_THROW_IF_NOT_FMT(index->ntotal <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices; " + "attempting to copy CPU index with %zu parameters", + (size_t) std::numeric_limits::max(), + (size_t) index->ntotal); + this->ntotal = index->ntotal; + + delete data_; + data_ = new FlatIndex(resources_, + this->d, + index->metric_type == faiss::METRIC_L2, + config_.useFloat16, + config_.useFloat16Accumulator, + config_.storeTransposed, + memorySpace_); + + // The index could be empty + if (index->ntotal > 0) { + data_->add(index->xb.data(), + index->ntotal, + resources_->getDefaultStream(device_)); + } + + xb_.clear(); + + if (config_.storeInCpu) { + xb_ = index->xb; + } +} + +void +GpuIndexFlat::copyTo(faiss::IndexFlat* index) const { + DeviceScope scope(device_); + + index->d = this->d; + index->ntotal = this->ntotal; + index->metric_type = this->metric_type; + + FAISS_ASSERT(data_); + FAISS_ASSERT(data_->getSize() == this->ntotal); + index->xb.resize(this->ntotal * this->d); + + auto stream = resources_->getDefaultStream(device_); + + if (this->ntotal > 0) { + if (config_.useFloat16) { + auto vecFloat32 = data_->getVectorsFloat32Copy(stream); + fromDevice(vecFloat32, index->xb.data(), stream); + } else { + fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream); + } + } +} + +size_t +GpuIndexFlat::getNumVecs() const { + return this->ntotal; +} + +void +GpuIndexFlat::reset() { + DeviceScope scope(device_); + + // Free the underlying memory + data_->reset(); + this->ntotal = 0; +} + +void +GpuIndexFlat::train(Index::idx_t n, const float* x) { + // nothing to do +} + +void +GpuIndexFlat::add(Index::idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); + + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + if (n == 0) { + // nothing to add + return; + } + + DeviceScope scope(device_); + + // To avoid multiple re-allocations, ensure we have enough storage + // available + data_->reserve(n, resources_->getDefaultStream(device_)); + + // If we're not operating in float16 mode, we don't need the input + // data 
to be resident on our device; we can add directly. + if (!config_.useFloat16) { + addImpl_(n, x, nullptr); + } else { + // Otherwise, perform the paging + GpuIndex::add(n, x); + } +} + +bool +GpuIndexFlat::addImplRequiresIDs_() const { + return false; +} + +void +GpuIndexFlat::addImpl_(int n, + const float* x, + const Index::idx_t* ids) { + FAISS_ASSERT(data_); + FAISS_ASSERT(n > 0); + + // We do not support add_with_ids + FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported"); + + // Due to GPU indexing in int32, we can't store more than this + // number of vectors on a GPU + FAISS_THROW_IF_NOT_FMT(this->ntotal + n <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices", + (size_t) std::numeric_limits::max()); + + data_->add(x, n, resources_->getDefaultStream(device_)); + this->ntotal += n; +} + +void +GpuIndexFlat::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + auto stream = resources_->getDefaultStream(device_); + + // Input and output data are already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + Tensor outLabels(labels, {n, k}); + + // FlatIndex only supports int indices + DeviceTensor outIntLabels( + resources_->getMemoryManagerCurrentDevice(), {n, k}, stream); + + data_->query(queries, k, outDistances, outIntLabels, true); + + // Convert int to idx_t + convertTensor(stream, + outIntLabels, + outLabels); +} + +void +GpuIndexFlat::reconstruct(faiss::Index::idx_t key, + float* out) const { + if(config_.storeInCpu && xb_.size() > 0) { + memcpy (out, &(this->xb_[key * this->d]), sizeof(*out) * this->d); + return; + } + + DeviceScope scope(device_); + + FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds"); + auto stream = resources_->getDefaultStream(device_); + + if (config_.useFloat16) { + // FIXME jhj: kernel for copy + auto vec = data_->getVectorsFloat32Copy(key, 1, stream); + fromDevice(vec.data(), out, this->d, stream); + } else { + auto vec = data_->getVectorsFloat32Ref()[key]; + fromDevice(vec.data(), out, this->d, stream); + } +} + +void +GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0, + faiss::Index::idx_t num, + float* out) const { + DeviceScope scope(device_); + + FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds"); + FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds"); + auto stream = resources_->getDefaultStream(device_); + + if (config_.useFloat16) { + // FIXME jhj: kernel for copy + auto vec = data_->getVectorsFloat32Copy(i0, num, stream); + fromDevice(vec.data(), out, num * this->d, stream); + } else { + auto vec = data_->getVectorsFloat32Ref()[i0]; + fromDevice(vec.data(), out, this->d * num, stream); + } +} + +void +GpuIndexFlat::compute_residual(const float* x, + float* residual, + faiss::Index::idx_t key) const { + compute_residual_n(1, x, residual, &key); +} + +void +GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n, + const float* xs, + float* residuals, + const faiss::Index::idx_t* keys) const { + FAISS_THROW_IF_NOT_FMT(n <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports up to %zu indices", + (size_t) std::numeric_limits::max()); + + auto stream = resources_->getDefaultStream(device_); + + DeviceScope scope(device_); + + auto vecsDevice = + toDevice(resources_, device_, + const_cast(xs), stream, + {(int) n, (int) this->d}); + auto idsDevice = + toDevice(resources_, device_, + const_cast(keys), + stream, + 
{(int) n}); + auto residualDevice = + toDevice(resources_, device_, residuals, stream, + {(int) n, (int) this->d}); + + // Convert idx_t to int + auto keysInt = + convertTensor(resources_, stream, idsDevice); + + FAISS_ASSERT(data_); + data_->computeResidual(vecsDevice, + keysInt, + residualDevice); + + fromDevice(residualDevice, residuals, stream); +} + +void +GpuIndexFlat::verifySettings_() const { + // If we want Hgemm, ensure that it is supported on this device + if (config_.useFloat16Accumulator) { + FAISS_THROW_IF_NOT_MSG(config_.useFloat16, + "useFloat16Accumulator can only be enabled " + "with useFloat16"); + + FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device), + "Device %d does not support Hgemm " + "(useFloat16Accumulator)", + config_.device); + } +} + +// +// GpuIndexFlatL2 +// + +GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config) : + GpuIndexFlat(resources, index, config) { +} + +GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources, + int dims, + GpuIndexFlatConfig config) : + GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) { +} + +void +GpuIndexFlatL2::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG(index->metric_type == metric_type, + "Cannot copy a GpuIndexFlatL2 from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void +GpuIndexFlatL2::copyTo(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG(index->metric_type == metric_type, + "Cannot copy a GpuIndexFlatL2 to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +// +// GpuIndexFlatIP +// + +GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config) : + GpuIndexFlat(resources, index, config) { +} + +GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources, + int dims, + GpuIndexFlatConfig config) : + GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) { +} + +void +GpuIndexFlatIP::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG(index->metric_type == metric_type, + "Cannot copy a GpuIndexFlatIP from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void +GpuIndexFlatIP::copyTo(faiss::IndexFlat* index) { + // The passed in index must be IP + FAISS_THROW_IF_NOT_MSG(index->metric_type == metric_type, + "Cannot copy a GpuIndexFlatIP to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h new file mode 100644 index 0000000000..ecda39dc6e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h @@ -0,0 +1,196 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { + +struct IndexFlat; +struct IndexFlatL2; +struct IndexFlatIP; + +} + +namespace faiss { namespace gpu { + +struct FlatIndex; + +struct GpuIndexFlatConfig : public GpuIndexConfig { + inline GpuIndexFlatConfig() + : useFloat16(false), + useFloat16Accumulator(false), + storeTransposed(false), + storeInCpu(false){ + } + + /// Whether or not data is stored as float16 + bool useFloat16; + + /// Whether or not all math is performed in float16, if useFloat16 is + /// specified. 
If true, we use cublasHgemm, supported only on CC + /// 5.3+. Otherwise, we use cublasSgemmEx. + bool useFloat16Accumulator; + + /// Whether or not data is stored (transparently) in a transposed + /// layout, enabling use of the NN GEMM call, which is ~10% faster. + /// This will improve the speed of the flat index, but will + /// substantially slow down any add() calls made, as all data must + /// be transposed, and will increase storage requirements (we store + /// data in both transposed and non-transposed layouts). + bool storeTransposed; + + bool storeInCpu; +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlat; copies over centroid data from a given +/// faiss::IndexFlat +class GpuIndexFlat : public GpuIndex { + public: + /// Construct from a pre-existing faiss::IndexFlat instance, copying + /// data over to the given GPU + GpuIndexFlat(GpuResources* resources, + const faiss::IndexFlat* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + GpuIndexFlat(GpuResources* resources, + int dims, + faiss::MetricType metric, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + ~GpuIndexFlat() override; + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index) const; + + /// Returns the number of vectors we contain + size_t getNumVecs() const; + + /// Clears all vectors from this index + void reset() override; + + /// This index is not trained, so this does nothing + void train(Index::idx_t n, const float* x) override; + + /// Overrides to avoid excessive copies + void add(faiss::Index::idx_t, const float* x) override; + + /// Reconstruction methods; prefer the batch reconstruct as it will + /// be more efficient + void reconstruct(faiss::Index::idx_t key, float* out) const override; + + /// Batch reconstruction method + void reconstruct_n(faiss::Index::idx_t i0, + faiss::Index::idx_t num, + float* out) const override; + + /// Compute residual + void compute_residual(const float* x, + float* residual, + faiss::Index::idx_t key) const override; + + /// Compute residual (batch mode) + void compute_residual_n(faiss::Index::idx_t n, + const float* xs, + float* residuals, + const faiss::Index::idx_t* keys) const override; + + /// For internal access + inline FlatIndex* getGpuData() { return data_; } + + protected: + /// Flat index does not require IDs as there is no storage available for them + bool addImplRequiresIDs_() const override; + + /// Called from GpuIndex for add + void addImpl_(int n, + const float* x, + const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_(int n, + const float* x, + int k, + float* distances, + faiss::Index::idx_t* labels) const override; + + private: + /// Checks user settings for consistency + void verifySettings_() const; + + protected: + /// Our config object + const GpuIndexFlatConfig config_; + + /// Holds our GPU data containing the list of vectors; is managed via raw + /// pointer so as to allow non-CUDA compilers to see this header + FlatIndex* data_; + + std::vector xb_; +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatL2; copies over centroid data from a given +/// faiss::IndexFlat +class GpuIndexFlatL2 : public GpuIndexFlat { + public: + /// Construct from a 
pre-existing faiss::IndexFlatL2 instance, copying + /// data over to the given GPU + GpuIndexFlatL2(GpuResources* resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + GpuIndexFlatL2(GpuResources* resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatIP; copies over centroid data from a given +/// faiss::IndexFlat +class GpuIndexFlatIP : public GpuIndexFlat { + public: + /// Construct from a pre-existing faiss::IndexFlatIP instance, copying + /// data over to the given GPU + GpuIndexFlatIP(GpuResources* resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + GpuIndexFlatIP(GpuResources* resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu new file mode 100644 index 0000000000..3c2fcd83e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu @@ -0,0 +1,322 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +GpuIndexIVF::GpuIndexIVF(GpuResources* resources, + int dims, + faiss::MetricType metric, + int nlistIn, + GpuIndexIVFConfig config) : + GpuIndex(resources, dims, metric, config), + ivfConfig_(std::move(config)), + nlist(nlistIn), + nprobe(1), + quantizer(nullptr) { + init_(); +} + +void +GpuIndexIVF::init_() { + FAISS_ASSERT(nlist > 0); + + // Spherical by default if the metric is inner_product + if (this->metric_type == faiss::METRIC_INNER_PRODUCT) { + this->cp.spherical = true; + } + + // here we set a low # iterations because this is typically used + // for large clusterings + this->cp.niter = 10; + this->cp.verbose = this->verbose; + + if (!quantizer) { + // Construct an empty quantizer + GpuIndexFlatConfig config = ivfConfig_.flatConfig; + // FIXME: inherit our same device + config.device = device_; + + if (this->metric_type == faiss::METRIC_L2) { + quantizer = new GpuIndexFlatL2(resources_, this->d, config); + } else if (this->metric_type == faiss::METRIC_INNER_PRODUCT) { + quantizer = new GpuIndexFlatIP(resources_, this->d, config); + } else { + // unknown metric type + FAISS_THROW_IF_NOT_MSG(false, "unsupported metric type"); + } + } +} + +GpuIndexIVF::~GpuIndexIVF() { + if(remove_quantizer == 1) + delete quantizer; +} + +GpuIndexFlat* +GpuIndexIVF::getQuantizer() { + return quantizer; +} + +void +GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { + DeviceScope scope(device_); + + this->d = index->d; + this->metric_type = index->metric_type; + + FAISS_ASSERT(index->nlist > 0); + FAISS_THROW_IF_NOT_FMT(index->nlist <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports %zu inverted lists", + (size_t) std::numeric_limits::max()); + nlist = index->nlist; + + FAISS_THROW_IF_NOT_FMT(index->nprobe > 0 && + index->nprobe <= getMaxKSelection(), + "GPU index only supports nprobe <= %zu; passed %zu", + (size_t) getMaxKSelection(), + index->nprobe); + nprobe = index->nprobe; + + // The metric type may have changed as well, so we might have to + // change our quantizer + delete quantizer; + quantizer = nullptr; + + // Construct an empty quantizer + GpuIndexFlatConfig config = ivfConfig_.flatConfig; + // FIXME: inherit our same device + config.device = device_; + + if (index->metric_type == faiss::METRIC_L2) { + // FIXME: 2 different float16 options? + quantizer = new GpuIndexFlatL2(resources_, this->d, config); + } else if (index->metric_type == faiss::METRIC_INNER_PRODUCT) { + // FIXME: 2 different float16 options? + quantizer = new GpuIndexFlatIP(resources_, this->d, config); + } else { + // unknown metric type + FAISS_ASSERT(false); + } + + if (!index->is_trained) { + this->is_trained = false; + this->ntotal = 0; + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // ntotal can exceed max int, but the number of vectors per inverted + // list cannot exceed this. We check this in the subclasses. 
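// Illustrative sketch of the parameter validation performed before a CPU
// IndexIVF is copied to the GPU: nlist must fit 32-bit indexing and nprobe is
// capped by the GPU's k-selection limit, while per-list sizes (not ntotal,
// which may exceed int32) are checked later in the subclasses. Standalone
// stand-in code for the FAISS_THROW_IF_NOT_FMT calls above.
#include <cstdint>
#include <limits>
#include <stdexcept>

void validateIvfCopyParams(std::int64_t nlist, std::int64_t nprobe,
                           std::int64_t maxKSelection) {
  const std::int64_t kInt32Max = std::numeric_limits<std::int32_t>::max();
  if (nlist <= 0 || nlist > kInt32Max) {
    throw std::invalid_argument("nlist must be positive and fit 32-bit GPU indexing");
  }
  if (nprobe <= 0 || nprobe > maxKSelection) {
    throw std::invalid_argument("nprobe must be in (0, maxKSelection]");
  }
}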
+ this->ntotal = index->ntotal; + + // Since we're trained, the quantizer must have data + FAISS_ASSERT(index->quantizer->ntotal > 0); + + // Right now, we can only handle IndexFlat or derived classes + auto qFlat = dynamic_cast(index->quantizer); + FAISS_THROW_IF_NOT_MSG(qFlat, + "Only IndexFlat is supported for the coarse quantizer " + "for copying from an IndexIVF into a GpuIndexIVF"); + + quantizer->copyFrom(qFlat); +} + +void +GpuIndexIVF::copyFrom(faiss::IndexIVF* index, gpu::GpuIndexFlat *&qt, int64_t mode) { + DeviceScope scope(device_); + + this->d = index->d; + this->metric_type = index->metric_type; + + FAISS_ASSERT(index->nlist > 0); + FAISS_THROW_IF_NOT_FMT(index->nlist <= + (faiss::Index::idx_t) std::numeric_limits::max(), + "GPU index only supports %zu inverted lists", + (size_t) std::numeric_limits::max()); + nlist = index->nlist; + + FAISS_THROW_IF_NOT_FMT(index->nprobe > 0 && + index->nprobe <= getMaxKSelection(), + "GPU index only supports nprobe <= %zu; passed %zu", + (size_t) getMaxKSelection(), + index->nprobe); + nprobe = index->nprobe; + + // The metric type may have changed as well, so we might have to + // change our quantizer + delete quantizer; + quantizer = nullptr; + + // Construct an empty quantizer + GpuIndexFlatConfig config = ivfConfig_.flatConfig; + // FIXME: inherit our same device + config.device = device_; + config.storeInCpu = true; + + if(qt == nullptr) { + if (index->metric_type == faiss::METRIC_L2) { + // FIXME: 2 different float16 options? + quantizer = new GpuIndexFlatL2(resources_, this->d, config); + } else if (index->metric_type == faiss::METRIC_INNER_PRODUCT) { + // FIXME: 2 different float16 options? + quantizer = new GpuIndexFlatIP(resources_, this->d, config); + } else { + // unknown metric type + FAISS_ASSERT(false); + } + } + + if (!index->is_trained) { + this->is_trained = false; + this->ntotal = 0; + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // restore quantizer from backup ptr + index->restore_quantizer(); + + // ntotal can exceed max int, but the number of vectors per inverted + // list cannot exceed this. We check this in the subclasses. 
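// Illustrative sketch of the ownership rule behind the shared-quantizer
// overload above: when the caller passes in (or receives back) a coarse
// quantizer through `qt`, the index keeps the pointer but must not delete it,
// which is what the remove_quantizer flag in the destructor controls. The
// class below is a standalone stand-in, not the FAISS type.
struct CoarseQuantizer {};  // placeholder for GpuIndexFlat

class IvfLike {
 public:
  // Build and own a quantizer internally (remove_quantizer == 1 case).
  void buildOwnQuantizer() {
    quantizer_ = new CoarseQuantizer();
    ownsQuantizer_ = true;
  }

  // Adopt a quantizer shared with other indices; the holder of `shared`
  // keeps ownership (remove_quantizer == 0 case).
  void adoptSharedQuantizer(CoarseQuantizer* shared) {
    quantizer_ = shared;
    ownsQuantizer_ = false;
  }

  ~IvfLike() {
    if (ownsQuantizer_) {
      delete quantizer_;
    }
  }

 private:
  CoarseQuantizer* quantizer_ = nullptr;
  bool ownsQuantizer_ = false;
};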
+ this->ntotal = index->ntotal; + + // Since we're trained, the quantizer must have data + FAISS_ASSERT(index->quantizer->ntotal > 0); + + if(qt == nullptr) { + // Right now, we can only handle IndexFlat or derived classes + auto qFlat = dynamic_cast(index->quantizer); + FAISS_THROW_IF_NOT_MSG(qFlat, + "Only IndexFlat is supported for the coarse quantizer " + "for copying from an IndexIVF into a GpuIndexIVF"); + quantizer->copyFrom(qFlat); + qt = quantizer; + } else { + quantizer = qt; + } + remove_quantizer = 0; +} + +void +GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { + DeviceScope scope(device_); + + // + // Index information + // + index->ntotal = this->ntotal; + index->d = this->d; + index->metric_type = this->metric_type; + index->is_trained = this->is_trained; + + // + // IndexIVF information + // + index->nlist = nlist; + index->nprobe = nprobe; + + // Construct and copy the appropriate quantizer + faiss::IndexFlat* q = nullptr; + + if (this->metric_type == faiss::METRIC_L2) { + q = new faiss::IndexFlatL2(this->d); + + } else if (this->metric_type == faiss::METRIC_INNER_PRODUCT) { + q = new faiss::IndexFlatIP(this->d); + + } else { + // we should have one of the above metrics + FAISS_ASSERT(false); + } + + FAISS_ASSERT(quantizer); + quantizer->copyTo(q); + + if (index->own_fields) { + delete index->quantizer; + } + + index->quantizer = q; + index->quantizer_trains_alone = 0; + index->own_fields = true; + index->cp = this->cp; + index->maintain_direct_map = false; + index->direct_map.clear(); +} + +int +GpuIndexIVF::getNumLists() const { + return nlist; +} + +void +GpuIndexIVF::setNumProbes(int nprobe) { + FAISS_THROW_IF_NOT_FMT(nprobe > 0 && nprobe <= getMaxKSelection(), + "GPU index only supports nprobe <= %d; passed %d", + getMaxKSelection(), + nprobe); + this->nprobe = nprobe; +} + +int +GpuIndexIVF::getNumProbes() const { + return nprobe; +} + +bool +GpuIndexIVF::addImplRequiresIDs_() const { + // All IVF indices have storage for IDs + return true; +} + +void +GpuIndexIVF::trainQuantizer_(faiss::Index::idx_t n, const float* x) { + if (n == 0) { + // nothing to do + return; + } + + if (quantizer->is_trained && (quantizer->ntotal == nlist)) { + if (this->verbose) { + printf ("IVF quantizer does not need training.\n"); + } + + return; + } + + if (this->verbose) { + printf ("Training IVF quantizer on %ld vectors in %dD\n", n, d); + } + + DeviceScope scope(device_); + + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + quantizer->is_trained = true; + + FAISS_ASSERT(quantizer->ntotal == nlist); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h new file mode 100644 index 0000000000..ca9a386641 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h @@ -0,0 +1,93 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace faiss { struct IndexIVF; }
+
+namespace faiss { namespace gpu {
+
+class GpuIndexFlat;
+class GpuResources;
+
+struct GpuIndexIVFConfig : public GpuIndexConfig {
+ inline GpuIndexIVFConfig()
+ : indicesOptions(INDICES_64_BIT) {
+ }
+
+ /// Index storage options for the GPU
+ IndicesOptions indicesOptions;
+
+ /// Configuration for the coarse quantizer object
+ GpuIndexFlatConfig flatConfig;
+};
+
+class GpuIndexIVF : public GpuIndex {
+ public:
+ GpuIndexIVF(GpuResources* resources,
+ int dims,
+ faiss::MetricType metric,
+ int nlist,
+ GpuIndexIVFConfig config = GpuIndexIVFConfig());
+
+ ~GpuIndexIVF() override;
+
+ private:
+ /// Shared initialization functions
+ void init_();
+
+ public:
+ /// Copy what we need from the CPU equivalent
+ void copyFrom(const faiss::IndexIVF* index);
+
+ void copyFrom(faiss::IndexIVF* index, gpu::GpuIndexFlat *&qt, int64_t mode);
+
+ /// Copy what we have to the CPU equivalent
+ void copyTo(faiss::IndexIVF* index) const;
+
+ /// Returns the number of inverted lists we're managing
+ int getNumLists() const;
+
+ /// Return the quantizer we're using
+ GpuIndexFlat* getQuantizer();
+
+ /// Sets the number of list probes per query
+ void setNumProbes(int nprobe);
+
+ /// Returns our current number of list probes per query
+ int getNumProbes() const;
+
+ protected:
+ bool addImplRequiresIDs_() const override;
+ void trainQuantizer_(faiss::Index::idx_t n, const float* x);
+
+ public:
+ /// Exposing this like the CPU version for manipulation
+ ClusteringParameters cp;
+
+ /// Exposing this like the CPU version for query
+ int nlist;
+
+ /// Exposing this like the CPU version for manipulation
+ int nprobe;
+
+ /// Exposing this like the CPU version for query
+ GpuIndexFlat* quantizer;
+
+ int remove_quantizer = 1;
+
+ protected:
+ GpuIndexIVFConfig ivfConfig_;
+};
+
+} } // namespace
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu
new file mode 100644
index 0000000000..d946f002b8
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu
@@ -0,0 +1,245 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { namespace gpu { + +GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config) : + GpuIndexIVF(resources, + index->d, + index->metric_type, + index->nlist, + config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0), + index_(nullptr) { + copyFrom(index); +} + +GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) : + GpuIndexIVF(resources, dims, metric, nlist, config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0), + index_(nullptr) { + + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet +} + +GpuIndexIVFFlat::~GpuIndexIVFFlat() { + delete index_; +} + +void +GpuIndexIVFFlat::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + index_->reserveMemory(numVecs); + } +} + +void +GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + DeviceScope scope(device_); + + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + delete index_; + index_ = nullptr; + + // The other index might not be trained + if (!index->is_trained) { + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // Copy our lists as well + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + index->metric_type, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.indicesOptions, + memorySpace_); + InvertedLists *ivf = index->invlists; + + if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + /* double t0 = getmillisecs(); */ + /* std::cout << "Readonly Takes " << getmillisecs() - t0 << " ms" << std::endl; */ + } else { + for (size_t i = 0; i < ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); + + index_->addCodeVectorsFromCpu(i, + (const unsigned char*)(ivf->get_codes(i)), + ivf->get_ids(i), + numVecs); + } + } +} + +void +GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG(ivfFlatConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + index->code_size = this->d * sizeof(float); + + InvertedLists *ivf = new ArrayInvertedLists(nlist, index->code_size); + index->replace_invlists(ivf, true); + + // Copy the inverted lists + if (index_) { + for (int i = 0; i < nlist; ++i) { + auto listIndices = index_->getListIndices(i); + auto listData = index_->getListVectors(i); + + ivf->add_entries(i, + listIndices.size(), + listIndices.data(), + (const uint8_t*) listData.data()); + } + } +} + +size_t +GpuIndexIVFFlat::reclaimMemory() { + if (index_) { + DeviceScope scope(device_); + + return index_->reclaimMemory(); + } + + return 0; +} + +void +GpuIndexIVFFlat::reset() { + if (index_) { + DeviceScope scope(device_); + + 
index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void +GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { + DeviceScope scope(device_); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + trainQuantizer_(n, x); + + // The quantizer is now trained; construct the IVF index + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + this->metric_type, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.indicesOptions, + memorySpace_); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; +} + +void +GpuIndexIVFFlat::addImpl_(int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int) this->d}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->classifyAndAddVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we attempted + // to add + ntotal += n; +} + +void +GpuIndexIVFFlat::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor outLabels(const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h new file mode 100644 index 0000000000..f5d6fba457 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h @@ -0,0 +1,85 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { struct IndexIVFFlat; } + +namespace faiss { namespace gpu { + +class IVFFlat; +class GpuIndexFlat; + +struct GpuIndexIVFFlatConfig : public GpuIndexIVFConfig { +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexIVFFlat +class GpuIndexIVFFlat : public GpuIndexIVF { + public: + /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying + /// data over to the given GPU, if the input index is trained. + GpuIndexIVFFlat(GpuResources* resources, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + /// Constructs a new instance with an empty flat quantizer; the user + /// provides the number of lists desired. 
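// Illustrative end-to-end usage of the wrapper declared here, assuming the
// standard FAISS GPU resource object and header locations (StandardGpuResources
// is an assumption, not shown in this patch); data loading and error handling
// are omitted, and the sizes are arbitrary.
#include <vector>
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
  const int d = 128, nlist = 1024, k = 10, nq = 8;
  std::vector<float> trainVecs(100000 * d);   // fill with training data
  std::vector<float> baseVecs(500000 * d);    // fill with database vectors
  std::vector<float> queries(nq * d);         // fill with query vectors

  faiss::gpu::StandardGpuResources res;
  faiss::gpu::GpuIndexIVFFlatConfig config;
  config.device = 0;

  faiss::gpu::GpuIndexIVFFlat index(&res, d, nlist, faiss::METRIC_L2, config);
  index.train(100000, trainVecs.data());   // k-means for the coarse quantizer
  index.add(500000, baseVecs.data());      // paged CPU -> GPU copies if large
  index.setNumProbes(32);

  std::vector<float> distances(nq * k);
  std::vector<faiss::Index::idx_t> labels(nq * k);
  index.search(nq, queries.data(), k, distances.data(), labels.data());
  return 0;
}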
+ GpuIndexIVFFlat(GpuResources* resources, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + ~GpuIndexIVFFlat() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFFlat* index) const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + void reset() override; + + void train(Index::idx_t n, const float* x) override; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, + const float* x, + const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + private: + GpuIndexIVFFlatConfig ivfFlatConfig_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// Instance that we own; contains the inverted list + IVFFlat* index_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu new file mode 100644 index 0000000000..d75a9bf212 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu @@ -0,0 +1,453 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { namespace gpu { + +GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config) : + GpuIndexIVF(resources, + index->d, + index->metric_type, + index->nlist, + config), + ivfpqConfig_(config), + subQuantizers_(0), + bitsPerCode_(0), + reserveMemoryVecs_(0), + index_(nullptr) { + copyFrom(index); +} + +GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config) : + GpuIndexIVF(resources, + dims, + metric, + nlist, + config), + ivfpqConfig_(config), + subQuantizers_(subQuantizers), + bitsPerCode_(bitsPerCode), + reserveMemoryVecs_(0), + index_(nullptr) { + verifySettings_(); + + // FIXME make IP work fully + FAISS_ASSERT(this->metric_type == faiss::METRIC_L2); + + // We haven't trained ourselves, so don't construct the PQ index yet + this->is_trained = false; +} + +GpuIndexIVFPQ::~GpuIndexIVFPQ() { + delete index_; +} + +void +GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { + DeviceScope scope(device_); + + // FIXME: support this + FAISS_THROW_IF_NOT_MSG(index->metric_type == faiss::METRIC_L2, + "GPU: inner product unsupported"); + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + delete index_; + index_ = nullptr; + + subQuantizers_ = index->pq.M; + bitsPerCode_ = index->pq.nbits; + + // We only support this + FAISS_THROW_IF_NOT_MSG(index->pq.nbits == 8, + "GPU: only pq.nbits == 8 is supported"); + FAISS_THROW_IF_NOT_MSG(index->by_residual, + "GPU: only by_residual = true is supported"); + FAISS_THROW_IF_NOT_MSG(index->polysemous_ht == 0, + "GPU: polysemous codes not supported"); + + verifySettings_(); + + // The other index might not be trained + if (!index->is_trained) { + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // Copy our lists as well + // The product quantizer must have data in it + FAISS_ASSERT(index->pq.centroids.size() > 0); + index_ = new IVFPQ(resources_, + quantizer->getGpuData(), + subQuantizers_, + bitsPerCode_, + (float*) index->pq.centroids.data(), + ivfpqConfig_.indicesOptions, + ivfpqConfig_.useFloat16LookupTables, + memorySpace_); + // Doesn't make sense to reserve memory here + index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables); + + // Copy database vectors, if any + const InvertedLists *ivf = index->invlists; + size_t nlist = ivf ? 
ivf->nlist : 0; + for (size_t i = 0; i < nlist; ++i) { + size_t list_size = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(list_size <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + list_size); + + index_->addCodeVectorsFromCpu( + i, ivf->get_codes(i), ivf->get_ids(i), list_size); + } +} + +void +GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG(ivfpqConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + + // + // IndexIVFPQ information + // + index->by_residual = true; + index->use_precomputed_table = 0; + index->code_size = subQuantizers_; + index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); + + index->do_polysemous_training = false; + index->polysemous_training = nullptr; + + index->scan_table_threshold = 0; + index->max_codes = 0; + index->polysemous_ht = 0; + index->precomputed_table.clear(); + + InvertedLists *ivf = new ArrayInvertedLists( + nlist, index->code_size); + + index->replace_invlists(ivf, true); + + if (index_) { + // Copy the inverted lists + for (int i = 0; i < nlist; ++i) { + auto ids = getListIndices(i); + auto codes = getListCodes(i); + index->invlists->add_entries (i, ids.size(), ids.data(), codes.data()); + } + + // Copy PQ centroids + auto devPQCentroids = index_->getPQCentroids(); + index->pq.centroids.resize(devPQCentroids.numElements()); + + fromDevice(devPQCentroids, + index->pq.centroids.data(), + resources_->getDefaultStream(device_)); + + if (ivfpqConfig_.usePrecomputedTables) { + index->precompute_table(); + } + } +} + +void +GpuIndexIVFPQ::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(device_); + index_->reserveMemory(numVecs); + } +} + +void +GpuIndexIVFPQ::setPrecomputedCodes(bool enable) { + ivfpqConfig_.usePrecomputedTables = enable; + if (index_) { + DeviceScope scope(device_); + index_->setPrecomputedCodes(enable); + } + + verifySettings_(); +} + +bool +GpuIndexIVFPQ::getPrecomputedCodes() const { + return ivfpqConfig_.usePrecomputedTables; +} + +int +GpuIndexIVFPQ::getNumSubQuantizers() const { + return subQuantizers_; +} + +int +GpuIndexIVFPQ::getBitsPerCode() const { + return bitsPerCode_; +} + +int +GpuIndexIVFPQ::getCentroidsPerSubQuantizer() const { + return utils::pow2(bitsPerCode_); +} + +size_t +GpuIndexIVFPQ::reclaimMemory() { + if (index_) { + DeviceScope scope(device_); + return index_->reclaimMemory(); + } + + return 0; +} + +void +GpuIndexIVFPQ::reset() { + if (index_) { + DeviceScope scope(device_); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void +GpuIndexIVFPQ::trainResidualQuantizer_(Index::idx_t n, const float* x) { + // Code largely copied from faiss::IndexIVFPQ + // FIXME: GPUize more of this + n = std::min(n, (Index::idx_t) (1 << bitsPerCode_) * 64); + + if (this->verbose) { + printf("computing residuals\n"); + } + + std::vector assign(n); + quantizer->assign (n, x, assign.data()); + + std::vector residuals(n * d); + + // FIXME jhj convert to _n version + for (idx_t i = 0; i < n; i++) { + quantizer->compute_residual(x + i * d, &residuals[i * d], assign[i]); + } + + if (this->verbose) { + printf("training %d x %d product 
quantizer on %ld vectors in %dD\n", + subQuantizers_, getCentroidsPerSubQuantizer(), n, this->d); + } + + // Just use the CPU product quantizer to determine sub-centroids + faiss::ProductQuantizer pq(this->d, subQuantizers_, bitsPerCode_); + pq.verbose = this->verbose; + pq.train(n, residuals.data()); + + index_ = new IVFPQ(resources_, + quantizer->getGpuData(), + subQuantizers_, + bitsPerCode_, + pq.centroids.data(), + ivfpqConfig_.indicesOptions, + ivfpqConfig_.useFloat16LookupTables, + memorySpace_); + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables); +} + +void +GpuIndexIVFPQ::train(Index::idx_t n, const float* x) { + DeviceScope scope(device_); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not on the + // CPU, as we depend upon parts of the CPU code + auto hostData = toHost((float*) x, + resources_->getDefaultStream(device_), + {(int) n, (int) this->d}); + + trainQuantizer_(n, hostData.data()); + trainResidualQuantizer_(n, hostData.data()); + + FAISS_ASSERT(index_); + + this->is_trained = true; +} + +void +GpuIndexIVFPQ::addImpl_(int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int) this->d}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->classifyAndAddVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we attempted + // to add + ntotal += n; +} + +void +GpuIndexIVFPQ::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor outLabels(const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + +int +GpuIndexIVFPQ::getListLength(int listId) const { + FAISS_ASSERT(index_); + return index_->getListLength(listId); +} + +std::vector +GpuIndexIVFPQ::getListCodes(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(device_); + + return index_->getListCodes(listId); +} + +std::vector +GpuIndexIVFPQ::getListIndices(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(device_); + + return index_->getListIndices(listId); +} + +void +GpuIndexIVFPQ::verifySettings_() const { + // Our implementation has these restrictions: + + // Must have some number of lists + FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); + + // up to a single byte per code + FAISS_THROW_IF_NOT_FMT(bitsPerCode_ <= 8, + "Bits per code must be <= 8 (passed %d)", bitsPerCode_); + + // Sub-quantizers must evenly divide dimensions available + FAISS_THROW_IF_NOT_FMT(this->d % subQuantizers_ == 0, + "Number of sub-quantizers (%d) must be an " + "even divisor of the number of dimensions (%d)", + subQuantizers_, this->d); + + // The number of 
bytes per encoded vector must be one we support + FAISS_THROW_IF_NOT_FMT(IVFPQ::isSupportedPQCodeLength(subQuantizers_), + "Number of bytes per encoded vector / sub-quantizers (%d) " + "is not supported", + subQuantizers_); + + // We must have enough shared memory on the current device to store + // our lookup distances + int lookupTableSize = sizeof(float); + if (ivfpqConfig_.useFloat16LookupTables) { + lookupTableSize = sizeof(half); + } + + // 64 bytes per code is only supported with usage of float16, at 2^8 + // codes per subquantizer + size_t requiredSmemSize = + lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); + size_t smemPerBlock = getMaxSharedMemPerBlock(device_); + + FAISS_THROW_IF_NOT_FMT(requiredSmemSize + <= getMaxSharedMemPerBlock(device_), + "Device %d has %zu bytes of shared memory, while " + "%d bits per code and %d sub-quantizers requires %zu " + "bytes. Consider useFloat16LookupTables and/or " + "reduce parameters", + device_, smemPerBlock, bitsPerCode_, subQuantizers_, + requiredSmemSize); + + // If precomputed codes are disabled, we have an extra limitation in + // terms of the number of dimensions per subquantizer + FAISS_THROW_IF_NOT_FMT(ivfpqConfig_.usePrecomputedTables || + IVFPQ::isSupportedNoPrecomputedSubDimSize( + this->d / subQuantizers_), + "Number of dimensions per sub-quantizer (%d) " + "is not currently supported without precomputed codes. " + "Only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims " + "per sub-quantizer are currently supported with no " + "precomputed codes. " + "Precomputed codes supports any number of dimensions, but " + "will involve memory overheads.", + this->d / subQuantizers_); + + // TODO: fully implement METRIC_INNER_PRODUCT + FAISS_THROW_IF_NOT_MSG(this->metric_type == faiss::METRIC_L2, + "METRIC_INNER_PRODUCT is currently unsupported"); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h new file mode 100644 index 0000000000..0bde2596ae --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h @@ -0,0 +1,143 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { struct IndexIVFPQ; } + +namespace faiss { namespace gpu { + +class GpuIndexFlat; +class IVFPQ; + +struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig { + inline GpuIndexIVFPQConfig() + : useFloat16LookupTables(false), + usePrecomputedTables(false) { + } + + /// Whether or not float16 residual distance tables are used in the + /// list scanning kernels. When subQuantizers * 2^bitsPerCode > + /// 16384, this is required. + bool useFloat16LookupTables; + + /// Whether or not we enable the precomputed table option for + /// search, which can substantially increase the memory requirement. + bool usePrecomputedTables; +}; + +/// IVFPQ index for the GPU +class GpuIndexIVFPQ : public GpuIndexIVF { + public: + /// Construct from a pre-existing faiss::IndexIVFPQ instance, copying + /// data over to the given GPU, if the input index is trained. 
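  ///
  /// A rough caller-side sketch (illustrative only; `cpuIndex`, the query
  /// buffers and the chosen config values are assumptions, not part of this
  /// header):
  ///
  ///   faiss::gpu::StandardGpuResources res;          // streams + scratch memory
  ///   faiss::gpu::GpuIndexIVFPQConfig cfg;
  ///   cfg.device = 0;                                // target GPU
  ///   cfg.usePrecomputedTables = true;               // trade memory for speed
  ///   faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, cfg);
  ///   gpuIndex.search(nq, queries, k, distances, labels);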
+ GpuIndexIVFPQ(GpuResources* resources, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + /// Construct an empty index + GpuIndexIVFPQ(GpuResources* resources, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + ~GpuIndexIVFPQ() override; + + /// Reserve space on the GPU for the inverted lists for `num` + /// vectors, assumed equally distributed among + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFPQ* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFPQ* index) const; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Enable or disable pre-computed codes + void setPrecomputedCodes(bool enable); + + /// Are pre-computed codes enabled? + bool getPrecomputedCodes() const; + + /// Return the number of sub-quantizers we are using + int getNumSubQuantizers() const; + + /// Return the number of bits per PQ code + int getBitsPerCode() const; + + /// Return the number of centroids per PQ code (2^bits per code) + int getCentroidsPerSubQuantizer() const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Clears out all inverted lists, but retains the coarse and + /// product centroid information + void reset() override; + + void train(Index::idx_t n, const float* x) override; + + /// For debugging purposes, return the list length of a particular + /// list + int getListLength(int listId) const; + + /// For debugging purposes, return the list codes of a particular + /// list + std::vector getListCodes(int listId) const; + + /// For debugging purposes, return the list indices of a particular + /// list + std::vector getListIndices(int listId) const; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, + const float* x, + const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + private: + void verifySettings_() const; + + void trainResidualQuantizer_(Index::idx_t n, const float* x); + + private: + GpuIndexIVFPQConfig ivfpqConfig_; + + /// Number of sub-quantizers per encoded vector + int subQuantizers_; + + /// Bits per sub-quantizer code + int bitsPerCode_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// The product quantizer instance that we own; contains the + /// inverted lists + IVFPQ* index_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu new file mode 100644 index 0000000000..ddda9193d1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu @@ -0,0 +1,341 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +GpuIndexIVFSQHybrid::GpuIndexIVFSQHybrid( + GpuResources* resources, + faiss::IndexIVFSQHybrid* index, + GpuIndexIVFSQHybridConfig config) : + GpuIndexIVF(resources, + index->d, + index->metric_type, + index->nlist, + config), + ivfSQConfig_(config), + sq(index->sq), + by_residual(index->by_residual), + reserveMemoryVecs_(0), + index_(nullptr) { + gpu::GpuIndexFlat *quantizer = nullptr; + copyFrom(index, quantizer, 0); + + FAISS_THROW_IF_NOT_MSG(isSQSupported(sq.qtype), + "Unsupported QuantizerType on GPU"); +} + +GpuIndexIVFSQHybrid::GpuIndexIVFSQHybrid( + GpuResources* resources, + int dims, + int nlist, + faiss::ScalarQuantizer::QuantizerType qtype, + faiss::MetricType metric, + bool encodeResidual, + GpuIndexIVFSQHybridConfig config) : + GpuIndexIVF(resources, dims, metric, nlist, config), + ivfSQConfig_(config), + sq(dims, qtype), + by_residual(encodeResidual), + reserveMemoryVecs_(0), + index_(nullptr) { + + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet + FAISS_THROW_IF_NOT_MSG(isSQSupported(sq.qtype), + "Unsupported QuantizerType on GPU"); +} + +GpuIndexIVFSQHybrid::~GpuIndexIVFSQHybrid() { + delete index_; +} + +void +GpuIndexIVFSQHybrid::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + index_->reserveMemory(numVecs); + } +} + +void +GpuIndexIVFSQHybrid::copyFrom( + const faiss::IndexIVFSQHybrid* index) { + DeviceScope scope(device_); + + // Clear out our old data + delete index_; + index_ = nullptr; + + // Copy what we need from the CPU index + GpuIndexIVF::copyFrom(index); + sq = index->sq; + by_residual = index->by_residual; + + // The other index might not be trained, in which case we don't need to copy + // over the lists + if (!index->is_trained) { + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // Copy our lists as well + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + index->metric_type, + by_residual, + &sq, + ivfSQConfig_.indicesOptions, + memorySpace_); + + InvertedLists* ivf = index->invlists; + if(ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + } else { + for (size_t i = 0; i < ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); + + index_->addCodeVectorsFromCpu( + i, + (const unsigned char*) ivf->get_codes(i), + ivf->get_ids(i), + numVecs); + } + } +} + +void +GpuIndexIVFSQHybrid::copyFrom( + faiss::IndexIVFSQHybrid* index, + gpu::GpuIndexFlat *&qt, + long mode) { + DeviceScope scope(device_); + + // Clear out our old data + delete index_; + index_ = nullptr; + + GpuIndexIVF::copyFrom(index, qt, mode); + if(mode == 1) { + // Only copy quantizer + return ; + } + + sq = index->sq; + by_residual = index->by_residual; + + // The other index might not be trained, in which case we don't need to copy + // over the lists + if (!index->is_trained) { + return; + } + + // Otherwise, we can populate ourselves from the other index + 
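  // (The ReadOnlyArrayInvertedLists branch below bulk-copies codes and ids
  //  straight from the pinned host buffers; the generic branch uploads each
  //  inverted list separately via addCodeVectorsFromCpu.)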
this->is_trained = true; + + // Copy our lists as well + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + index->metric_type, + by_residual, + &sq, + ivfSQConfig_.indicesOptions, + memorySpace_); + + InvertedLists* ivf = index->invlists; + if(ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + } else { + for (size_t i = 0; i < ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); + + index_->addCodeVectorsFromCpu( + i, + (const unsigned char*) ivf->get_codes(i), + ivf->get_ids(i), + numVecs); + } + } +} + +void +GpuIndexIVFSQHybrid::copyTo( + faiss::IndexIVFSQHybrid* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG( + ivfSQConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + index->sq = sq; + index->by_residual = by_residual; + index->code_size = sq.code_size; + + InvertedLists* ivf = new ArrayInvertedLists(nlist, index->code_size); + index->replace_invlists(ivf, true); + + // Copy the inverted lists + if (index_) { + for (int i = 0; i < nlist; ++i) { + auto listIndices = index_->getListIndices(i); + auto listData = index_->getListVectors(i); + + ivf->add_entries(i, + listIndices.size(), + listIndices.data(), + (const uint8_t*) listData.data()); + } + } +} + +size_t +GpuIndexIVFSQHybrid::reclaimMemory() { + if (index_) { + DeviceScope scope(device_); + + return index_->reclaimMemory(); + } + + return 0; +} + +void +GpuIndexIVFSQHybrid::reset() { + if (index_) { + DeviceScope scope(device_); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void +GpuIndexIVFSQHybrid::trainResiduals_(Index::idx_t n, const float* x) { + // The input is already guaranteed to be on the CPU + sq.train_residual(n, x, quantizer, by_residual, verbose); +} + +void +GpuIndexIVFSQHybrid::train(Index::idx_t n, const float* x) { + DeviceScope scope(device_); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not on the + // CPU, as we depend upon parts of the CPU code + auto hostData = toHost((float*) x, + resources_->getDefaultStream(device_), + {(int) n, (int) this->d}); + + trainQuantizer_(n, hostData.data()); + trainResiduals_(n, hostData.data()); + + // The quantizer is now trained; construct the IVF index + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + this->metric_type, + by_residual, + &sq, + ivfSQConfig_.indicesOptions, + memorySpace_); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; +} + +void +GpuIndexIVFSQHybrid::addImpl_(int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int) this->d}); + + 
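  // The ids buffer is reinterpreted as `long` just below, so this path assumes
  // an LP64 ABI where `long` is 64-bit; the static_assert turns a silent
  // mis-read of the id array on other ABIs into a compile-time error.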
static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->classifyAndAddVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we attempted + // to add + ntotal += n; +} + +void +GpuIndexIVFSQHybrid::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor outLabels(const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h new file mode 100644 index 0000000000..908635033f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h @@ -0,0 +1,104 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class IVFFlat; +class GpuIndexFlat; + +struct GpuIndexIVFSQHybridConfig : public GpuIndexIVFConfig { +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexIVFSQHybrid +class GpuIndexIVFSQHybrid : public GpuIndexIVF { + public: + /// Construct from a pre-existing faiss::IndexIVFSQHybrid instance, + /// copying data over to the given GPU, if the input index is trained. + GpuIndexIVFSQHybrid( + GpuResources* resources, + faiss::IndexIVFSQHybrid* index, + GpuIndexIVFSQHybridConfig config = + GpuIndexIVFSQHybridConfig()); + + /// Constructs a new instance with an empty flat quantizer; the user + /// provides the number of lists desired. + GpuIndexIVFSQHybrid( + GpuResources* resources, + int dims, + int nlist, + faiss::ScalarQuantizer::QuantizerType qtype, + faiss::MetricType metric = MetricType::METRIC_L2, + bool encodeResidual = true, + GpuIndexIVFSQHybridConfig config = + GpuIndexIVFSQHybridConfig()); + + ~GpuIndexIVFSQHybrid() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFSQHybrid* index); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexIVFSQHybrid* index, gpu::GpuIndexFlat *&quantizer, int64_t mode); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFSQHybrid* index) const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes + size_t reclaimMemory(); + + void reset() override; + + void train(Index::idx_t n, const float* x) override; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, + const float* x, + const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + /// Called from train to handle SQ residual training + void trainResiduals_(Index::idx_t n, const float* x); + + public: + /// Exposed like the CPU version + faiss::ScalarQuantizer sq; + + /// Exposed like the CPU version + bool by_residual; + + private: + GpuIndexIVFSQHybridConfig ivfSQConfig_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// Instance that we own; contains the inverted list + IVFFlat* index_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu new file mode 100644 index 0000000000..866bfdbfdf --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -0,0 +1,276 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer( + GpuResources* resources, + const faiss::IndexIVFScalarQuantizer* index, + GpuIndexIVFScalarQuantizerConfig config) : + GpuIndexIVF(resources, + index->d, + index->metric_type, + index->nlist, + config), + ivfSQConfig_(config), + sq(index->sq), + by_residual(index->by_residual), + reserveMemoryVecs_(0), + index_(nullptr) { + copyFrom(index); + + FAISS_THROW_IF_NOT_MSG(isSQSupported(sq.qtype), + "Unsupported QuantizerType on GPU"); +} + +GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer( + GpuResources* resources, + int dims, + int nlist, + faiss::ScalarQuantizer::QuantizerType qtype, + faiss::MetricType metric, + bool encodeResidual, + GpuIndexIVFScalarQuantizerConfig config) : + GpuIndexIVF(resources, dims, metric, nlist, config), + ivfSQConfig_(config), + sq(dims, qtype), + by_residual(encodeResidual), + reserveMemoryVecs_(0), + index_(nullptr) { + + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet + FAISS_THROW_IF_NOT_MSG(isSQSupported(sq.qtype), + "Unsupported QuantizerType on GPU"); +} + +GpuIndexIVFScalarQuantizer::~GpuIndexIVFScalarQuantizer() { + delete index_; +} + +void +GpuIndexIVFScalarQuantizer::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + index_->reserveMemory(numVecs); + } +} + +void +GpuIndexIVFScalarQuantizer::copyFrom( + const faiss::IndexIVFScalarQuantizer* index) { + DeviceScope scope(device_); + + // Clear out our old data + delete index_; + index_ = nullptr; + + // Copy what we need from the CPU index + GpuIndexIVF::copyFrom(index); + sq = index->sq; + by_residual = index->by_residual; + + // The other index might not be trained, in which case we don't need to copy + // over the lists + if (!index->is_trained) { + return; + } + + // Otherwise, we can populate ourselves from the other index + this->is_trained = true; + + // Copy our lists as well + index_ = new IVFFlat(resources_, + 
quantizer->getGpuData(), + index->metric_type, + by_residual, + &sq, + ivfSQConfig_.indicesOptions, + memorySpace_); + + InvertedLists* ivf = index->invlists; + if(ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + } else { + for (size_t i = 0; i < ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); + + index_->addCodeVectorsFromCpu( + i, + (const unsigned char*) ivf->get_codes(i), + ivf->get_ids(i), + numVecs); + } + } +} + +void +GpuIndexIVFScalarQuantizer::copyTo( + faiss::IndexIVFScalarQuantizer* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG( + ivfSQConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + index->sq = sq; + index->by_residual = by_residual; + index->code_size = sq.code_size; + + InvertedLists* ivf = new ArrayInvertedLists(nlist, index->code_size); + index->replace_invlists(ivf, true); + + // Copy the inverted lists + if (index_) { + for (int i = 0; i < nlist; ++i) { + auto listIndices = index_->getListIndices(i); + auto listData = index_->getListVectors(i); + + ivf->add_entries(i, + listIndices.size(), + listIndices.data(), + (const uint8_t*) listData.data()); + } + } +} + +size_t +GpuIndexIVFScalarQuantizer::reclaimMemory() { + if (index_) { + DeviceScope scope(device_); + + return index_->reclaimMemory(); + } + + return 0; +} + +void +GpuIndexIVFScalarQuantizer::reset() { + if (index_) { + DeviceScope scope(device_); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void +GpuIndexIVFScalarQuantizer::trainResiduals_(Index::idx_t n, const float* x) { + // The input is already guaranteed to be on the CPU + sq.train_residual(n, x, quantizer, by_residual, verbose); +} + +void +GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { + DeviceScope scope(device_); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not on the + // CPU, as we depend upon parts of the CPU code + auto hostData = toHost((float*) x, + resources_->getDefaultStream(device_), + {(int) n, (int) this->d}); + + trainQuantizer_(n, hostData.data()); + trainResiduals_(n, hostData.data()); + + // The quantizer is now trained; construct the IVF index + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + this->metric_type, + by_residual, + &sq, + ivfSQConfig_.indicesOptions, + memorySpace_); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; +} + +void +GpuIndexIVFScalarQuantizer::addImpl_(int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int) this->d}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), 
"size mismatch"); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->classifyAndAddVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we attempted + // to add + ntotal += n; +} + +void +GpuIndexIVFScalarQuantizer::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor outLabels(const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h new file mode 100644 index 0000000000..ea4a9d7bc1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h @@ -0,0 +1,100 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class IVFFlat; +class GpuIndexFlat; + +struct GpuIndexIVFScalarQuantizerConfig : public GpuIndexIVFConfig { +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexIVFScalarQuantizer +class GpuIndexIVFScalarQuantizer : public GpuIndexIVF { + public: + /// Construct from a pre-existing faiss::IndexIVFScalarQuantizer instance, + /// copying data over to the given GPU, if the input index is trained. + GpuIndexIVFScalarQuantizer( + GpuResources* resources, + const faiss::IndexIVFScalarQuantizer* index, + GpuIndexIVFScalarQuantizerConfig config = + GpuIndexIVFScalarQuantizerConfig()); + + /// Constructs a new instance with an empty flat quantizer; the user + /// provides the number of lists desired. + GpuIndexIVFScalarQuantizer( + GpuResources* resources, + int dims, + int nlist, + faiss::ScalarQuantizer::QuantizerType qtype, + faiss::MetricType metric = MetricType::METRIC_L2, + bool encodeResidual = true, + GpuIndexIVFScalarQuantizerConfig config = + GpuIndexIVFScalarQuantizerConfig()); + + ~GpuIndexIVFScalarQuantizer() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFScalarQuantizer* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFScalarQuantizer* index) const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes + size_t reclaimMemory(); + + void reset() override; + + void train(Index::idx_t n, const float* x) override; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, + const float* x, + const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + /// Called from train to handle SQ residual training + void trainResiduals_(Index::idx_t n, const float* x); + + public: + /// Exposed like the CPU version + faiss::ScalarQuantizer sq; + + /// Exposed like the CPU version + bool by_residual; + + private: + GpuIndexIVFScalarQuantizerConfig ivfSQConfig_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// Instance that we own; contains the inverted list + IVFFlat* index_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndicesOptions.h b/core/src/index/thirdparty/faiss/gpu/GpuIndicesOptions.h new file mode 100644 index 0000000000..768f981f71 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndicesOptions.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +namespace faiss { namespace gpu { + +/// How user vector index data is stored on the GPU +enum IndicesOptions { + /// The user indices are only stored on the CPU; the GPU returns + /// (inverted list, offset) to the CPU which is then translated to + /// the real user index. + INDICES_CPU = 0, + /// The indices are not stored at all, on either the CPU or + /// GPU. Only (inverted list, offset) is returned to the user as the + /// index. + INDICES_IVF = 1, + /// Indices are stored as 32 bit integers on the GPU, but returned + /// as 64 bit integers + INDICES_32_BIT = 2, + /// Indices are stored as 64 bit integers on the GPU + INDICES_64_BIT = 3, +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuResources.cpp b/core/src/index/thirdparty/faiss/gpu/GpuResources.cpp new file mode 100644 index 0000000000..fe386c2cf8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuResources.cpp @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include + +namespace faiss { namespace gpu { + +GpuResources::~GpuResources() { +} + +cublasHandle_t +GpuResources::getBlasHandleCurrentDevice() { + return getBlasHandle(getCurrentDevice()); +} + +cudaStream_t +GpuResources::getDefaultStreamCurrentDevice() { + return getDefaultStream(getCurrentDevice()); +} + +std::vector +GpuResources::getAlternateStreamsCurrentDevice() { + return getAlternateStreams(getCurrentDevice()); +} + +DeviceMemory& +GpuResources::getMemoryManagerCurrentDevice() { + return getMemoryManager(getCurrentDevice()); +} + +cudaStream_t +GpuResources::getAsyncCopyStreamCurrentDevice() { + return getAsyncCopyStream(getCurrentDevice()); +} + +void +GpuResources::syncDefaultStream(int device) { + CUDA_VERIFY(cudaStreamSynchronize(getDefaultStream(device))); +} + +void +GpuResources::syncDefaultStreamCurrentDevice() { + syncDefaultStream(getCurrentDevice()); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuResources.h b/core/src/index/thirdparty/faiss/gpu/GpuResources.h new file mode 100644 index 0000000000..bdea4f630a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/GpuResources.h @@ -0,0 +1,73 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Base class of GPU-side resource provider; hides provision of +/// cuBLAS handles, CUDA streams and a temporary memory manager +class GpuResources { + public: + virtual ~GpuResources(); + + /// Call to pre-allocate resources for a particular device. If this is + /// not called, then resources will be allocated at the first time + /// of demand + virtual void initializeForDevice(int device) = 0; + + /// Returns the cuBLAS handle that we use for the given device + virtual cublasHandle_t getBlasHandle(int device) = 0; + + /// Returns the stream that we order all computation on for the + /// given device + virtual cudaStream_t getDefaultStream(int device) = 0; + + /// Returns the set of alternative streams that we use for the given device + virtual std::vector getAlternateStreams(int device) = 0; + + /// Returns the temporary memory manager for the given device + virtual DeviceMemory& getMemoryManager(int device) = 0; + + /// Returns the available CPU pinned memory buffer + virtual std::pair getPinnedMemory() = 0; + + /// Returns the stream on which we perform async CPU <-> GPU copies + virtual cudaStream_t getAsyncCopyStream(int device) = 0; + + /// Calls getBlasHandle with the current device + cublasHandle_t getBlasHandleCurrentDevice(); + + /// Calls getDefaultStream with the current device + cudaStream_t getDefaultStreamCurrentDevice(); + + /// Synchronizes the CPU with respect to the default stream for the + /// given device + // equivalent to cudaDeviceSynchronize(getDefaultStream(device)) + void syncDefaultStream(int device); + + /// Calls syncDefaultStream for the current device + void syncDefaultStreamCurrentDevice(); + + /// Calls getAlternateStreams for the current device + std::vector getAlternateStreamsCurrentDevice(); + + /// Calls getMemoryManager for the current device + DeviceMemory& getMemoryManagerCurrentDevice(); + + /// Calls getAsyncCopyStream for the current device + cudaStream_t getAsyncCopyStreamCurrentDevice(); +}; + +} } // namespace diff --git 
a/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp new file mode 100644 index 0000000000..63ed9ef316 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp @@ -0,0 +1,295 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +namespace { + +// How many streams per device we allocate by default (for multi-streaming) +constexpr int kNumStreams = 2; + +// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default +constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024; + +// Default temporary memory allocation for <= 4 GiB memory GPUs +constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024; + +// Default temporary memory allocation for <= 8 GiB memory GPUs +constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024; + +// Maximum temporary memory allocation for all GPUs +constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024; + +} + +StandardGpuResources::StandardGpuResources() : + pinnedMemAlloc_(nullptr), + pinnedMemAllocSize_(0), + // let the adjustment function determine the memory size for us by passing + // in a huge value that will then be adjusted + tempMemSize_(getDefaultTempMemForGPU(-1, + std::numeric_limits::max())), + pinnedMemSize_(kDefaultPinnedMemoryAllocation), + cudaMallocWarning_(true) { +} + +StandardGpuResources::~StandardGpuResources() { + for (auto& entry : defaultStreams_) { + DeviceScope scope(entry.first); + + auto it = userDefaultStreams_.find(entry.first); + if (it == userDefaultStreams_.end()) { + // The user did not specify this stream, thus we are the ones + // who have created it + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + } + + for (auto& entry : alternateStreams_) { + DeviceScope scope(entry.first); + + for (auto stream : entry.second) { + CUDA_VERIFY(cudaStreamDestroy(stream)); + } + } + + for (auto& entry : asyncCopyStreams_) { + DeviceScope scope(entry.first); + + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + + for (auto& entry : blasHandles_) { + DeviceScope scope(entry.first); + + auto blasStatus = cublasDestroy(entry.second); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + } + + if (pinnedMemAlloc_) { + freeMemorySpace(MemorySpace::HostPinned, pinnedMemAlloc_); + } +} + +size_t +StandardGpuResources::getDefaultTempMemForGPU(int device, + size_t requested) { + auto totalMem = device != -1 ? 
+ getDeviceProperties(device).totalGlobalMem : + std::numeric_limits::max(); + + if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) { + // If the GPU has <= 4 GiB of memory, reserve 512 MiB + + if (requested > k4GiBTempMem) { + return k4GiBTempMem; + } + } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) { + // If the GPU has <= 8 GiB of memory, reserve 1 GiB + + if (requested > k8GiBTempMem) { + return k8GiBTempMem; + } + } else { + // Never use more than 1.5 GiB + if (requested > kMaxTempMem) { + return kMaxTempMem; + } + } + + // use whatever lower limit the user requested + return requested; +} + +void +StandardGpuResources::noTempMemory() { + setTempMemory(0); + setCudaMallocWarning(false); +} + +void +StandardGpuResources::setTempMemory(size_t size) { + if (tempMemSize_ != size) { + // adjust based on general limits + tempMemSize_ = getDefaultTempMemForGPU(-1, size); + + // We need to re-initialize memory resources for all current devices that + // have been initialized. + // This should be safe to do, even if we are currently running work, because + // the cudaFree call that this implies will force-synchronize all GPUs with + // the CPU + for (auto& p : memory_) { + int device = p.first; + // Free the existing memory first + p.second.reset(); + + // Allocate new + p.second = std::unique_ptr( + new StackDeviceMemory(p.first, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + } + } +} + +void +StandardGpuResources::setPinnedMemory(size_t size) { + // Should not call this after devices have been initialized + FAISS_ASSERT(defaultStreams_.size() == 0); + FAISS_ASSERT(!pinnedMemAlloc_); + + pinnedMemSize_ = size; +} + +void +StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) { + auto it = defaultStreams_.find(device); + if (it != defaultStreams_.end()) { + // Replace this stream with the user stream + CUDA_VERIFY(cudaStreamDestroy(it->second)); + it->second = stream; + } + + userDefaultStreams_[device] = stream; +} + +void +StandardGpuResources::setDefaultNullStreamAllDevices() { + for (int dev = 0; dev < getNumDevices(); ++dev) { + setDefaultStream(dev, nullptr); + } +} + +void +StandardGpuResources::setCudaMallocWarning(bool b) { + cudaMallocWarning_ = b; + + for (auto& v : memory_) { + v.second->setCudaMallocWarning(b); + } +} + +bool +StandardGpuResources::isInitialized(int device) const { + // Use default streams as a marker for whether or not a certain + // device has been initialized + return defaultStreams_.count(device) != 0; +} + +void +StandardGpuResources::initializeForDevice(int device) { + if (isInitialized(device)) { + return; + } + + // If this is the first device that we're initializing, create our + // pinned memory allocation + if (defaultStreams_.empty() && pinnedMemSize_ > 0) { + allocMemorySpace(MemorySpace::HostPinned, &pinnedMemAlloc_, pinnedMemSize_); + pinnedMemAllocSize_ = pinnedMemSize_; + } + + FAISS_ASSERT(device < getNumDevices()); + DeviceScope scope(device); + + // Make sure that device properties for all devices are cached + auto& prop = getDeviceProperties(device); + + // Also check to make sure we meet our minimum compute capability (3.0) + FAISS_ASSERT_FMT(prop.major >= 3, + "Device id %d with CC %d.%d not supported, " + "need 3.0+ compute capability", + device, prop.major, prop.minor); + + // Create streams + cudaStream_t defaultStream = 0; + auto it = userDefaultStreams_.find(device); + if (it != userDefaultStreams_.end()) { + // We already have a stream provided by the user + 
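    // (A caller registers such a stream up front, e.g.
    //    cudaStream_t s;
    //    cudaStreamCreate(&s);
    //    res.setDefaultStream(device, s);
    //  -- `res` and `s` are caller-side names, shown only as a sketch.)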
defaultStream = it->second; + } else { + CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, + cudaStreamNonBlocking)); + } + + defaultStreams_[device] = defaultStream; + + cudaStream_t asyncCopyStream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, + cudaStreamNonBlocking)); + + asyncCopyStreams_[device] = asyncCopyStream; + + std::vector deviceStreams; + for (int j = 0; j < kNumStreams; ++j) { + cudaStream_t stream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, + cudaStreamNonBlocking)); + + deviceStreams.push_back(stream); + } + + alternateStreams_[device] = std::move(deviceStreams); + + // Create cuBLAS handle + cublasHandle_t blasHandle = 0; + auto blasStatus = cublasCreate(&blasHandle); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + blasHandles_[device] = blasHandle; + + FAISS_ASSERT(memory_.count(device) == 0); + + auto mem = std::unique_ptr( + new StackDeviceMemory(device, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + mem->setCudaMallocWarning(cudaMallocWarning_); + + memory_.emplace(device, std::move(mem)); +} + +cublasHandle_t +StandardGpuResources::getBlasHandle(int device) { + initializeForDevice(device); + return blasHandles_[device]; +} + +cudaStream_t +StandardGpuResources::getDefaultStream(int device) { + initializeForDevice(device); + return defaultStreams_[device]; +} + +std::vector +StandardGpuResources::getAlternateStreams(int device) { + initializeForDevice(device); + return alternateStreams_[device]; +} + +DeviceMemory& StandardGpuResources::getMemoryManager(int device) { + initializeForDevice(device); + return *memory_[device]; +} + +std::pair +StandardGpuResources::getPinnedMemory() { + return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); +} + +cudaStream_t +StandardGpuResources::getAsyncCopyStream(int device) { + initializeForDevice(device); + return asyncCopyStreams_[device]; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.h b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.h new file mode 100644 index 0000000000..9d4ffa4c44 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.h @@ -0,0 +1,114 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Default implementation of GpuResources that allocates a cuBLAS +/// stream and 2 streams for use, as well as temporary memory +class StandardGpuResources : public GpuResources { + public: + StandardGpuResources(); + + ~StandardGpuResources() override; + + /// Disable allocation of temporary memory; all temporary memory + /// requests will call cudaMalloc / cudaFree at the point of use + void noTempMemory(); + + /// Specify that we wish to use a certain fixed size of memory on + /// all devices as temporary memory. This is the upper bound for the GPU + /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; + /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. + /// To avoid any temporary memory allocation, pass 0. 
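  ///
  /// A minimal configuration sketch (caller-side, with illustrative sizes):
  ///
  ///   faiss::gpu::StandardGpuResources res;
  ///   res.setTempMemory((size_t) 256 * 1024 * 1024);  // cap scratch space at 256 MiB
  ///   // or disable the scratch allocator entirely:
  ///   // res.noTempMemory();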
+ void setTempMemory(size_t size); + + /// Set amount of pinned memory to allocate, for async GPU <-> CPU + /// transfers + void setPinnedMemory(size_t size); + + /// Called to change the stream for work ordering + void setDefaultStream(int device, cudaStream_t stream); + + /// Called to change the work ordering streams to the null stream + /// for all devices + void setDefaultNullStreamAllDevices(); + + /// Enable or disable the warning about not having enough temporary memory + /// when cudaMalloc gets called + void setCudaMallocWarning(bool b); + + public: + /// Internal system calls + + /// Initialize resources for this device + void initializeForDevice(int device) override; + + cublasHandle_t getBlasHandle(int device) override; + + cudaStream_t getDefaultStream(int device) override; + + std::vector getAlternateStreams(int device) override; + + DeviceMemory& getMemoryManager(int device) override; + + std::pair getPinnedMemory() override; + + cudaStream_t getAsyncCopyStream(int device) override; + + private: + /// Have GPU resources been initialized for this device yet? + bool isInitialized(int device) const; + + /// Adjust the default temporary memory allocation based on the total GPU + /// memory size + static size_t getDefaultTempMemForGPU(int device, size_t requested); + + private: + /// Our default stream that work is ordered on, one per each device + std::unordered_map defaultStreams_; + + /// This contains particular streams as set by the user for + /// ordering, if any + std::unordered_map userDefaultStreams_; + + /// Other streams we can use, per each device + std::unordered_map > alternateStreams_; + + /// Async copy stream to use for GPU <-> CPU pinned memory copies + std::unordered_map asyncCopyStreams_; + + /// cuBLAS handle for each device + std::unordered_map blasHandles_; + + /// Temporary memory provider, per each device + std::unordered_map > memory_; + + /// Pinned memory allocation for use with this GPU + void* pinnedMemAlloc_; + size_t pinnedMemAllocSize_; + + /// Another option is to use a specified amount of memory on all + /// devices + size_t tempMemSize_; + + /// Amount of pinned memory we should allocate + size_t pinnedMemSize_; + + /// Whether or not a warning upon cudaMalloc is generated + bool cudaMallocWarning_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cu b/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cu new file mode 100644 index 0000000000..9c91ae2182 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cu @@ -0,0 +1,316 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Number of warps that the kernel is instantiated with +constexpr int kWarps = 8; +constexpr int kLanes = kWarpSize; + +constexpr int kMaxDistance = std::numeric_limits::max(); + +// Performs a binary matrix multiplication, returning the lowest k results in +// `vecs` for each `query` in terms of Hamming distance (a fused kernel) +// Each warp calculates distance for a single query +template +__launch_bounds__(kWarps * kLanes) +__global__ void binaryDistanceAnySize(const Tensor vecs, + const Tensor query, + Tensor outK, + Tensor outV, + int k) { + // A matrix tile (query, k) + __shared__ BinaryType queryTile[kWarps][kLanes + 1]; // avoid bank conflict + + // B matrix tile (vec, k) + __shared__ BinaryType vecTile[kLanes][kLanes + 1]; // avoid bank conflict + + WarpSelect, + NumWarpQ, NumThreadQ, kWarps * kLanes> + heap(kMaxDistance, -1, k); + + int warpId = threadIdx.y; + int laneId = threadIdx.x; + + // Each warp handles a single query + int warpQuery = blockIdx.x * kWarps + warpId; + bool queryInBounds = warpQuery < query.getSize(0); + + // Each warp loops through the entire chunk of vectors + for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { + int threadDistance = 0; + + // Reduction dimension + for (int blockK = 0; blockK < vecs.getSize(1); blockK += kLanes) { + int laneK = blockK + laneId; + bool kInBounds = laneK < vecs.getSize(1); + + queryTile[warpId][laneId] = queryInBounds && kInBounds ? + query[warpQuery][laneK] : 0; + + // kWarps warps are responsible for loading 32 vecs +#pragma unroll + for (int i = 0; i < kLanes / kWarps; ++i) { + int warpVec = i * kWarps + warpId; + int vec = blockVec + warpVec; + bool vecInBounds = vec < vecs.getSize(0); + + vecTile[warpVec][laneId] = vecInBounds && kInBounds ? + vecs[vec][laneK] : 0; + } + + __syncthreads(); + + // Compare distances +#pragma unroll + for (int i = 0; i < kLanes; ++i) { + threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]); + } + + __syncthreads(); + } + + // Lanes within a warp are different vec results against the same query + // Only submit distances which represent real (query, vec) pairs + bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0)); + threadDistance = valInBounds ? threadDistance : kMaxDistance; + int id = valInBounds ? blockVec + laneId : -1; + + heap.add(threadDistance, id); + } + + heap.reduce(); + + if (warpQuery < query.getSize(0)) { + heap.writeOut(outK[warpQuery].data(), + outV[warpQuery].data(), + k); + } +} + +// Version of the kernel that avoids a loop over the reduction dimension, and +// thus avoids reloading the query vectors +template +__global__ void +__launch_bounds__(kWarps * kLanes) +binaryDistanceLimitSize(const Tensor vecs, + const Tensor query, + Tensor outK, + Tensor outV, + int k) { + // A matrix tile (query, k) + __shared__ BinaryType queryTile[kWarps][kLanes + 1]; // avoid bank conflict + + // B matrix tile (vec, k) + __shared__ BinaryType vecTile[kLanes][kLanes + 1]; // avoid bank conflict + + WarpSelect, + NumWarpQ, NumThreadQ, kWarps * kLanes> + heap(kMaxDistance, -1, k); + + int warpId = threadIdx.y; + int laneId = threadIdx.x; + + // Each warp handles a single query + int laneK = laneId; + int warpQuery = blockIdx.x * kWarps + warpId; + bool kInBounds = laneK < vecs.getSize(1); + bool queryInBounds = warpQuery < query.getSize(0); + + + queryTile[warpId][laneId] = queryInBounds && kInBounds ? 
+ query[warpQuery][laneK] : 0; + + // Each warp loops through the entire chunk of vectors + for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { + int threadDistance = 0; + + // kWarps warps are responsible for loading 32 vecs +#pragma unroll + for (int i = 0; i < kLanes / kWarps; ++i) { + int warpVec = i * kWarps + warpId; + int vec = blockVec + warpVec; + bool vecInBounds = vec < vecs.getSize(0); + + vecTile[warpVec][laneId] = vecInBounds && kInBounds ? + vecs[vec][laneK] : 0; + } + + __syncthreads(); + + // Compare distances +#pragma unroll + for (int i = 0; i < ReductionLimit; ++i) { + threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]); + } + + __syncthreads(); + + // Lanes within a warp are different vec results against the same query + // Only submit distances which represent real (query, vec) pairs + bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0)); + threadDistance = valInBounds ? threadDistance : kMaxDistance; + int id = valInBounds ? blockVec + laneId : -1; + + heap.add(threadDistance, id); + } + + heap.reduce(); + + if (warpQuery < query.getSize(0)) { + heap.writeOut(outK[warpQuery].data(), + outV[warpQuery].data(), + k); + } +} + +template +void runBinaryDistanceAnySize(Tensor& vecs, + Tensor& query, + Tensor& outK, + Tensor& outV, + int k, cudaStream_t stream) { + dim3 grid(utils::divUp(query.getSize(0), kWarps)); + dim3 block(kLanes, kWarps); + + if (k == 1) { + binaryDistanceAnySize<1, 1, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 32) { + binaryDistanceAnySize<32, 2, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 64) { + binaryDistanceAnySize<64, 3, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 128) { + binaryDistanceAnySize<128, 3, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 256) { + binaryDistanceAnySize<256, 4, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 512) { + binaryDistanceAnySize<512, 8, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 1024) { + binaryDistanceAnySize<1024, 8, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } +#if GPU_MAX_SELECTION_K >= 2048 + else if (k <= 2048) { + binaryDistanceAnySize<2048, 8, BinaryType> + <<>>( + vecs, query, outK, outV, k); + } +#endif +} + +template +void runBinaryDistanceLimitSize(Tensor& vecs, + Tensor& query, + Tensor& outK, + Tensor& outV, + int k, cudaStream_t stream) { + dim3 grid(utils::divUp(query.getSize(0), kWarps)); + dim3 block(kLanes, kWarps); + + if (k == 1) { + binaryDistanceLimitSize<1, 1, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 32) { + binaryDistanceLimitSize<32, 2, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 64) { + binaryDistanceLimitSize<64, 3, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 128) { + binaryDistanceLimitSize<128, 3, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 256) { + binaryDistanceLimitSize<256, 4, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 512) { + binaryDistanceLimitSize<512, 8, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } else if (k <= 1024) { + binaryDistanceLimitSize<1024, 8, BinaryType, ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } +#if GPU_MAX_SELECTION_K >= 2048 + else if (k <= 2048) { + binaryDistanceLimitSize<2048, 8, BinaryType, 
ReductionLimit> + <<>>( + vecs, query, outK, outV, k); + } +#endif +} + +void runBinaryDistance(Tensor& vecs, + Tensor& query, + Tensor& outK, + Tensor& outV, + int k, cudaStream_t stream) { + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + FAISS_ASSERT(vecs.getSize(1) == query.getSize(1)); + + FAISS_ASSERT(outK.getSize(1) == k); + FAISS_ASSERT(outV.getSize(1) == k); + + // For the optimized uint32 kernel, we handle 32 * 8 = 256 max dims + constexpr int kReductionLimit32 = 8; + + // For the optimized uint8 kernel, we handle 8 * 16 = 128 max dims + constexpr int kReductionLimit8 = 16; + + // All other cases (large or small) go through the general kernel + + if (vecs.getSize(1) % sizeof(unsigned int) == 0 && + (vecs.getSize(1) / sizeof(unsigned int)) <= kReductionLimit32) { + auto vecs32 = vecs.castResize(); + auto query32 = query.castResize(); + + // Optimize for vectors with dimensions a multiple of 32 that are less than + // 32 * kReductionLimit (256) dimensions in size + runBinaryDistanceLimitSize( + vecs32, query32, outK, outV, k, stream); + + } else if (vecs.getSize(1) <= kReductionLimit8) { + // Optimize for vectors with dimensions a multiple of 32 that are less than + // 32 * kReductionLimit (256) dimensions in size + runBinaryDistanceLimitSize( + vecs, query, outK, outV, k, stream); + } else { + // Arbitrary size kernel + runBinaryDistanceAnySize( + vecs, query, outK, outV, k, stream); + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cuh new file mode 100644 index 0000000000..149accc016 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BinaryDistance.cuh @@ -0,0 +1,21 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include + +namespace faiss { namespace gpu { + +// Performs brute-force k-NN comparison between `vecs` and `query`, where they +// are encoded as binary vectors +void runBinaryDistance(Tensor& vecs, + Tensor& query, + Tensor& outK, + Tensor& outV, + int k, cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cu new file mode 100644 index 0000000000..dd38fdd7dd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cu @@ -0,0 +1,88 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +BinaryFlatIndex::BinaryFlatIndex(GpuResources* res, + int dim, + MemorySpace space) : + resources_(res), + dim_(dim), + space_(space), + num_(0), + rawData_(space) { + FAISS_ASSERT(dim % 8 == 0); +} + +/// Returns the number of vectors we contain +int BinaryFlatIndex::getSize() const { + return vectors_.getSize(0); +} + +int BinaryFlatIndex::getDim() const { + return vectors_.getSize(1) * 8; +} + +void +BinaryFlatIndex::reserve(size_t numVecs, cudaStream_t stream) { + rawData_.reserve(numVecs * (dim_ / 8) * sizeof(unsigned int), stream); +} + +Tensor& +BinaryFlatIndex::getVectorsRef() { + return vectors_; +} + +void +BinaryFlatIndex::query(Tensor& input, + int k, + Tensor& outDistances, + Tensor& outIndices) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + runBinaryDistance(vectors_, + input, + outDistances, + outIndices, + k, + stream); +} + +void +BinaryFlatIndex::add(const unsigned char* data, + int numVecs, + cudaStream_t stream) { + if (numVecs == 0) { + return; + } + + rawData_.append((char*) data, + (size_t) (dim_ / 8) * numVecs * sizeof(unsigned char), + stream, + true /* reserve exactly */); + + num_ += numVecs; + + DeviceTensor vectors( + (unsigned char*) rawData_.data(), {(int) num_, (dim_ / 8)}, space_); + vectors_ = std::move(vectors); +} + +void +BinaryFlatIndex::reset() { + rawData_.clear(); + vectors_ = std::move(DeviceTensor()); + num_ = 0; +} + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cuh new file mode 100644 index 0000000000..c99afc45a7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BinaryFlatIndex.cuh @@ -0,0 +1,69 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +/// Holder of GPU resources for a particular flat index +class BinaryFlatIndex { + public: + BinaryFlatIndex(GpuResources* res, + int dim, + MemorySpace space); + + /// Returns the number of vectors we contain + int getSize() const; + + int getDim() const; + + /// Reserve storage that can contain at least this many vectors + void reserve(size_t numVecs, cudaStream_t stream); + + /// Returns a reference to our vectors currently in use + Tensor& getVectorsRef(); + + void query(Tensor& vecs, + int k, + Tensor& outDistances, + Tensor& outIndices); + + /// Add vectors to ourselves; the pointer passed can be on the host + /// or the device + void add(const unsigned char* data, int numVecs, cudaStream_t stream); + + /// Free all storage + void reset(); + + private: + /// Collection of GPU resources that we use + GpuResources* resources_; + + /// Dimensionality of our vectors + const int dim_; + + /// Memory space for our allocations + MemorySpace space_; + + /// How many vectors we have + int num_; + + /// The underlying expandable storage + DeviceVector rawData_; + + /// Vectors currently in rawData_ + DeviceTensor vectors_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu new file mode 100644 index 0000000000..364200c3e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu @@ -0,0 +1,354 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +__global__ void sumAlongColumns(Tensor input, + Tensor output) { + static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows"); + + // blockIdx.x: which chunk of rows we are responsible for updating + // blockIdx.y: which chunk of columns we are responsible for + // updating + int rowStart = blockIdx.x * kRowsPerBlock; + int rowEnd = rowStart + kRowsPerBlock; + int colStart = blockIdx.y * blockDim.x * kColLoad; + + // FIXME: if we have exact multiples, don't need this + bool endRow = (blockIdx.x == gridDim.x - 1); + bool endCol = (blockIdx.y == gridDim.y - 1); + + if (endRow) { + if (output.getSize(0) % kRowsPerBlock == 0) { + endRow = false; + } + } + + if (endCol) { + for (int col = colStart + threadIdx.x; + col < input.getSize(0); col += blockDim.x) { + T val = input[col]; + + if (endRow) { + for (int row = rowStart; row < output.getSize(0); ++row) { + T out = output[row][col]; + out = Math::add(out, val); + output[row][col] = out; + } + } else { + T rows[kRowUnroll]; + + for (int row = rowStart; row < rowEnd; row += kRowUnroll) { +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { + rows[i] = output[row + i][col]; + } + +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { + rows[i] = Math::add(rows[i], val); + } + +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { + output[row + i][col] = rows[i]; + } + } + } + } + } else { + int col = colStart + threadIdx.x; + + T val[kColLoad]; + +#pragma unroll + for (int i = 0; i < kColLoad; ++i) { + val[i] = input[col + i * blockDim.x]; + } + + if (endRow) { + for (int row = rowStart; row < output.getSize(0); ++row) { +#pragma unroll + for (int i = 0; i < kColLoad; ++i) { + T out = output[row][col + i 
* blockDim.x]; + out = Math::add(out, val[i]); + output[row][col + i * blockDim.x] = out; + } + } + } else { + T rows[kRowUnroll * kColLoad]; + + for (int row = rowStart; row < rowEnd; row += kRowUnroll) { +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kColLoad; ++j) { + rows[i * kColLoad + j] = + output[row + i][col + j * blockDim.x]; + } + } + +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kColLoad; ++j) { + rows[i * kColLoad + j] = + Math::add(rows[i * kColLoad + j], val[j]); + } + } + +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kColLoad; ++j) { + output[row + i][col + j * blockDim.x] = + rows[i * kColLoad + j]; + } + } + } + } + } +} + +template +__global__ void assignAlongColumns(Tensor input, + Tensor output) { + static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows"); + + // blockIdx.x: which chunk of rows we are responsible for updating + // blockIdx.y: which chunk of columns we are responsible for + // updating + int rowStart = blockIdx.x * kRowsPerBlock; + int rowEnd = rowStart + kRowsPerBlock; + int colStart = blockIdx.y * blockDim.x * kColLoad; + + // FIXME: if we have exact multiples, don't need this + bool endRow = (blockIdx.x == gridDim.x - 1); + bool endCol = (blockIdx.y == gridDim.y - 1); + + if (endRow) { + if (output.getSize(0) % kRowsPerBlock == 0) { + endRow = false; + } + } + + if (endCol) { + for (int col = colStart + threadIdx.x; + col < input.getSize(0); col += blockDim.x) { + T val = input[col]; + + if (endRow) { + for (int row = rowStart; row < output.getSize(0); ++row) { + output[row][col] = val; + } + } else { + for (int row = rowStart; row < rowEnd; row += kRowUnroll) { +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { + output[row + i][col] = val; + } + } + } + } + } else { + int col = colStart + threadIdx.x; + + T val[kColLoad]; + +#pragma unroll + for (int i = 0; i < kColLoad; ++i) { + val[i] = input[col + i * blockDim.x]; + } + + if (endRow) { + for (int row = rowStart; row < output.getSize(0); ++row) { +#pragma unroll + for (int i = 0; i < kColLoad; ++i) { + output[row][col + i * blockDim.x] = val[i]; + } + } + } else { + for (int row = rowStart; row < rowEnd; row += kRowUnroll) { +#pragma unroll + for (int i = 0; i < kRowUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kColLoad; ++j) { + output[row + i][col + j * blockDim.x] = val[j]; + } + } + } + } + } +} + +template +__global__ void sumAlongRows(Tensor input, + Tensor output) { + __shared__ T sval; + + int row = blockIdx.x; + + if (threadIdx.x == 0) { + sval = input[row]; + } + + __syncthreads(); + + T val = sval; + + // FIXME: speed up + for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) { + T out = output[row][i]; + out = Math::add(out, val); + out = Math::lt(out, Math::zero()) ? 
Math::zero() : out; + + output[row][i] = out; + } +} + +template +void runSumAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + FAISS_ASSERT(input.getSize(0) == output.getSize(1)); + + int threadsPerBlock = 256; + constexpr int kRowUnroll = 4; + constexpr int kRowsPerBlock = kRowUnroll * 4; + constexpr int kColLoad = 4; + + auto block = dim3(threadsPerBlock); + + if (input.template canCastResize() && + output.template canCastResize()) { + auto inputV = input.template castResize(); + auto outputV = output.template castResize(); + + auto grid = + dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock), + utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad)); + + sumAlongColumns + <<>>(inputV, outputV); + } else { + auto grid = + dim3(utils::divUp(output.getSize(0), kRowsPerBlock), + utils::divUp(output.getSize(1), threadsPerBlock * kColLoad)); + + sumAlongColumns + <<>>(input, output); + } + + CUDA_TEST_ERROR(); +} + +void runSumAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + runSumAlongColumns(input, output, stream); +} + +void runSumAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + runSumAlongColumns(input, output, stream); +} + +template +void runAssignAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + FAISS_ASSERT(input.getSize(0) == output.getSize(1)); + + int threadsPerBlock = 256; + constexpr int kRowUnroll = 4; + constexpr int kRowsPerBlock = kRowUnroll * 4; + constexpr int kColLoad = 4; + + auto block = dim3(threadsPerBlock); + + if (input.template canCastResize() && + output.template canCastResize()) { + auto inputV = input.template castResize(); + auto outputV = output.template castResize(); + + auto grid = + dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock), + utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad)); + + assignAlongColumns + <<>>(inputV, outputV); + } else { + auto grid = + dim3(utils::divUp(output.getSize(0), kRowsPerBlock), + utils::divUp(output.getSize(1), threadsPerBlock * kColLoad)); + + assignAlongColumns + <<>>(input, output); + } + + CUDA_TEST_ERROR(); +} + +void runAssignAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + runAssignAlongColumns(input, output, stream); +} + +void runAssignAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream) { + runAssignAlongColumns(input, output, stream); +} + +template +void runSumAlongRows(Tensor& input, + Tensor& output, + bool zeroClamp, + cudaStream_t stream) { + FAISS_ASSERT(input.getSize(0) == output.getSize(0)); + + int threadsPerBlock = + std::min(output.getSize(1), getMaxThreadsCurrentDevice()); + auto grid = dim3(output.getSize(0)); + auto block = dim3(threadsPerBlock); + + if (zeroClamp) { + sumAlongRows<<>>(input, output); + } else { + sumAlongRows<<>>(input, output); + } + + CUDA_TEST_ERROR(); +} + +void runSumAlongRows(Tensor& input, + Tensor& output, + bool zeroClamp, + cudaStream_t stream) { + runSumAlongRows(input, output, zeroClamp, stream); +} + +void runSumAlongRows(Tensor& input, + Tensor& output, + bool zeroClamp, + cudaStream_t stream) { + runSumAlongRows(input, output, zeroClamp, stream); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh new file mode 100644 index 0000000000..8c4b27452c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh @@ -0,0 +1,45 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +// output[x][i] += input[i] for all x +void runSumAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream); + +void runSumAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream); + +// output[x][i] = input[i] for all x +void runAssignAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream); + +void runAssignAlongColumns(Tensor& input, + Tensor& output, + cudaStream_t stream); + +// output[i][x] += input[i] for all x +// If zeroClamp, output[i][x] = max(output[i][x] + input[i], 0) for all x +void runSumAlongRows(Tensor& input, + Tensor& output, + bool zeroClamp, + cudaStream_t stream); + +void runSumAlongRows(Tensor& input, + Tensor& output, + bool zeroClamp, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu new file mode 100644 index 0000000000..986c2eee3b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu @@ -0,0 +1,531 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +namespace { + +template +Tensor sliceCentroids(Tensor& centroids, + bool centroidsRowMajor, + int startCentroid, + int num) { + // Row major is (num, dim) + // Col major is (dim, num) + if (startCentroid == 0 && + num == centroids.getSize(centroidsRowMajor ? 0 : 1)) { + return centroids; + } + + return centroids.narrow(centroidsRowMajor ? 0 : 1, startCentroid, num); +} + +// For each chunk of k indices, increment the index by chunk * increment +template +__global__ void incrementIndex(Tensor indices, + int k, + int increment) { + for (int i = threadIdx.x; i < k; i += blockDim.x) { + indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment; + } +} + +// Used to update result indices in distance computation where the number of +// centroids is high, and is tiled +template +void runIncrementIndex(Tensor& indices, + int k, + int increment, + cudaStream_t stream) { + dim3 grid(indices.getSize(1) / k, indices.getSize(0)); + int block = std::min(k, 512); + + // should be exact + FAISS_ASSERT(grid.x * k == indices.getSize(1)); + + incrementIndex<<>>(indices, k, increment); + + cudaDeviceSynchronize(); +} + +// If the inner size (dim) of the vectors is small, we want a larger query tile +// size, like 1024 + +void chooseTileSize(int numQueries, + int numCentroids, + int dim, + int elementSize, + size_t tempMemAvailable, + int& tileRows, + int& tileCols) { + // The matrix multiplication should be large enough to be efficient, but if it + // is too large, we seem to lose efficiency as opposed to double-streaming. + // Each tile size here defines 1/2 of the memory use due to double streaming. + // We ignore available temporary memory, as that is adjusted independently by + // the user and can thus meet these requirements (or not). + // For <= 4 GB GPUs, prefer 512 MB of usage. + // For <= 8 GB GPUs, prefer 768 MB of usage. + // Otherwise, prefer 1 GB of usage. 
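+ // Illustrative example (added for clarity; these numbers are not in the
+ // upstream heuristic, just one consistent instantiation of it): on a 16 GB
+ // GPU with float32 data (elementSize = 4), dim = 128, numQueries = 10000 and
+ // numCentroids = 1000000, we target 1 GB, so
+ //   targetUsage = 1073741824 / (2 * 4) = 134217728 elements;
+ //   preferredTileRows = 512 (dim > 32), tileRows = min(512, 10000) = 512;
+ //   tileCols = min(134217728 / 512, 1000000) = 262144.
+ // Each distance tile is then 512 x 262144 floats = 512 MB, and the two
+ // double-buffered tiles together hit the 1 GB target.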
+ auto totalMem = getCurrentDeviceProperties().totalGlobalMem; + + int targetUsage = 0; + + if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) { + targetUsage = 512 * 1024 * 1024; + } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) { + targetUsage = 768 * 1024 * 1024; + } else { + targetUsage = 1024 * 1024 * 1024; + } + + targetUsage /= 2 * elementSize; + + // 512 seems to be a batch size sweetspot for float32. + // If we are on float16, increase to 512. + // If the k size (vec dim) of the matrix multiplication is small (<= 32), + // increase to 1024. + int preferredTileRows = 512; + if (dim <= 32) { + preferredTileRows = 1024; + } + + tileRows = std::min(preferredTileRows, numQueries); + + // tileCols is the remainder size + tileCols = std::min(targetUsage / preferredTileRows, numCentroids); +} + +} + +template +void runDistance(bool computeL2, + GpuResources* resources, + Tensor& centroids, + bool centroidsRowMajor, + Tensor* centroidNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances) { + // The # of centroids in `centroids` based on memory layout + auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1); + + // The # of queries in `queries` based on memory layout + auto numQueries = queries.getSize(queriesRowMajor ? 0 : 1); + + // The dimensions of the vectors to consider + auto dim = queries.getSize(queriesRowMajor ? 1 : 0); + FAISS_ASSERT((numQueries == 0 || numCentroids == 0) || + dim == centroids.getSize(centroidsRowMajor ? 1 : 0)); + + FAISS_ASSERT(outDistances.getSize(0) == numQueries); + FAISS_ASSERT(outIndices.getSize(0) == numQueries); + FAISS_ASSERT(outDistances.getSize(1) == k); + FAISS_ASSERT(outIndices.getSize(1) == k); + + auto& mem = resources->getMemoryManagerCurrentDevice(); + auto defaultStream = resources->getDefaultStreamCurrentDevice(); + + // If we're quering against a 0 sized set, just return empty results + if (centroids.numElements() == 0) { + thrust::fill(thrust::cuda::par.on(defaultStream), + outDistances.data(), outDistances.end(), + Limits::getMax()); + + thrust::fill(thrust::cuda::par.on(defaultStream), + outIndices.data(), outIndices.end(), + -1); + + return; + } + + // L2: If ||c||^2 is not pre-computed, calculate it + DeviceTensor cNorms; + if (computeL2 && !centroidNorms) { + cNorms = + std::move(DeviceTensor(mem, + {numCentroids}, defaultStream)); + runL2Norm(centroids, centroidsRowMajor, cNorms, true, defaultStream); + centroidNorms = &cNorms; + } + + // + // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed + // + int qNormSize[1] = {numQueries}; + DeviceTensor queryNorms(mem, qNormSize, defaultStream); + + // ||q||^2 + if (computeL2) { + runL2Norm(queries, queriesRowMajor, queryNorms, true, defaultStream); + } + + // By default, aim to use up to 512 MB of memory for the processing, with both + // number of queries and number of centroids being at least 512. 
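+ // Added clarifying note (not in the upstream source): the L2 branch below
+ // relies on the expansion
+ //   ||q - c||^2 = ||q||^2 - 2 * (q . c) + ||c||^2
+ // The tiled GEMM produces the -2 * (q . c) term, runL2SelectMin fuses the
+ // "+ ||c||^2" broadcast with the k-selection, and runSumAlongRows finally
+ // adds ||q||^2, clamping at zero so roundoff cannot yield negative distances.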
+ int tileRows = 0; + int tileCols = 0; + chooseTileSize(numQueries, + numCentroids, + dim, + sizeof(T), + mem.getSizeAvailable(), + tileRows, + tileCols); + + int numColTiles = utils::divUp(numCentroids, tileCols); + + // We can have any number of vectors to query against, even less than k, in + // which case we'll return -1 for the index + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation + + // Temporary output memory space we'll use + DeviceTensor distanceBuf1( + mem, {tileRows, tileCols}, defaultStream); + DeviceTensor distanceBuf2( + mem, {tileRows, tileCols}, defaultStream); + DeviceTensor* distanceBufs[2] = + {&distanceBuf1, &distanceBuf2}; + + DeviceTensor outDistanceBuf1( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor outDistanceBuf2( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor* outDistanceBufs[2] = + {&outDistanceBuf1, &outDistanceBuf2}; + + DeviceTensor outIndexBuf1( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor outIndexBuf2( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor* outIndexBufs[2] = + {&outIndexBuf1, &outIndexBuf2}; + + auto streams = resources->getAlternateStreamsCurrentDevice(); + streamWait(streams, {defaultStream}); + + int curStream = 0; + bool interrupt = false; + + // Tile over the input queries + for (int i = 0; i < numQueries; i += tileRows) { + if (interrupt || InterruptCallback::is_interrupted()) { + interrupt = true; + break; + } + + int curQuerySize = std::min(tileRows, numQueries - i); + + auto outDistanceView = + outDistances.narrow(0, i, curQuerySize); + auto outIndexView = + outIndices.narrow(0, i, curQuerySize); + + auto queryView = + queries.narrow(queriesRowMajor ? 0 : 1, i, curQuerySize); + auto queryNormNiew = + queryNorms.narrow(0, i, curQuerySize); + + auto outDistanceBufRowView = + outDistanceBufs[curStream]->narrow(0, 0, curQuerySize); + auto outIndexBufRowView = + outIndexBufs[curStream]->narrow(0, 0, curQuerySize); + + // Tile over the centroids + for (int j = 0; j < numCentroids; j += tileCols) { + if (InterruptCallback::is_interrupted()) { + interrupt = true; + break; + } + + int curCentroidSize = std::min(tileCols, numCentroids - j); + int curColTile = j / tileCols; + + auto centroidsView = + sliceCentroids(centroids, centroidsRowMajor, j, curCentroidSize); + + auto distanceBufView = distanceBufs[curStream]-> + narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize); + + auto outDistanceBufColView = + outDistanceBufRowView.narrow(1, k * curColTile, k); + auto outIndexBufColView = + outIndexBufRowView.narrow(1, k * curColTile, k); + + // L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc + // IP: just compute qc + // (query id x dim) x (centroid id, dim)' = (query id, centroid id) + runMatrixMult(distanceBufView, + false, // not transposed + queryView, + !queriesRowMajor, // transposed MM if col major + centroidsView, + centroidsRowMajor, // transposed MM if row major + computeL2 ? 
-2.0f : 1.0f, + 0.0f, + useHgemm, + resources->getBlasHandleCurrentDevice(), + streams[curStream]); + + if (computeL2) { + // For L2 distance, we use this fused kernel that performs both + // adding ||c||^2 to -2qc and k-selection, so we only need two + // passes (one write by the gemm, one read here) over the huge + // region of output memory + // + // If we aren't tiling along the number of centroids, we can perform the + // output work directly + if (tileCols == numCentroids) { + // Write into the final output + runL2SelectMin(distanceBufView, + *centroidNorms, + outDistanceView, + outIndexView, + k, + streams[curStream]); + + if (!ignoreOutDistances) { + // expand (query id) to (query id, k) by duplicating along rows + // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k) + runSumAlongRows(queryNormNiew, + outDistanceView, + true, // L2 distances should not go below zero due + // to roundoff error + streams[curStream]); + } + } else { + auto centroidNormsView = centroidNorms->narrow(0, j, curCentroidSize); + + // Write into our intermediate output + runL2SelectMin(distanceBufView, + centroidNormsView, + outDistanceBufColView, + outIndexBufColView, + k, + streams[curStream]); + + if (!ignoreOutDistances) { + // expand (query id) to (query id, k) by duplicating along rows + // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k) + runSumAlongRows(queryNormNiew, + outDistanceBufColView, + true, // L2 distances should not go below zero due + // to roundoff error + streams[curStream]); + } + } + } else { + // For IP, just k-select the output for this tile + if (tileCols == numCentroids) { + // Write into the final output + runBlockSelect(distanceBufView, + outDistanceView, + outIndexView, + true, k, streams[curStream]); + } else { + // Write into the intermediate output + runBlockSelect(distanceBufView, + outDistanceBufColView, + outIndexBufColView, + true, k, streams[curStream]); + } + } + } + + // As we're finished with processing a full set of centroids, perform the + // final k-selection + if (tileCols != numCentroids) { + // The indices are tile-relative; for each tile of k, we need to add + // tileCols to the index + runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]); + + runBlockSelectPair(outDistanceBufRowView, + outIndexBufRowView, + outDistanceView, + outIndexView, + computeL2 ? 
false : true, k, streams[curStream]); + } + + curStream = (curStream + 1) % 2; + } + + // Have the desired ordering stream wait on the multi-stream + streamWait({defaultStream}, streams); + + if (interrupt) { + FAISS_THROW_MSG("interrupted"); + } +} + +template +void runL2Distance(GpuResources* resources, + Tensor& centroids, + bool centroidsRowMajor, + Tensor* centroidNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances = false) { + runDistance(true, // L2 + resources, + centroids, + centroidsRowMajor, + centroidNorms, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + useHgemm, + ignoreOutDistances); +} + +template +void runIPDistance(GpuResources* resources, + Tensor& centroids, + bool centroidsRowMajor, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm) { + runDistance(false, // IP + resources, + centroids, + centroidsRowMajor, + nullptr, // no centroid norms provided + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + useHgemm, + false); +} + +// +// Instantiations of the distance templates +// + +void +runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices) { + runIPDistance(resources, + vectors, + vectorsRowMajor, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + false); +} + +void +runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm) { + runIPDistance(resources, + vectors, + vectorsRowMajor, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + useHgemm); +} + +void +runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool ignoreOutDistances) { + runL2Distance(resources, + vectors, + vectorsRowMajor, + vectorNorms, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + false, + ignoreOutDistances); +} + +void +runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances) { + runL2Distance(resources, + vectors, + vectorsRowMajor, + vectorNorms, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + useHgemm, + ignoreOutDistances); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh new file mode 100644 index 0000000000..0508eeeed1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -0,0 +1,68 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +/// Calculates brute-force L2 distance between `vectors` and +/// `queries`, returning the k closest results seen +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + // can be optionally pre-computed; nullptr if we + // have to compute it upon the call + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + // Do we care about `outDistances`? If not, we can + // take shortcuts. + bool ignoreOutDistances = false); + +/// Calculates brute-force inner product distance between `vectors` +/// and `queries`, returning the k closest results seen +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices); + +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm); + +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances = false); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu new file mode 100644 index 0000000000..08d4221dfd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu @@ -0,0 +1,308 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +FlatIndex::FlatIndex(GpuResources* res, + int dim, + bool l2Distance, + bool useFloat16, + bool useFloat16Accumulator, + bool storeTransposed, + MemorySpace space) : + resources_(res), + dim_(dim), + useFloat16_(useFloat16), + useFloat16Accumulator_(useFloat16Accumulator), + storeTransposed_(storeTransposed), + l2Distance_(l2Distance), + space_(space), + num_(0), + rawData_(space) { +} + +bool +FlatIndex::getUseFloat16() const { + return useFloat16_; +} + +/// Returns the number of vectors we contain +int FlatIndex::getSize() const { + if (useFloat16_) { + return vectorsHalf_.getSize(0); + } else { + return vectors_.getSize(0); + } +} + +int FlatIndex::getDim() const { + if (useFloat16_) { + return vectorsHalf_.getSize(1); + } else { + return vectors_.getSize(1); + } +} + +void +FlatIndex::reserve(size_t numVecs, cudaStream_t stream) { + if (useFloat16_) { + rawData_.reserve(numVecs * dim_ * sizeof(half), stream); + } else { + rawData_.reserve(numVecs * dim_ * sizeof(float), stream); + } +} + +Tensor& +FlatIndex::getVectorsFloat32Ref() { + // Should not call this unless we are in float32 mode + FAISS_ASSERT(!useFloat16_); + + return vectors_; +} + +Tensor& +FlatIndex::getVectorsFloat16Ref() { + // Should not call this unless we are in float16 mode + FAISS_ASSERT(useFloat16_); + + return vectorsHalf_; +} + +DeviceTensor +FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) { + return getVectorsFloat32Copy(0, num_, stream); +} + +DeviceTensor +FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) { + DeviceTensor vecFloat32({num, dim_}, space_); + + if (useFloat16_) { + auto halfNarrow = vectorsHalf_.narrowOutermost(from, num); + convertTensor(stream, halfNarrow, vecFloat32); + } else { + vectors_.copyTo(vecFloat32, stream); + } + + return vecFloat32; +} + +void +FlatIndex::query(Tensor& input, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + auto& mem = resources_->getMemoryManagerCurrentDevice(); + + if (useFloat16_) { + // We need to convert to float16 + auto inputHalf = convertTensor(resources_, + stream, + input); + + DeviceTensor outDistancesHalf( + mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); + + query(inputHalf, k, outDistancesHalf, outIndices, exactDistance); + + if (exactDistance) { + // Convert outDistances back + convertTensor(stream, + outDistancesHalf, + outDistances); + } + } else { + if (l2Distance_) { + runL2Distance(resources_, + storeTransposed_ ? vectorsTransposed_ : vectors_, + !storeTransposed_, // is vectors row major? + &norms_, + input, + true, // input is row major + k, + outDistances, + outIndices, + !exactDistance); + } else { + runIPDistance(resources_, + storeTransposed_ ? vectorsTransposed_ : vectors_, + !storeTransposed_, // is vectors row major? + input, + true, // input is row major + k, + outDistances, + outIndices); + } + } +} + +void +FlatIndex::query(Tensor& input, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance) { + FAISS_ASSERT(useFloat16_); + + if (l2Distance_) { + runL2Distance(resources_, + storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_, + !storeTransposed_, // is vectors row major? 
+ &normsHalf_, + input, + true, // input is row major + k, + outDistances, + outIndices, + useFloat16Accumulator_, + // FIXME + !exactDistance); + } else { + runIPDistance(resources_, + storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_, + !storeTransposed_, // is vectors row major? + input, + true, // input is row major + k, + outDistances, + outIndices, + useFloat16Accumulator_); + } +} + +void +FlatIndex::computeResidual(Tensor& vecs, + Tensor& listIds, + Tensor& residuals) { + if (useFloat16_) { + runCalcResidual(vecs, + getVectorsFloat16Ref(), + listIds, + residuals, + resources_->getDefaultStreamCurrentDevice()); + } else { + runCalcResidual(vecs, + getVectorsFloat32Ref(), + listIds, + residuals, + resources_->getDefaultStreamCurrentDevice()); + } +} + +void +FlatIndex::reconstruct(Tensor& listIds, + Tensor& vecs) { + if (useFloat16_) { + runReconstruct(listIds, + getVectorsFloat16Ref(), + vecs, + resources_->getDefaultStreamCurrentDevice()); + } else { + runReconstruct(listIds, + getVectorsFloat32Ref(), + vecs, + resources_->getDefaultStreamCurrentDevice()); + } +} + +void +FlatIndex::reconstruct(Tensor& listIds, + Tensor& vecs) { + auto listIds1 = listIds.downcastOuter<1>(); + auto vecs2 = vecs.downcastOuter<2>(); + + reconstruct(listIds1, vecs2); +} + +void +FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { + if (numVecs == 0) { + return; + } + + if (useFloat16_) { + // Make sure that `data` is on our device; we'll run the + // conversion on our device + auto devData = toDevice(resources_, + getCurrentDevice(), + (float*) data, + stream, + {numVecs, dim_}); + + auto devDataHalf = + convertTensor(resources_, stream, devData); + + rawData_.append((char*) devDataHalf.data(), + devDataHalf.getSizeInBytes(), + stream, + true /* reserve exactly */); + } else { + rawData_.append((char*) data, + (size_t) dim_ * numVecs * sizeof(float), + stream, + true /* reserve exactly */); + } + + num_ += numVecs; + + if (useFloat16_) { + DeviceTensor vectorsHalf( + (half*) rawData_.data(), {(int) num_, dim_}, space_); + vectorsHalf_ = std::move(vectorsHalf); + } else { + DeviceTensor vectors( + (float*) rawData_.data(), {(int) num_, dim_}, space_); + vectors_ = std::move(vectors); + } + + if (storeTransposed_) { + if (useFloat16_) { + vectorsHalfTransposed_ = + std::move(DeviceTensor({dim_, (int) num_}, space_)); + runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream); + } else { + vectorsTransposed_ = + std::move(DeviceTensor({dim_, (int) num_}, space_)); + runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream); + } + } + + if (l2Distance_) { + // Precompute L2 norms of our database + if (useFloat16_) { + DeviceTensor normsHalf({(int) num_}, space_); + runL2Norm(vectorsHalf_, true, normsHalf, true, stream); + normsHalf_ = std::move(normsHalf); + } else { + DeviceTensor norms({(int) num_}, space_); + runL2Norm(vectors_, true, norms, true, stream); + norms_ = std::move(norms); + } + } +} + +void +FlatIndex::reset() { + rawData_.clear(); + vectors_ = std::move(DeviceTensor()); + norms_ = std::move(DeviceTensor()); + num_ = 0; +} + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh new file mode 100644 index 0000000000..da7b640d69 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh @@ -0,0 +1,130 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +/// Holder of GPU resources for a particular flat index +class FlatIndex { + public: + FlatIndex(GpuResources* res, + int dim, + bool l2Distance, + bool useFloat16, + bool useFloat16Accumulator, + bool storeTransposed, + MemorySpace space); + + bool getUseFloat16() const; + + /// Returns the number of vectors we contain + int getSize() const; + + int getDim() const; + + /// Reserve storage that can contain at least this many vectors + void reserve(size_t numVecs, cudaStream_t stream); + + /// Returns a reference to our vectors currently in use + Tensor& getVectorsFloat32Ref(); + + /// Returns a reference to our vectors currently in use (useFloat16 mode) + Tensor& getVectorsFloat16Ref(); + + /// Performs a copy of the vectors on the given device, converting + /// as needed from float16 + DeviceTensor getVectorsFloat32Copy(cudaStream_t stream); + + /// Returns only a subset of the vectors + DeviceTensor getVectorsFloat32Copy(int from, + int num, + cudaStream_t stream); + + void query(Tensor& vecs, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance); + + void query(Tensor& vecs, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance); + + /// Compute residual for set of vectors + void computeResidual(Tensor& vecs, + Tensor& listIds, + Tensor& residuals); + + /// Gather vectors given the set of IDs + void reconstruct(Tensor& listIds, + Tensor& vecs); + + void reconstruct(Tensor& listIds, + Tensor& vecs); + + /// Add vectors to ourselves; the pointer passed can be on the host + /// or the device + void add(const float* data, int numVecs, cudaStream_t stream); + + /// Free all storage + void reset(); + + private: + /// Collection of GPU resources that we use + GpuResources* resources_; + + /// Dimensionality of our vectors + const int dim_; + + /// Float16 data format + const bool useFloat16_; + + /// For supporting hardware, whether or not we use Hgemm + const bool useFloat16Accumulator_; + + /// Store vectors in transposed layout for speed; makes addition to + /// the index slower + const bool storeTransposed_; + + /// L2 or inner product distance? + bool l2Distance_; + + /// Memory space for our allocations + MemorySpace space_; + + /// How many vectors we have + int num_; + + /// The underlying expandable storage + DeviceVector rawData_; + + /// Vectors currently in rawData_ + DeviceTensor vectors_; + DeviceTensor vectorsTransposed_; + + /// Vectors currently in rawData_, float16 form + DeviceTensor vectorsHalf_; + DeviceTensor vectorsHalfTransposed_; + + /// Precomputed L2 norms + DeviceTensor norms_; + + /// Precomputed L2 norms, float16 form + DeviceTensor normsHalf_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh b/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh new file mode 100644 index 0000000000..2c71669faa --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh @@ -0,0 +1,611 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +inline bool isSQSupported(ScalarQuantizer::QuantizerType qtype) { + switch (qtype) { + case ScalarQuantizer::QuantizerType::QT_8bit: + case ScalarQuantizer::QuantizerType::QT_8bit_uniform: + case ScalarQuantizer::QuantizerType::QT_8bit_direct: + case ScalarQuantizer::QuantizerType::QT_4bit: + case ScalarQuantizer::QuantizerType::QT_4bit_uniform: + case ScalarQuantizer::QuantizerType::QT_fp16: + return true; + default: + return false; + } +} + +// Wrapper around the CPU ScalarQuantizer that allows storage of parameters in +// GPU memory +struct GpuScalarQuantizer : public ScalarQuantizer { + GpuScalarQuantizer(const ScalarQuantizer& sq) + : ScalarQuantizer(sq), + gpuTrained(DeviceTensor({(int) sq.trained.size()})) { + HostTensor + cpuTrained((float*) sq.trained.data(), {(int) sq.trained.size()}); + + // Just use the default stream, as we're allocating memory above in any case + gpuTrained.copyFrom(cpuTrained, 0); + CUDA_VERIFY(cudaStreamSynchronize(0)); + } + + // ScalarQuantizer::trained copied to GPU memory + DeviceTensor gpuTrained; +}; + +// +// Quantizer codecs +// + +// QT is the quantizer type implemented +// DimMultiple is the minimum guaranteed dimension multiple of the vectors +// encoded (used for ensuring alignment for memory load/stores) +template +struct Codec { }; + +///// +// +// 32 bit encodings +// (does not use qtype) +// +///// + +struct CodecFloat { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 1; + + CodecFloat(int vecBytes) : bytesPerVec(vecBytes) { } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + float* p = (float*) &((uint8_t*) data)[vec * bytesPerVec]; + out[0] = p[d]; + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + // doesn't need implementing (kDimPerIter == 1) + return 0.0f; + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + float* p = (float*) &((uint8_t*) data)[vec * bytesPerVec]; + p[d] = v[0]; + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + // doesn't need implementing (kDimPerIter == 1) + } + + int bytesPerVec; +}; + +///// +// +// 16 bit encodings +// +///// + +// Arbitrary dimension fp16 +template <> +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 1; + + Codec(int vecBytes) : bytesPerVec(vecBytes) { } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + half* p = (half*) &((uint8_t*) data)[vec * bytesPerVec]; + out[0] = Convert()(p[d]); + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + // doesn't need implementing (kDimPerIter == 1) + return 0.0f; + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + half* p = (half*) &((uint8_t*) data)[vec * bytesPerVec]; + p[d] = Convert()(v[0]); + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + // doesn't need implementing (kDimPerIter == 1) + } + + int 
bytesPerVec; +}; + +// dim % 2 == 0, ensures uint32 alignment +template <> +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 2; + + Codec(int vecBytes) : bytesPerVec(vecBytes) { } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + half2* p = (half2*) &((uint8_t*) data)[vec * bytesPerVec]; + half2 pd = p[d]; + + out[0] = Convert()(pd.x); + out[1] = Convert()(pd.y); + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + // should not be called + assert(false); + return 0; + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + half2* p = (half2*) &((uint8_t*) data)[vec * bytesPerVec]; + half h0 = Convert()(v[0]); + half h1 = Convert()(v[1]); + + half2 h; + h.x = h0; + h.y = h1; + + p[d] = h; + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + // should not be called + assert(false); + } + + int bytesPerVec; +}; + +///// +// +// 8 bit encodings +// +///// + +template +struct Get8BitType { }; + +template <> +struct Get8BitType<1> { using T = uint8_t; }; + +template <> +struct Get8BitType<2> { using T = uint16_t; }; + +template <> +struct Get8BitType<4> { using T = uint32_t; }; + +// Uniform quantization across all dimensions +template +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = DimMultiple; + using MemT = typename Get8BitType::T; + + Codec(int vecBytes, float min, float diff) + : bytesPerVec(vecBytes), vmin(min), vdiff(diff) { + } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ float decodeHelper(uint8_t v) const { + float x = (((float) v) + 0.5f) / 255.0f; + return vmin + x * vdiff; + } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec]; + MemT pv = p[d]; + + uint8_t x[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + x[i] = (uint8_t) ((pv >> (i * 8)) & 0xffU); + } + + float xDec[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + xDec[i] = decodeHelper(x[i]); + } + + #pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + out[i] = xDec[i]; + } + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + if (DimMultiple > 1) { + // should not be called + assert(false); + } + + // otherwise does not need implementing + return 0; + } + + inline __device__ uint8_t encodeHelper(float v) const { + float x = (v - vmin) / vdiff; + x = fminf(1.0f, fmaxf(0.0f, x)); + return (uint8_t) (255 * x); + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec]; + + MemT x[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + x[i] = encodeHelper(v[i]); + } + + MemT out = 0; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + out |= (x[i] << (i * 8)); + } + + p[d] = out; + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + if (DimMultiple > 1) { + // should not be called + assert(false); + } + + // otherwise 
does not need implementing + } + + int bytesPerVec; + const float vmin; + const float vdiff; +}; + +// Uniform quantization per each dimension +template +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = DimMultiple; + using MemT = typename Get8BitType::T; + + Codec(int vecBytes, float* min, float* diff) + : bytesPerVec(vecBytes), vmin(min), vdiff(diff), + smemVmin(nullptr), + smemVdiff(nullptr) { + } + + size_t getSmemSize(int dim) { + return sizeof(float) * dim * 2; + } + + inline __device__ void setSmem(float* smem, int dim) { + smemVmin = smem; + smemVdiff = smem + dim; + + for (int i = threadIdx.x; i < dim; i += blockDim.x) { + smemVmin[i] = vmin[i]; + smemVdiff[i] = vdiff[i]; + } + } + + inline __device__ float decodeHelper(uint8_t v, int realDim) const { + float x = (((float) v) + 0.5f) / 255.0f; + return smemVmin[realDim] + x * smemVdiff[realDim]; + } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec]; + MemT pv = p[d]; + int realDim = d * kDimPerIter; + + uint8_t x[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + x[i] = (uint8_t) ((pv >> (i * 8)) & 0xffU); + } + + float xDec[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + xDec[i] = decodeHelper(x[i], realDim + i); + } + + #pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + out[i] = xDec[i]; + } + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + if (DimMultiple > 1) { + // should not be called + assert(false); + } + + // otherwise does not need implementing + return 0; + } + + inline __device__ uint8_t encodeHelper(float v, int realDim) const { + float x = (v - vmin[realDim]) / vdiff[realDim]; + x = fminf(1.0f, fmaxf(0.0f, x)); + return (uint8_t) (255 * x); + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec]; + int realDim = d * kDimPerIter; + + MemT x[kDimPerIter]; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + x[i] = encodeHelper(v[i], realDim + i); + } + + MemT out = 0; +#pragma unroll + for (int i = 0; i < kDimPerIter; ++i) { + out |= (x[i] << (i * 8)); + } + + p[d] = out; + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + if (DimMultiple > 1) { + // should not be called + assert(false); + } + + // otherwise does not need implementing + } + + int bytesPerVec; + + // gmem pointers + const float* vmin; + const float* vdiff; + + // smem pointers (configured in the kernel) + float* smemVmin; + float* smemVdiff; +}; + +template <> +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 1; + + Codec(int vecBytes) : bytesPerVec(vecBytes) { } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + out[0] = (float) p[d]; + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD) const { + // doesn't need implementing (kDimPerIter == 1) + return 0.0f; + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + uint8_t* p = &((uint8_t*) data)[vec * 
bytesPerVec]; + p[d] = (uint8_t) v[0]; + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, + float v[kDimPerIter]) const { + // doesn't need implementing (kDimPerIter == 1) + } + + int bytesPerVec; +}; + +///// +// +// 4 bit encodings +// +///// + +// Uniform quantization across all dimensions +template <> +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 2; + + Codec(int vecBytes, float min, float diff) + : bytesPerVec(vecBytes), vmin(min), vdiff(diff) { + } + + size_t getSmemSize(int dim) { return 0; } + inline __device__ void setSmem(float* smem, int dim) { } + + inline __device__ float decodeHelper(uint8_t v) const { + float x = (((float) v) + 0.5f) / 15.0f; + return vmin + x * vdiff; + } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + uint8_t pv = p[d]; + + out[0] = decodeHelper(pv & 0xf); + out[1] = decodeHelper(pv >> 4); + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD /* unused */) const { + // We can only be called for a single input + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + uint8_t pv = p[d]; + + return decodeHelper(pv & 0xf); + } + + inline __device__ uint8_t encodeHelper(float v) const { + float x = (v - vmin) / vdiff; + x = fminf(1.0f, fmaxf(0.0f, x)); + return (uint8_t) (x * 15.0f); + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + p[d] = encodeHelper(v[0]) | (encodeHelper(v[1]) << 4); + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, /* unused */ + float v[kDimPerIter]) const { + // We can only be called for a single output + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + p[d] = encodeHelper(v[0]); + } + + int bytesPerVec; + const float vmin; + const float vdiff; +}; + +template <> +struct Codec { + /// How many dimensions per iteration we are handling for encoding or decoding + static constexpr int kDimPerIter = 2; + + Codec(int vecBytes, float* min, float* diff) + : bytesPerVec(vecBytes), vmin(min), vdiff(diff), + smemVmin(nullptr), + smemVdiff(nullptr) { + } + + size_t getSmemSize(int dim) { + return sizeof(float) * dim * 2; + } + + inline __device__ void setSmem(float* smem, int dim) { + smemVmin = smem; + smemVdiff = smem + dim; + + for (int i = threadIdx.x; i < dim; i += blockDim.x) { + smemVmin[i] = vmin[i]; + smemVdiff[i] = vdiff[i]; + } + } + + inline __device__ float decodeHelper(uint8_t v, int realDim) const { + float x = (((float) v) + 0.5f) / 15.0f; + return smemVmin[realDim] + x * smemVdiff[realDim]; + } + + inline __device__ void decode(void* data, int vec, int d, + float* out) const { + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + uint8_t pv = p[d]; + int realDim = d * kDimPerIter; + + out[0] = decodeHelper(pv & 0xf, realDim); + out[1] = decodeHelper(pv >> 4, realDim + 1); + } + + inline __device__ float decodePartial(void* data, int vec, int d, + int subD /* unused */) const { + // We can only be called for a single input + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + uint8_t pv = p[d]; + int realDim = d * kDimPerIter; + + return decodeHelper(pv & 0xf, realDim); + } + + inline __device__ uint8_t encodeHelper(float v, int realDim) const { + float x = (v - vmin[realDim]) / vdiff[realDim]; + x = fminf(1.0f, fmaxf(0.0f, x)); + return 
(uint8_t) (x * 15.0f); + } + + inline __device__ void encode(void* data, int vec, int d, + float v[kDimPerIter]) const { + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + int realDim = d * kDimPerIter; + p[d] = encodeHelper(v[0], realDim) | (encodeHelper(v[1], realDim + 1) << 4); + } + + inline __device__ void encodePartial(void* data, int vec, int d, + int remaining, /* unused */ + float v[kDimPerIter]) const { + // We can only be called for a single output + uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec]; + int realDim = d * kDimPerIter; + + p[d] = encodeHelper(v[0], realDim); + } + + int bytesPerVec; + + // gmem pointers + const float* vmin; + const float* vdiff; + + // smem pointers + float* smemVmin; + float* smemVdiff; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cu new file mode 100644 index 0000000000..b009075ca1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cu @@ -0,0 +1,369 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// +// IVF list length update +// + +__global__ void +runUpdateListPointers(Tensor listIds, + Tensor newListLength, + Tensor newCodePointers, + Tensor newIndexPointers, + int* listLengths, + void** listCodes, + void** listIndices) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < listIds.getSize(0)) { + int listId = listIds[i]; + listLengths[listId] = newListLength[i]; + listCodes[listId] = newCodePointers[i]; + listIndices[listId] = newIndexPointers[i]; + } +} + +void +runUpdateListPointers(Tensor& listIds, + Tensor& newListLength, + Tensor& newCodePointers, + Tensor& newIndexPointers, + thrust::device_vector& listLengths, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + cudaStream_t stream) { + int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice()); + int numBlocks = utils::divUp(listIds.getSize(0), numThreads); + + dim3 grid(numBlocks); + dim3 block(numThreads); + + runUpdateListPointers<<>>( + listIds, newListLength, newCodePointers, newIndexPointers, + listLengths.data().get(), + listCodes.data().get(), + listIndices.data().get()); + + CUDA_TEST_ERROR(); +} + +// +// IVF PQ append +// + +template +__global__ void +ivfpqInvertedListAppend(Tensor listIds, + Tensor listOffset, + Tensor encodings, + Tensor indices, + void** listCodes, + void** listIndices) { + int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x; + + if (encodingToAdd >= listIds.getSize(0)) { + return; + } + + int listId = listIds[encodingToAdd]; + int offset = listOffset[encodingToAdd]; + + // Add vector could be invalid (contains NaNs etc) + if (listId == -1 || offset == -1) { + return; + } + + auto encoding = encodings[encodingToAdd]; + long index = indices[encodingToAdd]; + + if (Opt == INDICES_32_BIT) { + // FIXME: there could be overflow here, but where should we check this? 
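+ // Added clarifying note (not in the upstream source): with INDICES_32_BIT the
+ // user-supplied 64-bit id is narrowed to int, so any id above 2147483647
+ // (e.g. 3000000000) would no longer round-trip after this store. This kernel
+ // performs no such check; callers needing larger ids should select
+ // INDICES_64_BIT instead.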
+ ((int*) listIndices[listId])[offset] = (int) index; + } else if (Opt == INDICES_64_BIT) { + ((long*) listIndices[listId])[offset] = (long) index; + } else { + // INDICES_CPU or INDICES_IVF; no indices are being stored + } + + unsigned char* codeStart = + ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1); + + // FIXME: slow + for (int i = 0; i < encodings.getSize(1); ++i) { + codeStart[i] = (unsigned char) encoding[i]; + } +} + +void +runIVFPQInvertedListAppend(Tensor& listIds, + Tensor& listOffset, + Tensor& encodings, + Tensor& indices, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + cudaStream_t stream) { + int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice()); + int numBlocks = utils::divUp(listIds.getSize(0), numThreads); + + dim3 grid(numBlocks); + dim3 block(numThreads); + +#define RUN_APPEND(IND) \ + do { \ + ivfpqInvertedListAppend<<>>( \ + listIds, listOffset, encodings, indices, \ + listCodes.data().get(), \ + listIndices.data().get()); \ + } while (0) + + if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) { + // no need to maintain indices on the GPU + RUN_APPEND(INDICES_IVF); + } else if (indicesOptions == INDICES_32_BIT) { + RUN_APPEND(INDICES_32_BIT); + } else if (indicesOptions == INDICES_64_BIT) { + RUN_APPEND(INDICES_64_BIT); + } else { + // unknown index storage type + FAISS_ASSERT(false); + } + + CUDA_TEST_ERROR(); + +#undef RUN_APPEND +} + +// +// IVF flat append +// + +__global__ void +ivfFlatIndicesAppend(Tensor listIds, + Tensor listOffset, + Tensor indices, + IndicesOptions opt, + void** listIndices) { + int vec = blockIdx.x * blockDim.x + threadIdx.x; + + if (vec >= listIds.getSize(0)) { + return; + } + + int listId = listIds[vec]; + int offset = listOffset[vec]; + + // Add vector could be invalid (contains NaNs etc) + if (listId == -1 || offset == -1) { + return; + } + + long index = indices[vec]; + + if (opt == INDICES_32_BIT) { + // FIXME: there could be overflow here, but where should we check this? + ((int*) listIndices[listId])[offset] = (int) index; + } else if (opt == INDICES_64_BIT) { + ((long*) listIndices[listId])[offset] = (long) index; + } +} + +template +__global__ void +ivfFlatInvertedListAppend(Tensor listIds, + Tensor listOffset, + Tensor vecs, + void** listData, + Codec codec) { + int vec = blockIdx.x; + + int listId = listIds[vec]; + int offset = listOffset[vec]; + + // Add vector could be invalid (contains NaNs etc) + if (listId == -1 || offset == -1) { + return; + } + + // Handle whole encoding (only thread 0 will handle the remainder) + int limit = utils::divDown(vecs.getSize(1), Codec::kDimPerIter); + + int i; + for (i = threadIdx.x; i < limit; i += blockDim.x) { + int realDim = i * Codec::kDimPerIter; + float toEncode[Codec::kDimPerIter]; + +#pragma unroll + for (int j = 0; j < Codec::kDimPerIter; ++j) { + toEncode[j] = vecs[vec][realDim + j]; + } + + codec.encode(listData[listId], offset, i, toEncode); + } + + // Handle remainder with a single thread, if any + if (Codec::kDimPerIter > 1) { + int realDim = limit * Codec::kDimPerIter; + + // Was there any remainder? + if (realDim < vecs.getSize(1)) { + if (threadIdx.x == 0) { + float toEncode[Codec::kDimPerIter]; + + // How many remaining that we need to encode + int remaining = vecs.getSize(1) - realDim; + +#pragma unroll + for (int j = 0; j < Codec::kDimPerIter; ++j) { + int idx = realDim + j; + toEncode[j] = idx < vecs.getSize(1) ? 
vecs[vec][idx] : 0.0f; + } + + codec.encodePartial(listData[listId], offset, i, remaining, toEncode); + } + } + } +} + +void +runIVFFlatInvertedListAppend(Tensor& listIds, + Tensor& listOffset, + Tensor& vecs, + Tensor& indices, + bool useResidual, + Tensor& residuals, + GpuScalarQuantizer* scalarQ, + thrust::device_vector& listData, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + cudaStream_t stream) { + int dim = vecs.getSize(1); + int maxThreads = getMaxThreadsCurrentDevice(); + + // First, append the indices that we're about to add, if any + if (indicesOptions != INDICES_CPU && indicesOptions != INDICES_IVF) { + int blocks = utils::divUp(vecs.getSize(0), maxThreads); + + ivfFlatIndicesAppend<<>>( + listIds, + listOffset, + indices, + indicesOptions, + listIndices.data().get()); + } + + // Each block will handle appending a single vector +#define RUN_APPEND \ + do { \ + dim3 grid(vecs.getSize(0)); \ + dim3 block(std::min(dim / codec.kDimPerIter, maxThreads)); \ + \ + ivfFlatInvertedListAppend \ + <<>>( \ + listIds, \ + listOffset, \ + useResidual ? residuals : vecs, \ + listData.data().get(), \ + codec); \ + } while (0) + + if (!scalarQ) { + CodecFloat codec(dim * sizeof(float)); + RUN_APPEND; + } else { + switch (scalarQ->qtype) { + case ScalarQuantizer::QuantizerType::QT_8bit: + { + if (false) { +// if (dim % 4 == 0) { + Codec + codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + RUN_APPEND; + } else { + Codec + codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + RUN_APPEND; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_8bit_uniform: + { +// if (dim % 4 == 0) { + if (false) { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + RUN_APPEND; + } else { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + RUN_APPEND; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_fp16: + { +// if (dim % 2 == 0) { + if (false) { + Codec + codec(scalarQ->code_size); + RUN_APPEND; + } else { + Codec + codec(scalarQ->code_size); + RUN_APPEND; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_8bit_direct: + { + Codec + codec(scalarQ->code_size); + RUN_APPEND; + } + break; + case ScalarQuantizer::QuantizerType::QT_4bit: + { + Codec + codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + RUN_APPEND; + } + break; + case ScalarQuantizer::QuantizerType::QT_4bit_uniform: + { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + RUN_APPEND; + } + break; + default: + // unimplemented, should be handled at a higher level + FAISS_ASSERT(false); + } + } + + CUDA_TEST_ERROR(); + +#undef RUN_APPEND +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cuh new file mode 100644 index 0000000000..3d61248082 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cuh @@ -0,0 +1,53 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Update device-side list pointers in a batch +void runUpdateListPointers(Tensor& listIds, + Tensor& newListLength, + Tensor& newCodePointers, + Tensor& newIndexPointers, + thrust::device_vector& listLengths, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + cudaStream_t stream); + +/// Actually append the new codes / vector indices to the individual lists + +/// IVFPQ +void runIVFPQInvertedListAppend(Tensor& listIds, + Tensor& listOffset, + Tensor& encodings, + Tensor& indices, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + cudaStream_t stream); + +/// IVF flat storage +void runIVFFlatInvertedListAppend(Tensor& listIds, + Tensor& listOffset, + Tensor& vecs, + Tensor& indices, + bool useResidual, + Tensor& residuals, + GpuScalarQuantizer* scalarQ, + thrust::device_vector& listData, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu new file mode 100644 index 0000000000..2c8b7fbcac --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu @@ -0,0 +1,375 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +IVFBase::IVFBase(GpuResources* resources, + FlatIndex* quantizer, + int bytesPerVector, + IndicesOptions indicesOptions, + MemorySpace space) : + resources_(resources), + quantizer_(quantizer), + bytesPerVector_(bytesPerVector), + indicesOptions_(indicesOptions), + space_(space), + dim_(quantizer->getDim()), + numLists_(quantizer->getSize()), + maxListLength_(0) { + reset(); +} + +IVFBase::~IVFBase() { +} + +void +IVFBase::reserveMemory(size_t numVecs) { + size_t vecsPerList = numVecs / deviceListData_.size(); + if (vecsPerList < 1) { + return; + } + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + size_t bytesPerDataList = vecsPerList * bytesPerVector_; + for (auto& list : deviceListData_) { + list->reserve(bytesPerDataList, stream); + } + + if ((indicesOptions_ == INDICES_32_BIT) || + (indicesOptions_ == INDICES_64_BIT)) { + // Reserve for index lists as well + size_t bytesPerIndexList = vecsPerList * + (indicesOptions_ == INDICES_32_BIT ? 
sizeof(int) : sizeof(long)); + + for (auto& list : deviceListIndices_) { + list->reserve(bytesPerIndexList, stream); + } + } + + // Update device info for all lists, since the base pointers may + // have changed + updateDeviceListInfo_(stream); +} + +void +IVFBase::reset() { + deviceListData_.clear(); + deviceListIndices_.clear(); + deviceListDataPointers_.clear(); + deviceListIndexPointers_.clear(); + deviceListLengths_.clear(); + listOffsetToUserIndex_.clear(); + + deviceListData_.reserve(numLists_); + deviceListIndices_.reserve(numLists_); + listOffsetToUserIndex_.resize(numLists_); + + for (size_t i = 0; i < numLists_; ++i) { + deviceListData_.emplace_back( + std::unique_ptr>( + new DeviceVector(space_))); + deviceListIndices_.emplace_back( + std::unique_ptr>( + new DeviceVector(space_))); + listOffsetToUserIndex_.emplace_back(std::vector()); + } + + deviceListDataPointers_.resize(numLists_, nullptr); + deviceListIndexPointers_.resize(numLists_, nullptr); + deviceListLengths_.resize(numLists_, 0); + maxListLength_ = 0; + + deviceData_.reset(new DeviceVector(space_)); + deviceIndices_.reset(new DeviceVector(space_)); + deviceTrained_.reset(new DeviceVector(space_)); +} + +int +IVFBase::getDim() const { + return dim_; +} + +size_t +IVFBase::reclaimMemory() { + // Reclaim all unused memory exactly + return reclaimMemory_(true); +} + +size_t +IVFBase::reclaimMemory_(bool exact) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + size_t totalReclaimed = 0; + + for (int i = 0; i < deviceListData_.size(); ++i) { + auto& data = deviceListData_[i]; + totalReclaimed += data->reclaim(exact, stream); + + deviceListDataPointers_[i] = data->data(); + } + + for (int i = 0; i < deviceListIndices_.size(); ++i) { + auto& indices = deviceListIndices_[i]; + totalReclaimed += indices->reclaim(exact, stream); + + deviceListIndexPointers_[i] = indices->data(); + } + + // Update device info for all lists, since the base pointers may + // have changed + updateDeviceListInfo_(stream); + + return totalReclaimed; +} + +void +IVFBase::updateDeviceListInfo_(cudaStream_t stream) { + std::vector listIds(deviceListData_.size()); + for (int i = 0; i < deviceListData_.size(); ++i) { + listIds[i] = i; + } + + updateDeviceListInfo_(listIds, stream); +} + +void +IVFBase::updateDeviceListInfo_(const std::vector& listIds, + cudaStream_t stream) { + auto& mem = resources_->getMemoryManagerCurrentDevice(); + + HostTensor + hostListsToUpdate({(int) listIds.size()}); + HostTensor + hostNewListLength({(int) listIds.size()}); + HostTensor + hostNewDataPointers({(int) listIds.size()}); + HostTensor + hostNewIndexPointers({(int) listIds.size()}); + + for (int i = 0; i < listIds.size(); ++i) { + auto listId = listIds[i]; + auto& data = deviceListData_[listId]; + auto& indices = deviceListIndices_[listId]; + + hostListsToUpdate[i] = listId; + hostNewListLength[i] = data->size() / bytesPerVector_; + hostNewDataPointers[i] = data->data(); + hostNewIndexPointers[i] = indices->data(); + } + + // Copy the above update sets to the GPU + DeviceTensor listsToUpdate( + mem, hostListsToUpdate, stream); + DeviceTensor newListLength( + mem, hostNewListLength, stream); + DeviceTensor newDataPointers( + mem, hostNewDataPointers, stream); + DeviceTensor newIndexPointers( + mem, hostNewIndexPointers, stream); + + // Update all pointers to the lists on the device that may have + // changed + runUpdateListPointers(listsToUpdate, + newListLength, + newDataPointers, + newIndexPointers, + deviceListLengths_, + deviceListDataPointers_, + 
deviceListIndexPointers_, + stream); +} + +size_t +IVFBase::getNumLists() const { + return numLists_; +} + +int +IVFBase::getListLength(int listId) const { + FAISS_ASSERT(listId < deviceListLengths_.size()); + + return deviceListLengths_[listId]; +} + +std::vector +IVFBase::getListIndices(int listId) const { + FAISS_ASSERT(listId < numLists_); + + if (indicesOptions_ == INDICES_32_BIT) { + FAISS_ASSERT(listId < deviceListIndices_.size()); + + auto intInd = deviceListIndices_[listId]->copyToHost( + resources_->getDefaultStreamCurrentDevice()); + + std::vector out(intInd.size()); + for (size_t i = 0; i < intInd.size(); ++i) { + out[i] = (long) intInd[i]; + } + + return out; + } else if (indicesOptions_ == INDICES_64_BIT) { + FAISS_ASSERT(listId < deviceListIndices_.size()); + + return deviceListIndices_[listId]->copyToHost( + resources_->getDefaultStreamCurrentDevice()); + } else if (indicesOptions_ == INDICES_CPU) { + FAISS_ASSERT(listId < deviceListData_.size()); + FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); + + auto& userIds = listOffsetToUserIndex_[listId]; + FAISS_ASSERT(userIds.size() == + deviceListData_[listId]->size() / bytesPerVector_); + + // this will return a copy + return userIds; + } else { + // unhandled indices type (includes INDICES_IVF) + FAISS_ASSERT(false); + return std::vector(); + } +} + +std::vector +IVFBase::getListVectors(int listId) const { + FAISS_ASSERT(listId < deviceListData_.size()); + auto& list = *deviceListData_[listId]; + auto stream = resources_->getDefaultStreamCurrentDevice(); + + return list.copyToHost(stream); +} + +void +IVFBase::copyIndicesFromCpu_(const long* indices, + const std::vector& list_length) { + FAISS_ASSERT_FMT(list_length.size() == this->getNumLists(), "Expect list size %zu but %zu received!", + this->getNumLists(), list_length.size()); + auto numVecs = std::accumulate(list_length.begin(), list_length.end(), 0); + + auto stream = resources_->getDefaultStreamCurrentDevice(); + int bytesPerRecord; + + if (indicesOptions_ == INDICES_32_BIT) { + std::vector indices32(numVecs); + for (size_t i = 0; i < numVecs; ++i) { + auto ind = indices[i]; + FAISS_ASSERT(ind <= (long) std::numeric_limits::max()); + indices32[i] = (int) ind; + } + + bytesPerRecord = sizeof(int); + + deviceIndices_->append((unsigned char*) indices32.data(), + numVecs * bytesPerRecord, + stream, + true); + } else if (indicesOptions_ == INDICES_64_BIT) { + bytesPerRecord = sizeof(long); + deviceIndices_->append((unsigned char*) indices, + numVecs * bytesPerRecord, + stream, + true); + } else if (indicesOptions_ == INDICES_CPU) { + FAISS_ASSERT(false); + size_t listId = 0; + auto curr_indices = indices; + for (auto& userIndices : listOffsetToUserIndex_) { + userIndices.insert(userIndices.begin(), curr_indices, curr_indices + list_length[listId]); + curr_indices += list_length[listId]; + listId++; + } + } else { + // indices are not stored + FAISS_ASSERT(indicesOptions_ == INDICES_IVF); + } + + size_t listId = 0; + size_t pos = 0; + size_t size = 0; + + thrust::host_vector hostPointers(deviceListData_.size(), nullptr); + for (auto& device_indice : deviceListIndices_) { + auto data = deviceIndices_->data() + pos; + size = list_length[listId] * bytesPerRecord; + device_indice->reset(data, size, size); + hostPointers[listId] = device_indice->data(); + pos += size; + ++ listId; + } + + deviceListIndexPointers_ = hostPointers; +} + +void +IVFBase::addIndicesFromCpu_(int listId, + const long* indices, + size_t numVecs) { + auto stream = 
resources_->getDefaultStreamCurrentDevice(); + + auto& listIndices = deviceListIndices_[listId]; + auto prevIndicesData = listIndices->data(); + + if (indicesOptions_ == INDICES_32_BIT) { + // Make sure that all indices are in bounds + std::vector indices32(numVecs); + for (size_t i = 0; i < numVecs; ++i) { + auto ind = indices[i]; + FAISS_ASSERT(ind <= (long) std::numeric_limits::max()); + indices32[i] = (int) ind; + } + + listIndices->append((unsigned char*) indices32.data(), + numVecs * sizeof(int), + stream, + true /* exact reserved size */); + } else if (indicesOptions_ == INDICES_64_BIT) { + listIndices->append((unsigned char*) indices, + numVecs * sizeof(long), + stream, + true /* exact reserved size */); + } else if (indicesOptions_ == INDICES_CPU) { + // indices are stored on the CPU + FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); + + auto& userIndices = listOffsetToUserIndex_[listId]; + userIndices.insert(userIndices.begin(), indices, indices + numVecs); + } else { + // indices are not stored + FAISS_ASSERT(indicesOptions_ == INDICES_IVF); + } + + if (prevIndicesData != listIndices->data()) { + deviceListIndexPointers_[listId] = listIndices->data(); + } +} + +void +IVFBase::addTrainedDataFromCpu_(const uint8_t* trained, + size_t numData) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + deviceTrained_->append((unsigned char*)trained, + numData, + stream, + true); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh new file mode 100644 index 0000000000..f1c39867fa --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh @@ -0,0 +1,142 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; +struct FlatIndex; + +/// Base inverted list functionality for IVFFlat and IVFPQ +class IVFBase { + public: + IVFBase(GpuResources* resources, + /// We do not own this reference + FlatIndex* quantizer, + int bytesPerVector, + IndicesOptions indicesOptions, + MemorySpace space); + + virtual ~IVFBase(); + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Clear out all inverted lists, but retain the coarse quantizer + /// and the product quantizer info + void reset(); + + /// Return the number of dimensions we are indexing + int getDim() const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Returns the number of inverted lists + size_t getNumLists() const; + + /// For debugging purposes, return the list length of a particular + /// list + int getListLength(int listId) const; + + /// Return the list indices of a particular list back to the CPU + std::vector getListIndices(int listId) const; + + DeviceVector* getTrainedData() { return deviceTrained_.get(); }; + + /// Return the encoded vectors of a particular list back to the CPU + std::vector getListVectors(int listId) const; + + protected: + /// Reclaim memory consumed on the device for our inverted lists + /// `exact` means we trim exactly to the memory needed + size_t reclaimMemory_(bool exact); + + /// Update all device-side list pointer and size information + void updateDeviceListInfo_(cudaStream_t stream); + + /// For a set of list IDs, update device-side list pointer and size + /// information + void updateDeviceListInfo_(const std::vector& listIds, + cudaStream_t stream); + + /// Shared function to copy indices from CPU to GPU + void addIndicesFromCpu_(int listId, + const long* indices, + size_t numVecs); + + void copyIndicesFromCpu_(const long* indices, + const std::vector& list_length); + + void addTrainedDataFromCpu_(const uint8_t* trained, size_t numData); + + protected: + /// Collection of GPU resources that we use + GpuResources* resources_; + + /// Quantizer object + FlatIndex* quantizer_; + + /// Expected dimensionality of the vectors + const int dim_; + + /// Number of inverted lists we maintain + const int numLists_; + + /// Number of bytes per vector in the list + const int bytesPerVector_; + + /// How are user indices stored on the GPU? + const IndicesOptions indicesOptions_; + + /// What memory space our inverted list storage is in + const MemorySpace space_; + + /// Device representation of all inverted list data + /// id -> data + thrust::device_vector deviceListDataPointers_; + + /// Device representation of all inverted list index pointers + /// id -> data + thrust::device_vector deviceListIndexPointers_; + + /// Device representation of all inverted list lengths + /// id -> length + thrust::device_vector deviceListLengths_; + + /// Maximum list length seen + int maxListLength_; + + /// Device memory for each separate list, as managed by the host. + /// Device memory as stored in DeviceVector is stored as unique_ptr + /// since deviceListSummary_ pointers must remain valid despite + /// resizing of deviceLists_ + std::vector>> deviceListData_; + std::vector>> deviceListIndices_; + + std::unique_ptr> deviceData_; + std::unique_ptr> deviceIndices_; + std::unique_ptr> deviceTrained_; + + /// If we are storing indices on the CPU (indicesOptions_ is + /// INDICES_CPU), then this maintains a CPU-side map of what + /// (inverted list id, offset) maps to which user index + std::vector> listOffsetToUserIndex_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu new file mode 100644 index 0000000000..404a2d8603 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu @@ -0,0 +1,404 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +IVFFlat::IVFFlat(GpuResources* resources, + FlatIndex* quantizer, + faiss::MetricType metric, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + IndicesOptions indicesOptions, + MemorySpace space) : + IVFBase(resources, + quantizer, + scalarQ ? scalarQ->code_size : + sizeof(float) * quantizer->getDim(), + indicesOptions, + space), + metric_(metric), + useResidual_(useResidual), + scalarQ_(scalarQ ? new GpuScalarQuantizer(*scalarQ) : nullptr) { +} + +IVFFlat::~IVFFlat() { +} + + +void +IVFFlat::copyCodeVectorsFromCpu(const float* vecs, + const long* indices, + const std::vector& list_length) { + FAISS_ASSERT_FMT(list_length.size() == this->getNumLists(), "Expect list size %zu but %zu received!", + this->getNumLists(), list_length.size()); + int64_t numVecs = std::accumulate(list_length.begin(), list_length.end(), 0); + if (numVecs == 0) { + return; + } + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + deviceListLengths_ = list_length; + + int64_t lengthInBytes = numVecs * bytesPerVector_; + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(deviceData_->size() + lengthInBytes <= std::numeric_limits::max()); + + deviceData_->append((unsigned char*) vecs, + lengthInBytes, + stream, + true /* exact reserved size */); + copyIndicesFromCpu_(indices, list_length); + + maxListLength_ = 0; + + size_t listId = 0; + size_t pos = 0; + size_t size = 0; + thrust::host_vector hostPointers(deviceListData_.size(), nullptr); + + for (auto& device_data : deviceListData_) { + auto data = deviceData_->data() + pos; + + size = list_length[listId] * bytesPerVector_; + + device_data->reset(data, size, size); + hostPointers[listId] = device_data->data(); + maxListLength_ = std::max(maxListLength_, (int)list_length[listId]); + pos += size; + ++ listId; + } + + deviceListDataPointers_ = hostPointers; + + // device_vector add is potentially happening on a different stream + // than our default stream + if (stream != 0) { + streamWait({stream}, {0}); + } +} + +void +IVFFlat::addCodeVectorsFromCpu(int listId, + const unsigned char* vecs, + const long* indices, + size_t numVecs) { + // This list must already exist + FAISS_ASSERT(listId < deviceListData_.size()); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + size_t lengthInBytes = numVecs * bytesPerVector_; + + auto& listData = deviceListData_[listId]; + auto prevData = listData->data(); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(listData->size() + lengthInBytes <= + (size_t) std::numeric_limits::max()); + + listData->append(vecs, + lengthInBytes, + stream, + true /* exact reserved size */); + + // Handle the indices as well + addIndicesFromCpu_(listId, indices, numVecs); + + // This list address may have changed due to vector resizing, but + // only bother updating it on the device if it has changed + if (prevData != listData->data()) { + deviceListDataPointers_[listId] = listData->data(); + } + + // And our size has changed too + int listLength = listData->size() / bytesPerVector_; + deviceListLengths_[listId] = listLength; + + // We update this as 
well, since the multi-pass algorithm uses it + maxListLength_ = std::max(maxListLength_, listLength); + + // device_vector add is potentially happening on a different stream + // than our default stream + if (stream != 0) { + streamWait({stream}, {0}); + } +} + +int +IVFFlat::classifyAndAddVectors(Tensor& vecs, + Tensor& indices) { + FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); + FAISS_ASSERT(vecs.getSize(1) == dim_); + + auto& mem = resources_->getMemoryManagerCurrentDevice(); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // Number of valid vectors that we actually add; we return this + int numAdded = 0; + + DeviceTensor + listDistance2d(mem, {vecs.getSize(0), 1}, stream); + + DeviceTensor + listIds2d(mem, {vecs.getSize(0), 1}, stream); + auto listIds = listIds2d.view<1>({vecs.getSize(0)}); + + quantizer_->query(vecs, 1, listDistance2d, listIds2d, false); + + // Calculate residuals for these vectors, if needed + DeviceTensor + residuals(mem, {vecs.getSize(0), dim_}, stream); + + if (useResidual_) { + quantizer_->computeResidual(vecs, listIds, residuals); + } + + // Copy the lists that we wish to append to back to the CPU + // FIXME: really this can be into pinned memory and a true async + // copy on a different stream; we can start the copy early, but it's + // tiny + HostTensor listIdsHost(listIds, stream); + + // Now we add the encoded vectors to the individual lists + // First, make sure that there is space available for adding the new + // encoded vectors and indices + + // list id -> # being added + std::unordered_map assignCounts; + + // vector id -> offset in list + // (we already have vector id -> list id in listIds) + HostTensor listOffsetHost({listIdsHost.getSize(0)}); + + for (int i = 0; i < listIds.getSize(0); ++i) { + int listId = listIdsHost[i]; + + // Add vector could be invalid (contains NaNs etc) + if (listId < 0) { + listOffsetHost[i] = -1; + continue; + } + + FAISS_ASSERT(listId < numLists_); + ++numAdded; + + int offset = deviceListData_[listId]->size() / bytesPerVector_; + + auto it = assignCounts.find(listId); + if (it != assignCounts.end()) { + offset += it->second; + it->second++; + } else { + assignCounts[listId] = 1; + } + + listOffsetHost[i] = offset; + } + + // If we didn't add anything (all invalid vectors), no need to + // continue + if (numAdded == 0) { + return 0; + } + + // We need to resize the data structures for the inverted lists on + // the GPUs, which means that they might need reallocation, which + // means that their base address may change. Figure out the new base + // addresses, and update those in a batch on the device + { + for (auto& counts : assignCounts) { + auto& data = deviceListData_[counts.first]; + data->resize(data->size() + counts.second * bytesPerVector_, + stream); + int newNumVecs = (int) (data->size() / bytesPerVector_); + + auto& indices = deviceListIndices_[counts.first]; + if ((indicesOptions_ == INDICES_32_BIT) || + (indicesOptions_ == INDICES_64_BIT)) { + size_t indexSize = + (indicesOptions_ == INDICES_32_BIT) ? 
sizeof(int) : sizeof(long); + + indices->resize(indices->size() + counts.second * indexSize, stream); + } else if (indicesOptions_ == INDICES_CPU) { + // indices are stored on the CPU side + FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size()); + + auto& userIndices = listOffsetToUserIndex_[counts.first]; + userIndices.resize(newNumVecs); + } else { + // indices are not stored on the GPU or CPU side + FAISS_ASSERT(indicesOptions_ == INDICES_IVF); + } + + // This is used by the multi-pass query to decide how much scratch + // space to allocate for intermediate results + maxListLength_ = std::max(maxListLength_, newNumVecs); + } + + // Update all pointers to the lists on the device that may have + // changed + { + std::vector listIds(assignCounts.size()); + int i = 0; + for (auto& counts : assignCounts) { + listIds[i++] = counts.first; + } + + updateDeviceListInfo_(listIds, stream); + } + } + + // If we're maintaining the indices on the CPU side, update our + // map. We already resized our map above. + if (indicesOptions_ == INDICES_CPU) { + // We need to maintain the indices on the CPU side + HostTensor hostIndices(indices, stream); + + for (int i = 0; i < hostIndices.getSize(0); ++i) { + int listId = listIdsHost[i]; + + // Add vector could be invalid (contains NaNs etc) + if (listId < 0) { + continue; + } + + int offset = listOffsetHost[i]; + + FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); + auto& userIndices = listOffsetToUserIndex_[listId]; + + FAISS_ASSERT(offset < userIndices.size()); + userIndices[offset] = hostIndices[i]; + } + } + + // We similarly need to actually append the new vectors + { + DeviceTensor listOffset(mem, listOffsetHost, stream); + + // Now, for each list to which a vector is being assigned, write it + runIVFFlatInvertedListAppend(listIds, + listOffset, + vecs, + indices, + useResidual_, + residuals, + scalarQ_.get(), + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + stream); + } + + return numAdded; +} + +void +IVFFlat::query(Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + auto& mem = resources_->getMemoryManagerCurrentDevice(); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // These are caught at a higher level + FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + nprobe = std::min(nprobe, quantizer_->getSize()); + + FAISS_ASSERT(queries.getSize(1) == dim_); + + FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); + FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); + + // Reserve space for the quantized information + DeviceTensor + coarseDistances(mem, {queries.getSize(0), nprobe}, stream); + DeviceTensor + coarseIndices(mem, {queries.getSize(0), nprobe}, stream); + + // Find the `nprobe` closest lists; we can use int indices both + // internally and externally + quantizer_->query(queries, + nprobe, + coarseDistances, + coarseIndices, + false); + + DeviceTensor + residualBase(mem, {queries.getSize(0), nprobe, dim_}, stream); + + if (useResidual_) { + // Reconstruct vectors from the quantizer + quantizer_->reconstruct(coarseIndices, residualBase); + } + + runIVFFlatScan(queries, + coarseIndices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + maxListLength_, + k, + metric_, + useResidual_, + residualBase, + scalarQ_.get(), + outDistances, + outIndices, + resources_); + + // If the GPU isn't storing indices (they are on the CPU side), we + // need to perform the 
re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex(hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh new file mode 100644 index 0000000000..66c05a7d61 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh @@ -0,0 +1,72 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class IVFFlat : public IVFBase { + public: + /// Construct from a quantizer that has elemen + IVFFlat(GpuResources* resources, + /// We do not own this reference + FlatIndex* quantizer, + faiss::MetricType metric, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + IndicesOptions indicesOptions, + MemorySpace space); + + ~IVFFlat() override; + + /// Add vectors to a specific list; the input data can be on the + /// host or on our current device + void addCodeVectorsFromCpu(int listId, + const unsigned char* vecs, + const long* indices, + size_t numVecs); + + void copyCodeVectorsFromCpu(const float* vecs, + const long* indices, + const std::vector& list_length); + + /// Adds the given vectors to this index. + /// The input data must be on our current device. + /// Returns the number of vectors successfully added. Vectors may + /// not be able to be added because they contain NaNs. + int classifyAndAddVectors(Tensor& vecs, + Tensor& indices); + + /// Find the approximate k nearest neigbors for `queries` against + /// our database + void query(Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices); + + private: + /// Returns the size of our stored vectors, in bytes + size_t getVectorMemorySize() const; + + private: + /// Metric type used + faiss::MetricType metric_; + + /// Do we encode the residual from a coarse quantizer or not? + bool useResidual_; + + /// Scalar quantizer for encoded vectors, if any + std::unique_ptr scalarQ_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu new file mode 100644 index 0000000000..7247a58238 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu @@ -0,0 +1,515 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Number of warps we create per block of IVFFlatScan +constexpr int kIVFFlatScanWarps = 4; + +// Works for any dimension size +template +struct IVFFlatScan { + static __device__ void scan(float* query, + bool useResidual, + float* residualBaseSlice, + void* vecData, + const Codec& codec, + const Metric& metric, + int numVecs, + int dim, + float* distanceOut) { + // How many separate loading points are there for the decoder? + int limit = utils::divDown(dim, Codec::kDimPerIter); + + // Each warp handles a separate chunk of vectors + int warpId = threadIdx.x / kWarpSize; + // FIXME: why does getLaneId() not work when we write out below!?!?! + int laneId = threadIdx.x % kWarpSize; // getLaneId(); + + // Divide the set of vectors among the warps + int vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps); + + int vecStart = vecsPerWarp * warpId; + int vecEnd = min(vecsPerWarp * (warpId + 1), numVecs); + + // Walk the list of vectors for this warp + for (int vec = vecStart; vec < vecEnd; ++vec) { + // Reduce in dist + float dist = 0.0f; + + // Scan the dimensions availabe that have whole units for the decoder, + // as the decoder may handle more than one dimension at once (leaving the + // remainder to be handled separately) + for (int d = laneId; d < limit; d += kWarpSize) { + int realDim = d * Codec::kDimPerIter; + float vecVal[Codec::kDimPerIter]; + + // Decode the kDimPerIter dimensions + codec.decode(vecData, vec, d, vecVal); + +#pragma unroll + for (int j = 0; j < Codec::kDimPerIter; ++j) { + vecVal[j] += useResidual ? residualBaseSlice[realDim + j] : 0.0f; + } + +#pragma unroll + for (int j = 0; j < Codec::kDimPerIter; ++j) { + dist += metric.distance(query[realDim + j], vecVal[j]); + } + } + + // Handle remainder by a single thread, if any + // Not needed if we decode 1 dim per time + if (Codec::kDimPerIter > 1) { + int realDim = limit * Codec::kDimPerIter; + + // Was there any remainder? + if (realDim < dim) { + // Let the first threads in the block sequentially perform it + int remainderDim = realDim + laneId; + + if (remainderDim < dim) { + float vecVal = + codec.decodePartial(vecData, vec, limit, laneId); + vecVal += useResidual ? 
residualBaseSlice[remainderDim] : 0.0f; + dist += metric.distance(query[remainderDim], vecVal); + } + } + } + + // Reduce distance within warp + dist = warpReduceAllSum(dist); + + if (laneId == 0) { + distanceOut[vec] = dist; + } + } + } +}; + +template +__global__ void +ivfFlatScan(Tensor queries, + bool useResidual, + Tensor residualBase, + Tensor listIds, + void** allListData, + int* listLengths, + Codec codec, + Metric metric, + Tensor prefixSumOffsets, + Tensor distance) { + extern __shared__ float smem[]; + + auto queryId = blockIdx.y; + auto probeId = blockIdx.x; + + // This is where we start writing out data + // We ensure that before the array (at offset -1), there is a 0 value + int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + + auto listId = listIds[queryId][probeId]; + // Safety guard in case NaNs in input cause no list ID to be generated + if (listId == -1) { + return; + } + + auto query = queries[queryId].data(); + auto vecs = allListData[listId]; + auto numVecs = listLengths[listId]; + auto dim = queries.getSize(1); + auto distanceOut = distance[outBase].data(); + + auto residualBaseSlice = residualBase[queryId][probeId].data(); + + codec.setSmem(smem, dim); + + IVFFlatScan::scan(query, + useResidual, + residualBaseSlice, + vecs, + codec, + metric, + numVecs, + dim, + distanceOut); +} + +void +runIVFFlatScanTile(Tensor& queries, + Tensor& listIds, + thrust::device_vector& listData, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + Tensor& thrustMem, + Tensor& prefixSumOffsets, + Tensor& allDistances, + Tensor& heapDistances, + Tensor& heapIndices, + int k, + faiss::MetricType metricType, + bool useResidual, + Tensor& residualBase, + GpuScalarQuantizer* scalarQ, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + int dim = queries.getSize(1); + + // Check the amount of shared memory per block available based on our type is + // sufficient + if (scalarQ && + (scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_8bit || + scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_4bit)) { + int maxDim = getMaxSharedMemPerBlockCurrentDevice() / + (sizeof(float) * 2); + + FAISS_THROW_IF_NOT_FMT(dim < maxDim, + "Insufficient shared memory available on the GPU " + "for QT_8bit or QT_4bit with %d dimensions; " + "maximum dimensions possible is %d", dim, maxDim); + } + + + // Calculate offset lengths, so we know where to write out + // intermediate results + runCalcListOffsets(listIds, listLengths, prefixSumOffsets, thrustMem, stream); + + auto grid = dim3(listIds.getSize(1), listIds.getSize(0)); + auto block = dim3(kWarpSize * kIVFFlatScanWarps); + +#define RUN_IVF_FLAT \ + do { \ + ivfFlatScan \ + <<>>( \ + queries, \ + useResidual, \ + residualBase, \ + listIds, \ + listData.data().get(), \ + listLengths.data().get(), \ + codec, \ + metric, \ + prefixSumOffsets, \ + allDistances); \ + } while (0) + +#define HANDLE_METRICS \ + do { \ + if (metricType == MetricType::METRIC_L2) { \ + L2Metric metric; RUN_IVF_FLAT; \ + } else { \ + IPMetric metric; RUN_IVF_FLAT; \ + } \ + } while (0) + + if (!scalarQ) { + CodecFloat codec(dim * sizeof(float)); + HANDLE_METRICS; + } else { + switch (scalarQ->qtype) { + case ScalarQuantizer::QuantizerType::QT_8bit: + { + // FIXME: investigate 32 bit load perf issues +// if (dim % 4 == 0) { + if (false) { + Codec + codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + HANDLE_METRICS; + } else { + Codec + 
codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + HANDLE_METRICS; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_8bit_uniform: + { + // FIXME: investigate 32 bit load perf issues + if (false) { +// if (dim % 4 == 0) { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + HANDLE_METRICS; + } else { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + HANDLE_METRICS; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_fp16: + { + if (false) { + // FIXME: investigate 32 bit load perf issues +// if (dim % 2 == 0) { + Codec + codec(scalarQ->code_size); + HANDLE_METRICS; + } else { + Codec + codec(scalarQ->code_size); + HANDLE_METRICS; + } + } + break; + case ScalarQuantizer::QuantizerType::QT_8bit_direct: + { + Codec + codec(scalarQ->code_size); + HANDLE_METRICS; + } + break; + case ScalarQuantizer::QuantizerType::QT_4bit: + { + Codec + codec(scalarQ->code_size, + scalarQ->gpuTrained.data(), + scalarQ->gpuTrained.data() + dim); + HANDLE_METRICS; + } + break; + case ScalarQuantizer::QuantizerType::QT_4bit_uniform: + { + Codec + codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]); + HANDLE_METRICS; + } + break; + default: + // unimplemented, should be handled at a higher level + FAISS_ASSERT(false); + } + } + + CUDA_TEST_ERROR(); + +#undef HANDLE_METRICS +#undef RUN_IVF_FLAT + + // k-select the output in chunks, to increase parallelism + runPass1SelectLists(prefixSumOffsets, + allDistances, + listIds.getSize(1), + k, + metricToSortDirection(metricType), + heapDistances, + heapIndices, + stream); + + // k-select final output + auto flatHeapDistances = heapDistances.downcastInner<2>(); + auto flatHeapIndices = heapIndices.downcastInner<2>(); + + runPass2SelectLists(flatHeapDistances, + flatHeapIndices, + listIndices, + indicesOptions, + prefixSumOffsets, + listIds, + k, + metricToSortDirection(metricType), + outDistances, + outIndices, + stream); +} + +void +runIVFFlatScan(Tensor& queries, + Tensor& listIds, + thrust::device_vector& listData, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + faiss::MetricType metric, + bool useResidual, + Tensor& residualBase, + GpuScalarQuantizer* scalarQ, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res) { + constexpr int kMinQueryTileSize = 8; + constexpr int kMaxQueryTileSize = 128; + constexpr int kThrustMemSize = 16384; + + int nprobe = listIds.getSize(1); + + auto& mem = res->getMemoryManagerCurrentDevice(); + auto stream = res->getDefaultStreamCurrentDevice(); + + // Make a reservation for Thrust to do its dirty work (global memory + // cross-block reduction space); hopefully this is large enough. + DeviceTensor thrustMem1( + mem, {kThrustMemSize}, stream); + DeviceTensor thrustMem2( + mem, {kThrustMemSize}, stream); + DeviceTensor* thrustMem[2] = + {&thrustMem1, &thrustMem2}; + + // How much temporary storage is available? + // If possible, we'd like to fit within the space available. 
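// Illustrative budgeting (assumed example numbers, not taken from the patch):
// with nprobe = 32, maxListLength = 10000, k = 100, and 4-byte int/float,
//   prefixSumOffsets  : 32 * 4 + 4            =       132 bytes
//   allDistances      : 32 * 10000 * 4        = 1,280,000 bytes
//   first select pass : min(32, 8) * 100 * 8  =     6,400 bytes
// Doubled for the two alternate streams, this is roughly 2.45 MiB per query,
// so e.g. 256 MiB of scratch space would give a tile of about 104 queries,
// which falls inside the [kMinQueryTileSize, kMaxQueryTileSize] clamp below.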
+ size_t sizeAvailable = mem.getSizeAvailable(); + + // We run two passes of heap selection + // This is the size of the first-level heap passes + constexpr int kNProbeSplit = 8; + int pass2Chunks = std::min(nprobe, kNProbeSplit); + + size_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(int)); + + // How much temporary storage we need per each query + size_t sizePerQuery = + 2 * // # streams + ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances + sizeForFirstSelectPass); + + int queryTileSize = (int) (sizeAvailable / sizePerQuery); + + if (queryTileSize < kMinQueryTileSize) { + queryTileSize = kMinQueryTileSize; + } else if (queryTileSize > kMaxQueryTileSize) { + queryTileSize = kMaxQueryTileSize; + } + + // FIXME: we should adjust queryTileSize to deal with this, since + // indexing is in int32 + FAISS_ASSERT(queryTileSize * nprobe * maxListLength < + std::numeric_limits::max()); + + // Temporary memory buffers + // Make sure there is space prior to the start which will be 0, and + // will handle the boundary condition without branches + DeviceTensor prefixSumOffsetSpace1( + mem, {queryTileSize * nprobe + 1}, stream); + DeviceTensor prefixSumOffsetSpace2( + mem, {queryTileSize * nprobe + 1}, stream); + + DeviceTensor prefixSumOffsets1( + prefixSumOffsetSpace1[1].data(), + {queryTileSize, nprobe}); + DeviceTensor prefixSumOffsets2( + prefixSumOffsetSpace2[1].data(), + {queryTileSize, nprobe}); + DeviceTensor* prefixSumOffsets[2] = + {&prefixSumOffsets1, &prefixSumOffsets2}; + + // Make sure the element before prefixSumOffsets is 0, since we + // depend upon simple, boundary-less indexing to get proper results + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), + 0, + sizeof(int), + stream)); + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), + 0, + sizeof(int), + stream)); + + DeviceTensor allDistances1( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor allDistances2( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor* allDistances[2] = + {&allDistances1, &allDistances2}; + + DeviceTensor heapDistances1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapDistances2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapDistances[2] = + {&heapDistances1, &heapDistances2}; + + DeviceTensor heapIndices1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapIndices2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapIndices[2] = + {&heapIndices1, &heapIndices2}; + + auto streams = res->getAlternateStreamsCurrentDevice(); + streamWait(streams, {stream}); + + int curStream = 0; + + for (int query = 0; query < queries.getSize(0); query += queryTileSize) { + int numQueriesInTile = + std::min(queryTileSize, queries.getSize(0) - query); + + auto prefixSumOffsetsView = + prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); + + auto listIdsView = + listIds.narrowOutermost(query, numQueriesInTile); + auto queryView = + queries.narrowOutermost(query, numQueriesInTile); + auto residualBaseView = + residualBase.narrowOutermost(query, numQueriesInTile); + + auto heapDistancesView = + heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto heapIndicesView = + heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); + + auto outDistanceView = + outDistances.narrowOutermost(query, numQueriesInTile); + auto outIndicesView = + outIndices.narrowOutermost(query, 
numQueriesInTile); + + runIVFFlatScanTile(queryView, + listIdsView, + listData, + listIndices, + indicesOptions, + listLengths, + *thrustMem[curStream], + prefixSumOffsetsView, + *allDistances[curStream], + heapDistancesView, + heapIndicesView, + k, + metric, + useResidual, + residualBaseView, + scalarQ, + outDistanceView, + outIndicesView, + streams[curStream]); + + curStream = (curStream + 1) % 2; + } + + streamWait({stream}, streams); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh new file mode 100644 index 0000000000..475e71ab5d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh @@ -0,0 +1,39 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +void runIVFFlatScan(Tensor& queries, + Tensor& listIds, + thrust::device_vector& listData, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + faiss::MetricType metric, + bool useResidual, + Tensor& residualBase, + GpuScalarQuantizer* scalarQ, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu new file mode 100644 index 0000000000..aa843fed1e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu @@ -0,0 +1,698 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +IVFPQ::IVFPQ(GpuResources* resources, + FlatIndex* quantizer, + int numSubQuantizers, + int bitsPerSubQuantizer, + float* pqCentroidData, + IndicesOptions indicesOptions, + bool useFloat16LookupTables, + MemorySpace space) : + IVFBase(resources, + quantizer, + numSubQuantizers, + indicesOptions, + space), + numSubQuantizers_(numSubQuantizers), + bitsPerSubQuantizer_(bitsPerSubQuantizer), + numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)), + dimPerSubQuantizer_(dim_ / numSubQuantizers), + precomputedCodes_(false), + useFloat16LookupTables_(useFloat16LookupTables) { + FAISS_ASSERT(pqCentroidData); + + FAISS_ASSERT(bitsPerSubQuantizer_ <= 8); + FAISS_ASSERT(dim_ % numSubQuantizers_ == 0); + FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_)); + + setPQCentroids_(pqCentroidData); +} + +IVFPQ::~IVFPQ() { +} + + +bool +IVFPQ::isSupportedPQCodeLength(int size) { + switch (size) { + case 1: + case 2: + case 3: + case 4: + case 8: + case 12: + case 16: + case 20: + case 24: + case 28: + case 32: + case 40: + case 48: + case 56: // only supported with float16 + case 64: // only supported with float16 + case 96: // only supported with float16 + return true; + default: + return false; + } +} + +bool +IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) { + return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims); +} + +void +IVFPQ::setPrecomputedCodes(bool enable) { + if (precomputedCodes_ != enable) { + precomputedCodes_ = enable; + + if (precomputedCodes_) { + precomputeCodes_(); + } else { + // Clear out old precomputed code data + precomputedCode_ = std::move(DeviceTensor()); + precomputedCodeHalf_ = std::move(DeviceTensor()); + } + } +} + +int +IVFPQ::classifyAndAddVectors(Tensor& vecs, + Tensor& indices) { + FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); + FAISS_ASSERT(vecs.getSize(1) == dim_); + + FAISS_ASSERT(!quantizer_->getUseFloat16()); + auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + auto& mem = resources_->getMemoryManagerCurrentDevice(); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // Number of valid vectors that we actually add; we return this + int numAdded = 0; + + // We don't actually need this + DeviceTensor listDistance(mem, {vecs.getSize(0), 1}, stream); + // We use this + DeviceTensor listIds2d(mem, {vecs.getSize(0), 1}, stream); + auto listIds = listIds2d.view<1>({vecs.getSize(0)}); + + quantizer_->query(vecs, 1, listDistance, listIds2d, false); + + // Copy the lists that we wish to append to back to the CPU + // FIXME: really this can be into pinned memory and a true async + // copy on a different stream; we can start the copy early, but it's + // tiny + HostTensor listIdsHost(listIds, stream); + + // Calculate the residual for each closest centroid + DeviceTensor residuals( + mem, {vecs.getSize(0), vecs.getSize(1)}, stream); + + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + + // Residuals are in the form + // (vec x numSubQuantizer x dimPerSubQuantizer) + // transpose to + // (numSubQuantizer x vec x dimPerSubQuantizer) + auto residualsView = residuals.view<3>( + {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_}); + + DeviceTensor residualsTranspose( + mem, + {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_}, + 
stream); + + runTransposeAny(residualsView, 0, 1, residualsTranspose, stream); + + // Get the product quantizer centroids in the form + // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer) + // which is pqCentroidsMiddleCode_ + + // We now have a batch operation to find the top-1 distances: + // batch size: numSubQuantizer + // centroids: (numSubQuantizerCodes x dimPerSubQuantizer) + // residuals: (vec x dimPerSubQuantizer) + // => (numSubQuantizer x vec x 1) + + DeviceTensor closestSubQDistance( + mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream); + DeviceTensor closestSubQIndex( + mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream); + + for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) { + auto closestSubQDistanceView = closestSubQDistance[subQ].view(); + auto closestSubQIndexView = closestSubQIndex[subQ].view(); + + auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view(); + auto residualsTransposeView = residualsTranspose[subQ].view(); + + runL2Distance(resources_, + pqCentroidsMiddleCodeView, + true, // pqCentroidsMiddleCodeView is row major + nullptr, // no precomputed norms + residualsTransposeView, + true, // residualsTransposeView is row major + 1, + closestSubQDistanceView, + closestSubQIndexView, + // We don't care about distances + true); + } + + // Now, we have the nearest sub-q centroid for each slice of the + // residual vector. + auto closestSubQIndexView = closestSubQIndex.view<2>( + {numSubQuantizers_, residuals.getSize(0)}); + + // Transpose this for easy use + DeviceTensor encodings( + mem, {residuals.getSize(0), numSubQuantizers_}, stream); + + runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream); + + // Now we add the encoded vectors to the individual lists + // First, make sure that there is space available for adding the new + // encoded vectors and indices + + // list id -> # being added + std::unordered_map assignCounts; + + // vector id -> offset in list + // (we already have vector id -> list id in listIds) + HostTensor listOffsetHost({listIdsHost.getSize(0)}); + + for (int i = 0; i < listIdsHost.getSize(0); ++i) { + int listId = listIdsHost[i]; + + // Add vector could be invalid (contains NaNs etc) + if (listId < 0) { + listOffsetHost[i] = -1; + continue; + } + + FAISS_ASSERT(listId < numLists_); + ++numAdded; + + int offset = deviceListData_[listId]->size() / bytesPerVector_; + + auto it = assignCounts.find(listId); + if (it != assignCounts.end()) { + offset += it->second; + it->second++; + } else { + assignCounts[listId] = 1; + } + + listOffsetHost[i] = offset; + } + + // If we didn't add anything (all invalid vectors), no need to + // continue + if (numAdded == 0) { + return 0; + } + + // We need to resize the data structures for the inverted lists on + // the GPUs, which means that they might need reallocation, which + // means that their base address may change. Figure out the new base + // addresses, and update those in a batch on the device + { + // Resize all of the lists that we are appending to + for (auto& counts : assignCounts) { + auto& codes = deviceListData_[counts.first]; + codes->resize(codes->size() + counts.second * bytesPerVector_, + stream); + int newNumVecs = (int) (codes->size() / bytesPerVector_); + + auto& indices = deviceListIndices_[counts.first]; + if ((indicesOptions_ == INDICES_32_BIT) || + (indicesOptions_ == INDICES_64_BIT)) { + size_t indexSize = + (indicesOptions_ == INDICES_32_BIT) ? 
sizeof(int) : sizeof(long); + + indices->resize(indices->size() + counts.second * indexSize, stream); + } else if (indicesOptions_ == INDICES_CPU) { + // indices are stored on the CPU side + FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size()); + + auto& userIndices = listOffsetToUserIndex_[counts.first]; + userIndices.resize(newNumVecs); + } else { + // indices are not stored on the GPU or CPU side + FAISS_ASSERT(indicesOptions_ == INDICES_IVF); + } + + // This is used by the multi-pass query to decide how much scratch + // space to allocate for intermediate results + maxListLength_ = std::max(maxListLength_, newNumVecs); + } + + // Update all pointers and sizes on the device for lists that we + // appended to + { + std::vector listIds(assignCounts.size()); + int i = 0; + for (auto& counts : assignCounts) { + listIds[i++] = counts.first; + } + + updateDeviceListInfo_(listIds, stream); + } + } + + // If we're maintaining the indices on the CPU side, update our + // map. We already resized our map above. + if (indicesOptions_ == INDICES_CPU) { + // We need to maintain the indices on the CPU side + HostTensor hostIndices(indices, stream); + + for (int i = 0; i < hostIndices.getSize(0); ++i) { + int listId = listIdsHost[i]; + + // Add vector could be invalid (contains NaNs etc) + if (listId < 0) { + continue; + } + + int offset = listOffsetHost[i]; + + FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); + auto& userIndices = listOffsetToUserIndex_[listId]; + + FAISS_ASSERT(offset < userIndices.size()); + userIndices[offset] = hostIndices[i]; + } + } + + // We similarly need to actually append the new encoded vectors + { + DeviceTensor listOffset(mem, listOffsetHost, stream); + + // This kernel will handle appending each encoded vector + index to + // the appropriate list + runIVFPQInvertedListAppend(listIds, + listOffset, + encodings, + indices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + stream); + } + + return numAdded; +} + +void +IVFPQ::addCodeVectorsFromCpu(int listId, + const void* codes, + const long* indices, + size_t numVecs) { + // This list must already exist + FAISS_ASSERT(listId < deviceListData_.size()); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + size_t lengthInBytes = numVecs * bytesPerVector_; + + auto& listCodes = deviceListData_[listId]; + auto prevCodeData = listCodes->data(); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0); + FAISS_ASSERT(listCodes->size() + lengthInBytes <= + (size_t) std::numeric_limits::max()); + + listCodes->append((unsigned char*) codes, + lengthInBytes, + stream, + true /* exact reserved size */); + + // Handle the indices as well + addIndicesFromCpu_(listId, indices, numVecs); + + // This list address may have changed due to vector resizing, but + // only bother updating it on the device if it has changed + if (prevCodeData != listCodes->data()) { + deviceListDataPointers_[listId] = listCodes->data(); + } + + // And our size has changed too + int listLength = listCodes->size() / bytesPerVector_; + deviceListLengths_[listId] = listLength; + + // We update this as well, since the multi-pass algorithm uses it + maxListLength_ = std::max(maxListLength_, listLength); + + // device_vector add is potentially happening on a different stream + // than our default stream + if 
(resources_->getDefaultStreamCurrentDevice() != 0) { + streamWait({stream}, {0}); + } +} + +void +IVFPQ::setPQCentroids_(float* data) { + size_t pqSize = + numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_; + + // Make sure the data is on the host + // FIXME: why are we doing this? + thrust::host_vector hostMemory; + hostMemory.insert(hostMemory.end(), data, data + pqSize); + + HostTensor pqHost( + hostMemory.data(), + {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_}); + DeviceTensor pqDevice( + pqHost, + resources_->getDefaultStreamCurrentDevice()); + + DeviceTensor pqDeviceTranspose( + {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_}); + runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose, + resources_->getDefaultStreamCurrentDevice()); + + pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose); + + // Also maintain the PQ centroids in the form + // (sub q)(code id)(sub dim) + DeviceTensor pqCentroidsMiddleCode( + {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_}); + runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode, + resources_->getDefaultStreamCurrentDevice()); + + pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode); +} + +void +IVFPQ::precomputeCodes_() { + // + // d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R) + // --------------- --------------------------- ------- + // term 1 term 2 term 3 + // + + // Terms 1 and 3 are available only at query time. We compute term 2 + // here. + FAISS_ASSERT(!quantizer_->getUseFloat16()); + auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + + // Compute ||y_R||^2 by treating + // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim) + auto pqCentroidsMiddleCodeView = + pqCentroidsMiddleCode_.view<2>( + {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_}); + DeviceTensor subQuantizerNorms( + {numSubQuantizers_ * numSubQuantizerCodes_}); + + runL2Norm(pqCentroidsMiddleCodeView, true, + subQuantizerNorms, true, + resources_->getDefaultStreamCurrentDevice()); + + // Compute 2 * (y_C|y_R) via batch matrix multiplication + // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'} + // => (sub q) x {(centroid id)(code id)} + // => (sub q)(centroid id)(code id) + + // View (centroid id)(dim) as + // (centroid id)(sub q)(dim) + // Transpose (centroid id)(sub q)(sub dim) to + // (sub q)(centroid id)(sub dim) + auto centroidView = coarseCentroids.view<3>( + {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_}); + DeviceTensor centroidsTransposed( + {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_}); + + runTransposeAny(centroidView, 0, 1, centroidsTransposed, + resources_->getDefaultStreamCurrentDevice()); + + DeviceTensor coarsePQProduct( + {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_}); + + runIteratedMatrixMult(coarsePQProduct, false, + centroidsTransposed, false, + pqCentroidsMiddleCode_, true, + 2.0f, 0.0f, + resources_->getBlasHandleCurrentDevice(), + resources_->getDefaultStreamCurrentDevice()); + + // Transpose (sub q)(centroid id)(code id) to + // (centroid id)(sub q)(code id) + DeviceTensor coarsePQProductTransposed( + {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_}); + runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed, + resources_->getDefaultStreamCurrentDevice()); + + // View (centroid id)(sub q)(code id) as + // (centroid id)(sub q * code id) + auto coarsePQProductTransposedView = 
coarsePQProductTransposed.view<2>( + {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_}); + + // Sum || y_R ||^2 + 2 * (y_C|y_R) + // i.e., add norms (sub q * code id) + // along columns of inner product (centroid id)(sub q * code id) + runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView, + resources_->getDefaultStreamCurrentDevice()); + + // We added into the view, so `coarsePQProductTransposed` is now our + // precomputed term 2. + if (useFloat16LookupTables_) { + precomputedCodeHalf_ = + convertTensor(resources_, + resources_->getDefaultStreamCurrentDevice(), + coarsePQProductTransposed); + } else { + precomputedCode_ = std::move(coarsePQProductTransposed); + } +} + +void +IVFPQ::query(Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + // These are caught at a higher level + FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + auto& mem = resources_->getMemoryManagerCurrentDevice(); + auto stream = resources_->getDefaultStreamCurrentDevice(); + nprobe = std::min(nprobe, quantizer_->getSize()); + + FAISS_ASSERT(queries.getSize(1) == dim_); + FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); + FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); + + // Reserve space for the closest coarse centroids + DeviceTensor + coarseDistances(mem, {queries.getSize(0), nprobe}, stream); + DeviceTensor + coarseIndices(mem, {queries.getSize(0), nprobe}, stream); + + // Find the `nprobe` closest coarse centroids; we can use int + // indices both internally and externally + quantizer_->query(queries, + nprobe, + coarseDistances, + coarseIndices, + true); + + if (precomputedCodes_) { + runPQPrecomputedCodes_(queries, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } else { + runPQNoPrecomputedCodes_(queries, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } + + // If the GPU isn't storing indices (they are on the CPU side), we + // need to perform the re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex(hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +std::vector +IVFPQ::getListCodes(int listId) const { + FAISS_ASSERT(listId < deviceListData_.size()); + + return deviceListData_[listId]->copyToHost( + resources_->getDefaultStreamCurrentDevice()); +} + +Tensor +IVFPQ::getPQCentroids() { + return pqCentroidsMiddleCode_; +} + +void +IVFPQ::runPQPrecomputedCodes_( + Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices) { + auto& mem = resources_->getMemoryManagerCurrentDevice(); + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // Compute precomputed code term 3, - 2 * (x|y_R) + // This is done via batch MM + // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} => + // {sub q} x {(query id)(code id)} + DeviceTensor term3Transposed( + mem, + {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_}, + stream); + + // These allocations within are only temporary, so release them when + // we're done to maximize free space + { + auto 
querySubQuantizerView = queries.view<3>( + {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_}); + DeviceTensor queriesTransposed( + mem, + {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_}, + stream); + runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream); + + DeviceTensor term3( + mem, + {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_}, + stream); + + runIteratedMatrixMult(term3, false, + queriesTransposed, false, + pqCentroidsMiddleCode_, true, + -2.0f, 0.0f, + resources_->getBlasHandleCurrentDevice(), + stream); + + runTransposeAny(term3, 0, 1, term3Transposed, stream); + } + + NoTypeTensor<3, true> term2; + NoTypeTensor<3, true> term3; + DeviceTensor term3Half; + + if (useFloat16LookupTables_) { + term3Half = + convertTensor(resources_, stream, term3Transposed); + + term2 = NoTypeTensor<3, true>(precomputedCodeHalf_); + term3 = NoTypeTensor<3, true>(term3Half); + } else { + term2 = NoTypeTensor<3, true>(precomputedCode_); + term3 = NoTypeTensor<3, true>(term3Transposed); + } + + runPQScanMultiPassPrecomputed(queries, + coarseDistances, // term 1 + term2, // term 2 + term3, // term 3 + coarseIndices, + useFloat16LookupTables_, + bytesPerVector_, + numSubQuantizers_, + numSubQuantizerCodes_, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + maxListLength_, + k, + outDistances, + outIndices, + resources_); +} + +void +IVFPQ::runPQNoPrecomputedCodes_( + Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices) { + FAISS_ASSERT(!quantizer_->getUseFloat16()); + auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + + runPQScanMultiPassNoPrecomputed(queries, + coarseCentroids, + pqCentroidsInnermostCode_, + coarseIndices, + useFloat16LookupTables_, + bytesPerVector_, + numSubQuantizers_, + numSubQuantizerCodes_, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + maxListLength_, + k, + outDistances, + outIndices, + resources_); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh new file mode 100644 index 0000000000..781104d77b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh @@ -0,0 +1,137 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +/// Implementing class for IVFPQ on the GPU +class IVFPQ : public IVFBase { + public: + IVFPQ(GpuResources* resources, + /// We do not own this reference + FlatIndex* quantizer, + int numSubQuantizers, + int bitsPerSubQuantizer, + float* pqCentroidData, + IndicesOptions indicesOptions, + bool useFloat16LookupTables, + MemorySpace space); + + /// Returns true if we support PQ in this size + static bool isSupportedPQCodeLength(int size); + + /// For no precomputed codes, is this a supported sub-dimension + /// size? 
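+  /// (The no-precomputed-code path computes residual-to-code distances
+  /// with a kernel templated on the sub-dimension size, so only the
+  /// sub-dimension sizes instantiated in PQCodeDistances.cu are
+  /// supported here.)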
+ /// FIXME: get MM implementation working again + static bool isSupportedNoPrecomputedSubDimSize(int dims); + + ~IVFPQ() override; + + /// Enable or disable pre-computed codes + void setPrecomputedCodes(bool enable); + + /// Adds a set of codes and indices to a list; the data can be + /// resident on either the host or the device + void addCodeVectorsFromCpu(int listId, + const void* codes, + const long* indices, + size_t numVecs); + + /// Calcuates the residual and quantizes the vectors, adding them to + /// this index + /// The input data must be on our current device. + /// Returns the number of vectors successfully added. Vectors may + /// not be able to be added because they contain NaNs. + int classifyAndAddVectors(Tensor& vecs, + Tensor& indices); + + /// Find the approximate k nearest neigbors for `queries` against + /// our database + void query(Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices); + + /// Return the list codes of a particular list back to the CPU + std::vector getListCodes(int listId) const; + + /// Returns our set of sub-quantizers of the form + /// (sub q)(code id)(sub dim) + Tensor getPQCentroids(); + + private: + /// Sets the current product quantizer centroids; the data can be + /// resident on either the host or the device. It will be transposed + /// into our preferred data layout + /// Data must be a row-major, 3-d array of size + /// (numSubQuantizers, numSubQuantizerCodes, dim / numSubQuantizers) + void setPQCentroids_(float* data); + + /// Calculate precomputed residual distance information + void precomputeCodes_(); + + /// Runs kernels for scanning inverted lists with precomputed codes + void runPQPrecomputedCodes_(Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices); + + /// Runs kernels for scanning inverted lists without precomputed codes + void runPQNoPrecomputedCodes_(Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices); + + private: + /// Number of sub-quantizers per vector + const int numSubQuantizers_; + + /// Number of bits per sub-quantizer + const int bitsPerSubQuantizer_; + + /// Number of per sub-quantizer codes (2^bits) + const int numSubQuantizerCodes_; + + /// Number of dimensions per each sub-quantizer + const int dimPerSubQuantizer_; + + /// Do we maintain precomputed terms and lookup tables in float16 + /// form? + const bool useFloat16LookupTables_; + + /// On the GPU, we prefer different PQ centroid data layouts for + /// different purposes. + /// + /// (sub q)(sub dim)(code id) + DeviceTensor pqCentroidsInnermostCode_; + + /// (sub q)(code id)(sub dim) + DeviceTensor pqCentroidsMiddleCode_; + + /// Are precomputed codes enabled? (additional factoring and + /// precomputation of the residual distance, to reduce query-time work) + bool precomputedCodes_; + + /// Precomputed term 2 in float form + /// (centroid id)(sub q)(code id) + DeviceTensor precomputedCode_; + + /// Precomputed term 2 in half form + DeviceTensor precomputedCodeHalf_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cu new file mode 100644 index 0000000000..fda439fea2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cu @@ -0,0 +1,78 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Calculates the total number of intermediate distances to consider +// for all queries +__global__ void +getResultLengths(Tensor topQueryToCentroid, + int* listLengths, + int totalSize, + Tensor length) { + int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; + if (linearThreadId >= totalSize) { + return; + } + + int nprobe = topQueryToCentroid.getSize(1); + int queryId = linearThreadId / nprobe; + int listId = linearThreadId % nprobe; + + int centroidId = topQueryToCentroid[queryId][listId]; + + // Safety guard in case NaNs in input cause no list ID to be generated + length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0; +} + +void runCalcListOffsets(Tensor& topQueryToCentroid, + thrust::device_vector& listLengths, + Tensor& prefixSumOffsets, + Tensor& thrustMem, + cudaStream_t stream) { + FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0)); + FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1)); + + int totalSize = topQueryToCentroid.numElements(); + + int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice()); + int numBlocks = utils::divUp(totalSize, numThreads); + + auto grid = dim3(numBlocks); + auto block = dim3(numThreads); + + getResultLengths<<>>( + topQueryToCentroid, + listLengths.data().get(), + totalSize, + prefixSumOffsets); + CUDA_TEST_ERROR(); + + // Prefix sum of the indices, so we know where the intermediate + // results should be maintained + // Thrust wants a place for its temporary allocations, so provide + // one, so it won't call cudaMalloc/Free + GpuResourcesThrustAllocator alloc(thrustMem.data(), + thrustMem.getSizeInBytes()); + + thrust::inclusive_scan(thrust::cuda::par(alloc).on(stream), + prefixSumOffsets.data(), + prefixSumOffsets.data() + totalSize, + prefixSumOffsets.data()); + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh new file mode 100644 index 0000000000..eba3a1051b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh @@ -0,0 +1,51 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +// A collection of utility functions for IVFPQ and IVFFlat, for +// post-processing and k-selecting the results +namespace faiss { namespace gpu { + +/// Function for multi-pass scanning that collects the length of +/// intermediate results for all (query, probe) pair +void runCalcListOffsets(Tensor& topQueryToCentroid, + thrust::device_vector& listLengths, + Tensor& prefixSumOffsets, + Tensor& thrustMem, + cudaStream_t stream); + +/// Performs a first pass of k-selection on the results +void runPass1SelectLists(Tensor& prefixSumOffsets, + Tensor& distance, + int nprobe, + int k, + bool chooseLargest, + Tensor& heapDistances, + Tensor& heapIndices, + cudaStream_t stream); + +/// Performs a final pass of k-selection on the results, producing the +/// final indices +void runPass2SelectLists(Tensor& heapDistances, + Tensor& heapIndices, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + Tensor& prefixSumOffsets, + Tensor& topQueryToCentroid, + int k, + bool chooseLargest, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu new file mode 100644 index 0000000000..63c563c8fd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu @@ -0,0 +1,168 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include + +// +// This kernel is split into a separate compilation unit to cut down +// on compile time +// + +namespace faiss { namespace gpu { + +template +__global__ void +pass1SelectLists(Tensor prefixSumOffsets, + Tensor distance, + int nprobe, + int k, + Tensor heapDistances, + Tensor heapIndices) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ float smemK[kNumWarps * NumWarpQ]; + __shared__ int smemV[kNumWarps * NumWarpQ]; + + constexpr auto kInit = Dir ? kFloatMin : kFloatMax; + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(kInit, -1, smemK, smemV, k); + + auto queryId = blockIdx.y; + auto sliceId = blockIdx.x; + auto numSlices = gridDim.x; + + int sliceSize = (nprobe / numSlices); + int sliceStart = sliceSize * sliceId; + int sliceEnd = sliceId == (numSlices - 1) ? 
nprobe : + sliceStart + sliceSize; + auto offsets = prefixSumOffsets[queryId].data(); + + // We ensure that before the array (at offset -1), there is a 0 value + int start = *(&offsets[sliceStart] - 1); + int end = offsets[sliceEnd - 1]; + + int num = end - start; + int limit = utils::roundDown(num, kWarpSize); + + int i = threadIdx.x; + auto distanceStart = distance[start].data(); + + // BlockSelect add cannot be used in a warp divergent circumstance; we + // handle the remainder warp below + for (; i < limit; i += blockDim.x) { + heap.add(distanceStart[i], start + i); + } + + // Handle warp divergence separately + if (i < num) { + heap.addThreadQ(distanceStart[i], start + i); + } + + // Merge all final results + heap.reduce(); + + // Write out the final k-selected values; they should be all + // together + for (int i = threadIdx.x; i < k; i += blockDim.x) { + heapDistances[queryId][sliceId][i] = smemK[i]; + heapIndices[queryId][sliceId][i] = smemV[i]; + } +} + +void +runPass1SelectLists(Tensor& prefixSumOffsets, + Tensor& distance, + int nprobe, + int k, + bool chooseLargest, + Tensor& heapDistances, + Tensor& heapIndices, + cudaStream_t stream) { + // This is caught at a higher level + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + auto grid = dim3(heapDistances.getSize(1), prefixSumOffsets.getSize(0)); + +#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ + do { \ + pass1SelectLists \ + <<>>(prefixSumOffsets, \ + distance, \ + nprobe, \ + k, \ + heapDistances, \ + heapIndices); \ + CUDA_TEST_ERROR(); \ + return; /* success */ \ + } while (0) + +#if GPU_MAX_SELECTION_K >= 2048 + + // block size 128 for k <= 1024, 64 for k = 2048 +#define RUN_PASS_DIR(DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(128, 1024, 8, DIR); \ + } else if (k <= 2048) { \ + RUN_PASS(64, 2048, 8, DIR); \ + } \ + } while (0) + +#else + +#define RUN_PASS_DIR(DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(128, 1024, 8, DIR); \ + } \ + } while (0) + +#endif // GPU_MAX_SELECTION_K + + if (chooseLargest) { + RUN_PASS_DIR(true); + } else { + RUN_PASS_DIR(false); + } + +#undef RUN_PASS_DIR +#undef RUN_PASS +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu new file mode 100644 index 0000000000..e629dbdfe4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu @@ -0,0 +1,236 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include + +// +// This kernel is split into a separate compilation unit to cut down +// on compile time +// + +namespace faiss { namespace gpu { + +// This is warp divergence central, but this is really a final step +// and happening a small number of times +inline __device__ int binarySearchForBucket(int* prefixSumOffsets, + int size, + int val) { + int start = 0; + int end = size; + + while (end - start > 0) { + int mid = start + (end - start) / 2; + + int midVal = prefixSumOffsets[mid]; + + // Find the first bucket that we are <= + if (midVal <= val) { + start = mid + 1; + } else { + end = mid; + } + } + + // We must find the bucket that it is in + assert(start != size); + + return start; +} + +template +__global__ void +pass2SelectLists(Tensor heapDistances, + Tensor heapIndices, + void** listIndices, + Tensor prefixSumOffsets, + Tensor topQueryToCentroid, + int k, + IndicesOptions opt, + Tensor outDistances, + Tensor outIndices) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ float smemK[kNumWarps * NumWarpQ]; + __shared__ int smemV[kNumWarps * NumWarpQ]; + + constexpr auto kInit = Dir ? kFloatMin : kFloatMax; + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(kInit, -1, smemK, smemV, k); + + auto queryId = blockIdx.x; + int num = heapDistances.getSize(1); + int limit = utils::roundDown(num, kWarpSize); + + int i = threadIdx.x; + auto heapDistanceStart = heapDistances[queryId]; + + // BlockSelect add cannot be used in a warp divergent circumstance; we + // handle the remainder warp below + for (; i < limit; i += blockDim.x) { + heap.add(heapDistanceStart[i], i); + } + + // Handle warp divergence separately + if (i < num) { + heap.addThreadQ(heapDistanceStart[i], i); + } + + // Merge all final results + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += blockDim.x) { + outDistances[queryId][i] = smemK[i]; + + // `v` is the index in `heapIndices` + // We need to translate this into an original user index. The + // reason why we don't maintain intermediate results in terms of + // user indices is to substantially reduce temporary memory + // requirements and global memory write traffic for the list + // scanning. + // This code is highly divergent, but it's probably ok, since this + // is the very last step and it is happening a small number of + // times (#queries x k). + int v = smemV[i]; + long index = -1; + + if (v != -1) { + // `offset` is the offset of the intermediate result, as + // calculated by the original scan. + int offset = heapIndices[queryId][v]; + + // In order to determine the actual user index, we need to first + // determine what list it was in. + // We do this by binary search in the prefix sum list. 
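+        // (Illustrative example: if this query's prefixSumOffsets are
+        // [3, 7, 12], an intermediate offset of 5 maps to probe 1,
+        // since 7 is the first prefix sum greater than 5; the offset
+        // within that probed list is then 5 - 3 = 2.)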
+ int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(), + prefixSumOffsets.getSize(1), + offset); + + // This is then the probe for the query; we can find the actual + // list ID from this + int listId = topQueryToCentroid[queryId][probe]; + + // Now, we need to know the offset within the list + // We ensure that before the array (at offset -1), there is a 0 value + int listStart = *(prefixSumOffsets[queryId][probe].data() - 1); + int listOffset = offset - listStart; + + // This gives us our final index + if (opt == INDICES_32_BIT) { + index = (long) ((int*) listIndices[listId])[listOffset]; + } else if (opt == INDICES_64_BIT) { + index = ((long*) listIndices[listId])[listOffset]; + } else { + index = ((long) listId << 32 | (long) listOffset); + } + } + + outIndices[queryId][i] = index; + } +} + +void +runPass2SelectLists(Tensor& heapDistances, + Tensor& heapIndices, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + Tensor& prefixSumOffsets, + Tensor& topQueryToCentroid, + int k, + bool chooseLargest, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + auto grid = dim3(topQueryToCentroid.getSize(0)); + +#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ + do { \ + pass2SelectLists \ + <<>>(heapDistances, \ + heapIndices, \ + listIndices.data().get(), \ + prefixSumOffsets, \ + topQueryToCentroid, \ + k, \ + indicesOptions, \ + outDistances, \ + outIndices); \ + CUDA_TEST_ERROR(); \ + return; /* success */ \ + } while (0) + +#if GPU_MAX_SELECTION_K >= 2048 + + // block size 128 for k <= 1024, 64 for k = 2048 +#define RUN_PASS_DIR(DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(128, 1024, 8, DIR); \ + } else if (k <= 2048) { \ + RUN_PASS(64, 2048, 8, DIR); \ + } \ + } while (0) + +#else + +#define RUN_PASS_DIR(DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(128, 1024, 8, DIR); \ + } \ + } while (0) + +#endif // GPU_MAX_SELECTION_K + + if (chooseLargest) { + RUN_PASS_DIR(true); + } else { + RUN_PASS_DIR(false); + } + + // unimplemented / too many resources + FAISS_ASSERT_FMT(false, "unimplemented k value (%d)", k); + +#undef RUN_PASS_DIR +#undef RUN_PASS +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu new file mode 100644 index 0000000000..c8e7228095 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu @@ -0,0 +1,331 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Input: (batch x dim) +// Output: (batch norm) +// Done under the presumption that the dimension size is not too large +// (<10k or so), since there wouldn't be enough parallelism applying a +// single block to the problem. Also that each vector is large enough +// (>64), since a single block works on multiple rows' norms at the +// same time. +// T: the type we are doing the math in (e.g., float, half) +// TVec: the potentially vectorized type we are loading in (e.g., +// float4, half2) +template +__global__ void +l2NormRowMajor(Tensor input, + Tensor output) { + extern __shared__ char smemByte[]; // #warps * RowTileSize elements + T* smem = (T*) smemByte; + + IndexType numWarps = utils::divUp(blockDim.x, kWarpSize); + IndexType laneId = getLaneId(); + IndexType warpId = threadIdx.x / kWarpSize; + + bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); + IndexType rowStart = RowTileSize * blockIdx.x; + T rowNorm[RowTileSize]; + + if (lastRowTile) { + // We are handling the very end of the input matrix rows + for (IndexType row = 0; row < input.getSize(0) - rowStart; ++row) { + if (NormLoop) { + rowNorm[0] = Math::zero(); + + for (IndexType col = threadIdx.x; + col < input.getSize(1); col += blockDim.x) { + TVec val = input[rowStart + row][col]; + val = Math::mul(val, val); + rowNorm[0] = Math::add(rowNorm[0], Math::reduceAdd(val)); + } + } else { + TVec val = input[rowStart + row][threadIdx.x]; + val = Math::mul(val, val); + rowNorm[0] = Math::reduceAdd(val); + } + + rowNorm[0] = warpReduceAllSum(rowNorm[0]); + if (laneId == 0) { + smem[row * numWarps + warpId] = rowNorm[0]; + } + } + } else { + // We are guaranteed that all RowTileSize rows are available in + // [rowStart, rowStart + RowTileSize) + + if (NormLoop) { + // A single block of threads is not big enough to span each + // vector + TVec tmp[RowTileSize]; + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = Math::zero(); + } + + for (IndexType col = threadIdx.x; + col < input.getSize(1); col += blockDim.x) { +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + tmp[row] = input[rowStart + row][col]; + } + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + tmp[row] = Math::mul(tmp[row], tmp[row]); + } + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = Math::add(rowNorm[row], + Math::reduceAdd(tmp[row])); + } + } + } else { + TVec tmp[RowTileSize]; + + // A block of threads is the exact size of the vector +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + tmp[row] = input[rowStart + row][threadIdx.x]; + } + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + tmp[row] = Math::mul(tmp[row], tmp[row]); + } + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = Math::reduceAdd(tmp[row]); + } + } + + // Sum up all parts in each warp +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = warpReduceAllSum(rowNorm[row]); + } + + if (laneId == 0) { +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + smem[row * numWarps + warpId] = rowNorm[row]; + } + } + } + + __syncthreads(); + + // Sum across warps + if (warpId == 0) { +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = laneId < numWarps ? 
+ smem[row * numWarps + laneId] : Math::zero(); + } + +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + rowNorm[row] = warpReduceAllSum(rowNorm[row]); + } + + // Write out answer + if (laneId == 0) { +#pragma unroll + for (int row = 0; row < RowTileSize; ++row) { + int outCol = rowStart + row; + + if (lastRowTile) { + if (outCol < output.getSize(0)) { + output[outCol] = + NormSquared ? rowNorm[row] : + ConvertTo::to( + sqrtf(ConvertTo::to(rowNorm[row]))); + } + } else { + output[outCol] = + NormSquared ? rowNorm[row] : + ConvertTo::to( + sqrtf(ConvertTo::to(rowNorm[row]))); + } + } + } + } +} + +// Input: (dim x batch) +// Output: (batch norm) +// Handles the case where `input` is column major. A single thread calculates +// the norm of each vector instead of a block-wide reduction. +template +__global__ void +l2NormColMajor(Tensor input, + Tensor output) { + // grid-stride loop to handle all batch elements + for (IndexType batch = blockIdx.x * blockDim.x + threadIdx.x; + batch < input.getSize(1); + batch += gridDim.x * blockDim.x) { + float sum = 0; + + // This is still a coalesced load from the memory + for (IndexType dim = 0; dim < input.getSize(0); ++dim) { + // Just do the math in float32, even if the input is float16 + float v = ConvertTo::to(input[dim][batch]); + sum += v * v; + } + + if (!NormSquared) { + sum = sqrtf(sum); + } + + output[batch] = ConvertTo::to(sum); + } +} + +template +void runL2Norm(Tensor& input, + bool inputRowMajor, + Tensor& output, + bool normSquared, + cudaStream_t stream) { + IndexType maxThreads = (IndexType) getMaxThreadsCurrentDevice(); + constexpr int rowTileSize = 8; + +#define RUN_L2_ROW_MAJOR(TYPE_T, TYPE_TVEC, INPUT) \ + do { \ + if (normLoop) { \ + if (normSquared) { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } else { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } \ + } else { \ + if (normSquared) { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } else { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } \ + } \ + } while (0) + + if (inputRowMajor) { + // + // Row-major kernel + /// + + if (input.template canCastResize()) { + // Can load using the vectorized type + auto inputV = input.template castResize(); + + auto dim = inputV.getSize(1); + bool normLoop = dim > maxThreads; + auto numThreads = min(dim, maxThreads); + + auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); + auto block = dim3(numThreads); + + auto smem = sizeof(T) * rowTileSize * utils::divUp(numThreads, kWarpSize); + + RUN_L2_ROW_MAJOR(T, TVec, inputV); + } else { + // Can't load using the vectorized type + + auto dim = input.getSize(1); + bool normLoop = dim > maxThreads; + auto numThreads = min(dim, maxThreads); + + auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); + auto block = dim3(numThreads); + + auto smem = sizeof(T) * rowTileSize * utils::divUp(numThreads, kWarpSize); + + RUN_L2_ROW_MAJOR(T, T, input); + } + } else { + // + // Column-major kernel + // + + // Just use a fixed-sized block, since the kernel threads are fully + // independent + auto block = 128; + + // Cap the grid size at 2^16 since there is a grid-stride loop to handle + // processing everything + auto grid = (int) + std::min(utils::divUp(input.getSize(1), (IndexType) block), + (IndexType) 65536); + + if (normSquared) { + l2NormColMajor<<>>( + input, output); + } else { + l2NormColMajor<<>>( + input, output); + } + } + +#undef RUN_L2 + + CUDA_TEST_ERROR(); +} + +void runL2Norm(Tensor& input, + bool inputRowMajor, + Tensor& output, + bool normSquared, + 
cudaStream_t stream) { + if (input.canUseIndexType()) { + runL2Norm( + input, inputRowMajor, output, normSquared, stream); + } else { + auto inputCast = input.castIndexType(); + auto outputCast = output.castIndexType(); + + runL2Norm( + inputCast, inputRowMajor, outputCast, normSquared, stream); + } +} + +void runL2Norm(Tensor& input, + bool inputRowMajor, + Tensor& output, + bool normSquared, + cudaStream_t stream) { + if (input.canUseIndexType()) { + runL2Norm( + input, inputRowMajor, output, normSquared, stream); + } else { + auto inputCast = input.castIndexType(); + auto outputCast = output.castIndexType(); + + runL2Norm( + inputCast, inputRowMajor, outputCast, normSquared, stream); + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh new file mode 100644 index 0000000000..1841f4b3a3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh @@ -0,0 +1,27 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +void runL2Norm(Tensor& input, + bool inputRowMajor, + Tensor& output, + bool normSquared, + cudaStream_t stream); + +void runL2Norm(Tensor& input, + bool inputRowMajor, + Tensor& output, + bool normSquared, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu new file mode 100644 index 0000000000..1480ec07df --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu @@ -0,0 +1,253 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// L2 + select kernel for k == 1, implements re-use of ||c||^2 +template +__global__ void l2SelectMin1(Tensor productDistances, + Tensor centroidDistances, + Tensor outDistances, + Tensor outIndices) { + // Each block handles kRowsPerBlock rows of the distances (results) + Pair threadMin[kRowsPerBlock]; + __shared__ Pair blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; + + T distance[kRowsPerBlock]; + +#pragma unroll + for (int i = 0; i < kRowsPerBlock; ++i) { + threadMin[i].k = Limits::getMax(); + threadMin[i].v = -1; + } + + // blockIdx.x: which chunk of rows we are responsible for updating + int rowStart = blockIdx.x * kRowsPerBlock; + + // FIXME: if we have exact multiples, don't need this + bool endRow = (blockIdx.x == gridDim.x - 1); + + if (endRow) { + if (productDistances.getSize(0) % kRowsPerBlock == 0) { + endRow = false; + } + } + + if (endRow) { + for (int row = rowStart; row < productDistances.getSize(0); ++row) { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + distance[0] = Math::add(centroidDistances[col], + productDistances[row][col]); + + if (Math::lt(distance[0], threadMin[0].k)) { + threadMin[0].k = distance[0]; + threadMin[0].v = col; + } + } + + // Reduce within the block + threadMin[0] = + blockReduceAll, Min >, false, false>( + threadMin[0], Min >(), blockMin); + + if (threadIdx.x == 0) { + outDistances[row][0] = threadMin[0].k; + outIndices[row][0] = threadMin[0].v; + } + + // so we can use the shared memory again + __syncthreads(); + + threadMin[0].k = Limits::getMax(); + threadMin[0].v = -1; + } + } else { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + T centroidDistance = centroidDistances[col]; + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = productDistances[rowStart + row][col]; + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = Math::add(distance[row], centroidDistance); + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + if (Math::lt(distance[row], threadMin[row].k)) { + threadMin[row].k = distance[row]; + threadMin[row].v = col; + } + } + } + + // Reduce within the block + blockReduceAll, Min >, false, false>( + threadMin, Min >(), blockMin); + + if (threadIdx.x == 0) { +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + outDistances[rowStart + row][0] = threadMin[row].k; + outIndices[rowStart + row][0] = threadMin[row].v; + } + } + } +} + +// L2 + select kernel for k > 1, no re-use of ||c||^2 +template +__global__ void l2SelectMinK(Tensor productDistances, + Tensor centroidDistances, + Tensor outDistances, + Tensor outIndices, + int k, T initK) { + // Each block handles a single row of the distances (results) + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ T smemK[kNumWarps * NumWarpQ]; + __shared__ int smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, -1, smemK, smemV, k); + + int row = blockIdx.x; + + // Whole warps must participate in the selection + int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); + int i = threadIdx.x; + + for (; i < limit; i += blockDim.x) { + T v = Math::add(centroidDistances[i], + productDistances[row][i]); + heap.add(v, i); + } + + if (i < productDistances.getSize(1)) { + T v = 
Math::add(centroidDistances[i], + productDistances[row][i]); + heap.addThreadQ(v, i); + } + + heap.reduce(); + for (int i = threadIdx.x; i < k; i += blockDim.x) { + outDistances[row][i] = smemK[i]; + outIndices[row][i] = smemV[i]; + } +} + +template +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream) { + FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0)); + FAISS_ASSERT(productDistances.getSize(0) == outIndices.getSize(0)); + FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1)); + FAISS_ASSERT(outDistances.getSize(1) == k); + FAISS_ASSERT(outIndices.getSize(1) == k); + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + if (k == 1) { + constexpr int kThreadsPerBlock = 256; + constexpr int kRowsPerBlock = 8; + + auto block = dim3(kThreadsPerBlock); + auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock)); + + l2SelectMin1 + <<>>(productDistances, centroidDistances, + outDistances, outIndices); + } else { + auto grid = dim3(outDistances.getSize(0)); + +#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ + do { \ + l2SelectMinK \ + <<>>(productDistances, centroidDistances, \ + outDistances, outIndices, \ + k, Limits::getMax()); \ + } while (0) + + // block size 128 for everything <= 1024 + if (k <= 32) { + RUN_L2_SELECT(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT(128, 1024, 8); + +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT(64, 2048, 8); +#endif + + } else { + FAISS_ASSERT(false); + } + } + + CUDA_TEST_ERROR(); +} + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream) { + runL2SelectMin(productDistances, + centroidDistances, + outDistances, + outIndices, + k, + stream); +} + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream) { + runL2SelectMin(productDistances, + centroidDistances, + outDistances, + outIndices, + k, + stream); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh new file mode 100644 index 0000000000..95c35ca571 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh @@ -0,0 +1,29 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Metrics.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Metrics.cuh new file mode 100644 index 0000000000..5b9feac3ee --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/Metrics.cuh @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +namespace faiss { namespace gpu { + +/// List of supported metrics +inline bool isMetricSupported(MetricType mt) { + switch (mt) { + case MetricType::METRIC_INNER_PRODUCT: + case MetricType::METRIC_L2: + return true; + default: + return false; + } +} + +/// Sort direction per each metric +inline bool metricToSortDirection(MetricType mt) { + switch (mt) { + case MetricType::METRIC_INNER_PRODUCT: + // highest + return true; + case MetricType::METRIC_L2: + // lowest + return false; + default: + // unhandled metric + FAISS_ASSERT(false); + return false; + } +} + +struct L2Metric { + static inline __device__ float distance(float a, float b) { + float d = a - b; + return d * d; + } +}; + +struct IPMetric { + static inline __device__ float distance(float a, float b) { + return a * b; + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu new file mode 100644 index 0000000000..73a6952dcc --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu @@ -0,0 +1,501 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +struct Converter { +}; + +template <> +struct Converter { + inline static __device__ half to(float v) { return __float2half(v); } +}; + +template <> +struct Converter { + inline static __device__ float to(float v) { return v; } +}; + +// Kernel responsible for calculating distance from residual vector to +// each product quantizer code centroid +template +__global__ void +__launch_bounds__(288, 4) +pqCodeDistances(Tensor queries, + int queriesPerBlock, + Tensor coarseCentroids, + Tensor pqCentroids, + Tensor topQueryToCentroid, + // (query id)(coarse)(subquantizer)(code) -> dist + Tensor outCodeDistances) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + assert(DimsPerSubQuantizer == dimsPerSubQuantizer); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; + int loadingThreadId = threadIdx.x - codesPerSubQuantizer; + + extern __shared__ float smem[]; + + // Each thread calculates a single code + float subQuantizerData[DimsPerSubQuantizer]; + + auto code = threadIdx.x; + auto subQuantizer = blockIdx.y; + + // Each thread will load the pq centroid data for the code that it + // is processing +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer; ++i) { + subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); + } + + // Where we store our query vector + float* smemQuery = smem; + + // Where we store our residual vector; this is double buffered so we + // can be loading the next one while processing the current one + float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; + float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; + + // Where we pre-load the coarse centroid IDs + int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; + + // Each thread is calculating the distance for a single code, + // performing the reductions locally + + // Handle multiple queries per block + auto startQueryId = blockIdx.x * queriesPerBlock; + auto numQueries = queries.getSize(0) - startQueryId; + if (numQueries > queriesPerBlock) { + numQueries = queriesPerBlock; + } + + for (int query = 0; query < numQueries; ++query) { + auto queryId = startQueryId + query; + + auto querySubQuantizer = + queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); + + // Load current query vector + for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { + smemQuery[i] = querySubQuantizer[i]; + } + + // Load list of coarse centroids found + for (int i = threadIdx.x; + i < topQueryToCentroid.getSize(1); i += blockDim.x) { + coarseIds[i] = topQueryToCentroid[queryId][i]; + } + + // We need coarseIds below + // FIXME: investigate loading separately, so we don't need this + __syncthreads(); + + // Preload first buffer of residual data + if (isLoadingThread) { + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + auto coarseId = coarseIds[0]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 
0 : coarseId; + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); + + smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + } + } + + // The block walks the list for a single query + for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { + // Wait for smemResidual1 to be loaded + __syncthreads(); + + if (isLoadingThread) { + // Preload second buffer of residual data + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + // FIXME: try always making this centroid id 0 so we can + // terminate + if (coarse != (topQueryToCentroid.getSize(1) - 1)) { + auto coarseId = coarseIds[coarse + 1]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 0 : coarseId; + + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); + + smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + } + } + } else { + // These are the processing threads + float dist = 0.0f; + + constexpr int kUnroll = 4; + constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; + constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; + float vals[kUnroll]; + + // Calculate residual - pqCentroid for each dim that we're + // processing + + // Unrolled loop +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] -= subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= vals[j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } + } + + // Remainder loop +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] -= subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= vals[j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + dist += vals[j]; + } + + // We have the distance for our code; write it out + outCodeDistances[queryId][coarse][subQuantizer][code] = + Converter::to(dist); + } // !isLoadingThread + + // Swap residual buffers + float* tmp = smemResidual1; + smemResidual1 = smemResidual2; + smemResidual2 = tmp; + } + } +} + +__global__ void +residualVector(Tensor queries, + Tensor coarseCentroids, + Tensor topQueryToCentroid, + int numSubDim, + // output is transposed: + // (sub q)(query id)(centroid id)(sub dim) + Tensor residual) { + // block x is query id + // block y is centroid id + // thread x is dim + auto queryId = blockIdx.x; + auto centroidId = blockIdx.y; + + int realCentroidId = topQueryToCentroid[queryId][centroidId]; + + for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { + float q = queries[queryId][dim]; + float c = coarseCentroids[realCentroidId][dim]; + + residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = + q - c; + } +} + +void +runResidualVector(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + Tensor& residual, + cudaStream_t stream) { + auto grid = + dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); + auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); + + residualVector<<>>( + queries, 
coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), + residual); + + CUDA_TEST_ERROR(); +} + +void +runPQCodeDistancesMM(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream) { + // Calculate (q - c) residual vector + // (sub q)(query id)(centroid id)(sub dim) + DeviceTensor residual( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0), + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}, + stream); + + runResidualVector(pqCentroids, queries, + coarseCentroids, topQueryToCentroid, + residual, stream); + + // Calculate ||q - c||^2 + DeviceTensor residualNorms( + mem, + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1)}, + stream); + + auto residualView2 = residual.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + runL2Norm(residualView2, true, residualNorms, true, stream); + + // Perform a batch MM: + // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => + // (sub q) x {(q * c)(code)} + auto residualView3 = residual.view<3>( + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + DeviceTensor residualDistance( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}, + stream); + + runIteratedMatrixMult(residualDistance, false, + residualView3, false, + pqCentroids, false, + -2.0f, 0.0f, + handle, + stream); + + // Sum ||q - c||^2 along rows + auto residualDistanceView2 = residualDistance.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}); + + runSumAlongRows(residualNorms, residualDistanceView2, false, stream); + + Tensor outCodeDistancesF; + DeviceTensor outCodeDistancesFloatMem; + + if (useFloat16Lookup) { + outCodeDistancesFloatMem = DeviceTensor( + mem, {outCodeDistances.getSize(0), + outCodeDistances.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}, + stream); + + outCodeDistancesF = outCodeDistancesFloatMem; + } else { + outCodeDistancesF = outCodeDistances.toTensor(); + } + + // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which + // is where we build our output distances) + auto outCodeDistancesView = outCodeDistancesF.view<3>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}); + + runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); + + // Calculate code norms per each sub-dim + // (sub q)(sub dim)(code) is pqCentroids + // transpose to (sub q)(code)(sub dim) + DeviceTensor pqCentroidsTranspose( + mem, + {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, + stream); + + runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); + + auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( + {pqCentroids.getSize(0) * pqCentroids.getSize(2), + pqCentroids.getSize(1)}); + + DeviceTensor pqCentroidsNorm( + mem, + {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, + stream); + + runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); + + // View output as (q * c)(sub q * code), and add centroid norm to + // each row + auto outDistancesCodeViewCols = 
outCodeDistancesView.view<2>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); + + runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); + + if (useFloat16Lookup) { + // Need to convert back + auto outCodeDistancesH = outCodeDistances.toTensor(); + convertTensor(stream, + outCodeDistancesF, + outCodeDistancesH); + } +} + +void +runPQCodeDistances(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + cudaStream_t stream) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // FIXME: tune + // Reuse of pq centroid data is based on both # of queries * nprobe, + // and we should really be tiling in both dimensions + constexpr int kQueriesPerBlock = 8; + + auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), + numSubQuantizers); + + // Reserve one block of threads for double buffering + // FIXME: probably impractical for large # of dims? + auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); + auto block = dim3(codesPerSubQuantizer + loadingThreads); + + auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + + topQueryToCentroid.getSize(1) * sizeof(int); + +#define CODE_DISTANCE(DIMS) \ + do { \ + if (useFloat16Lookup) { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } else { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } \ + } while (0) + + switch (dimsPerSubQuantizer) { + case 1: + CODE_DISTANCE(1); + break; + case 2: + CODE_DISTANCE(2); + break; + case 3: + CODE_DISTANCE(3); + break; + case 4: + CODE_DISTANCE(4); + break; + case 6: + CODE_DISTANCE(6); + break; + case 8: + CODE_DISTANCE(8); + break; + case 10: + CODE_DISTANCE(10); + break; + case 12: + CODE_DISTANCE(12); + break; + case 16: + CODE_DISTANCE(16); + break; + case 20: + CODE_DISTANCE(20); + break; + case 24: + CODE_DISTANCE(24); + break; + case 28: + CODE_DISTANCE(28); + break; + case 32: + CODE_DISTANCE(32); + break; + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + default: + FAISS_ASSERT(false); + break; + } + +#undef CODE_DISTANCE + + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh new file mode 100644 index 0000000000..67f9159178 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh @@ -0,0 +1,41 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +class DeviceMemory; + +/// pqCentroids is of the form (sub q)(sub dim)(code id) +/// Calculates the distance from the (query - centroid) residual to +/// each sub-code vector, for the given list of query results in +/// topQueryToCentroid +void runPQCodeDistances(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + cudaStream_t stream); + +void runPQCodeDistancesMM(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeLoad.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeLoad.cuh new file mode 100644 index 0000000000..da933b1d00 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeLoad.cuh @@ -0,0 +1,357 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +#if __CUDA_ARCH__ >= 350 +// Use the CC 3.5+ read-only texture cache (nc) +#define LD_NC_V1 "ld.global.cs.nc.u32" +#define LD_NC_V2 "ld.global.cs.nc.v2.u32" +#define LD_NC_V4 "ld.global.cs.nc.v4.u32" +#else +// Read normally +#define LD_NC_V1 "ld.global.cs.u32" +#define LD_NC_V2 "ld.global.cs.v2.u32" +#define LD_NC_V4 "ld.global.cs.v4.u32" +#endif // __CUDA_ARCH__ + +/// +/// This file contains loader functions for PQ codes of various byte +/// length. 
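+///
+/// Each LoadCode32<N> specialization below reads one N-byte PQ code into
+/// ceil(N / 4) 32-bit registers, using the widest PTX load that the code
+/// width and its alignment allow (u8 / u16 / u32, or the v2.u32 / v4.u32
+/// vector forms), and routes through the read-only cache on CC 3.5+.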
+/// + +// Type-specific wrappers around the PTX bfe.* instruction, for +// quantization code extraction +inline __device__ unsigned int getByte(unsigned char v, + int pos, + int width) { + return v; +} + +inline __device__ unsigned int getByte(unsigned short v, + int pos, + int width) { + return getBitfield((unsigned int) v, pos, width); +} + +inline __device__ unsigned int getByte(unsigned int v, + int pos, + int width) { + return getBitfield(v, pos, width); +} + +inline __device__ unsigned int getByte(unsigned long v, + int pos, + int width) { + return getBitfield(v, pos, width); +} + +template +struct LoadCode32 {}; + +template<> +struct LoadCode32<1> { + static inline __device__ void load(unsigned int code32[1], + unsigned char* p, + int offset) { + p += offset * 1; + asm("ld.global.cs.u8 {%0}, [%1];" : + "=r"(code32[0]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<2> { + static inline __device__ void load(unsigned int code32[1], + unsigned char* p, + int offset) { + p += offset * 2; + asm("ld.global.cs.u16 {%0}, [%1];" : + "=r"(code32[0]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<3> { + static inline __device__ void load(unsigned int code32[1], + unsigned char* p, + int offset) { + p += offset * 3; + unsigned int a; + unsigned int b; + unsigned int c; + + // FIXME: this is a non-coalesced, unaligned, non-vectorized load + // unfortunately need to reorganize memory layout by warp + asm("ld.global.cs.u8 {%0}, [%1 + 0];" : + "=r"(a) : "l"(p)); + asm("ld.global.cs.u8 {%0}, [%1 + 1];" : + "=r"(b) : "l"(p)); + asm("ld.global.cs.u8 {%0}, [%1 + 2];" : + "=r"(c) : "l"(p)); + + // FIXME: this is also slow, since we have to recover the + // individual bytes loaded + code32[0] = (c << 16) | (b << 8) | a; + } +}; + +template<> +struct LoadCode32<4> { + static inline __device__ void load(unsigned int code32[1], + unsigned char* p, + int offset) { + p += offset * 4; + asm("ld.global.cs.u32 {%0}, [%1];" : + "=r"(code32[0]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<8> { + static inline __device__ void load(unsigned int code32[2], + unsigned char* p, + int offset) { + p += offset * 8; + asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" : + "=r"(code32[0]), "=r"(code32[1]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<12> { + static inline __device__ void load(unsigned int code32[3], + unsigned char* p, + int offset) { + p += offset * 12; + // FIXME: this is a non-coalesced, unaligned, non-vectorized load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V1 " {%0}, [%1 + 0];" : + "=r"(code32[0]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 4];" : + "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 8];" : + "=r"(code32[2]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<16> { + static inline __device__ void load(unsigned int code32[4], + unsigned char* p, + int offset) { + p += offset * 16; + asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" : + "=r"(code32[0]), "=r"(code32[1]), + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<20> { + static inline __device__ void load(unsigned int code32[5], + unsigned char* p, + int offset) { + p += offset * 20; + // FIXME: this is a non-coalesced, unaligned, non-vectorized load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V1 " {%0}, [%1 + 0];" : + "=r"(code32[0]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 4];" : + "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 8];" : + "=r"(code32[2]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 12];" : + 
"=r"(code32[3]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 16];" : + "=r"(code32[4]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<24> { + static inline __device__ void load(unsigned int code32[6], + unsigned char* p, + int offset) { + p += offset * 24; + // FIXME: this is a non-coalesced, unaligned, 2-vectorized load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" : + "=r"(code32[0]), "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" : + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" : + "=r"(code32[4]), "=r"(code32[5]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<28> { + static inline __device__ void load(unsigned int code32[7], + unsigned char* p, + int offset) { + p += offset * 28; + // FIXME: this is a non-coalesced, unaligned, non-vectorized load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V1 " {%0}, [%1 + 0];" : + "=r"(code32[0]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 4];" : + "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 8];" : + "=r"(code32[2]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 12];" : + "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 16];" : + "=r"(code32[4]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 20];" : + "=r"(code32[5]) : "l"(p)); + asm(LD_NC_V1 " {%0}, [%1 + 24];" : + "=r"(code32[6]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<32> { + static inline __device__ void load(unsigned int code32[8], + unsigned char* p, + int offset) { + p += offset * 32; + // FIXME: this is a non-coalesced load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" : + "=r"(code32[0]), "=r"(code32[1]), + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" : + "=r"(code32[4]), "=r"(code32[5]), + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<40> { + static inline __device__ void load(unsigned int code32[10], + unsigned char* p, + int offset) { + p += offset * 40; + // FIXME: this is a non-coalesced, unaligned, 2-vectorized load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" : + "=r"(code32[0]), "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" : + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" : + "=r"(code32[4]), "=r"(code32[5]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" : + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" : + "=r"(code32[8]), "=r"(code32[9]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<48> { + static inline __device__ void load(unsigned int code32[12], + unsigned char* p, + int offset) { + p += offset * 48; + // FIXME: this is a non-coalesced load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" : + "=r"(code32[0]), "=r"(code32[1]), + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" : + "=r"(code32[4]), "=r"(code32[5]), + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" : + "=r"(code32[8]), "=r"(code32[9]), + "=r"(code32[10]), "=r"(code32[11]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<56> { + static inline __device__ void load(unsigned int code32[14], + unsigned char* p, + int offset) { + p += offset * 56; + // FIXME: this is a non-coalesced, unaligned, 2-vectorized load + // unfortunately need to 
reorganize memory layout by warp + asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" : + "=r"(code32[0]), "=r"(code32[1]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" : + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" : + "=r"(code32[4]), "=r"(code32[5]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" : + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" : + "=r"(code32[8]), "=r"(code32[9]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 40];" : + "=r"(code32[10]), "=r"(code32[11]) : "l"(p)); + asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" : + "=r"(code32[12]), "=r"(code32[13]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<64> { + static inline __device__ void load(unsigned int code32[16], + unsigned char* p, + int offset) { + p += offset * 64; + // FIXME: this is a non-coalesced load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" : + "=r"(code32[0]), "=r"(code32[1]), + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" : + "=r"(code32[4]), "=r"(code32[5]), + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" : + "=r"(code32[8]), "=r"(code32[9]), + "=r"(code32[10]), "=r"(code32[11]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" : + "=r"(code32[12]), "=r"(code32[13]), + "=r"(code32[14]), "=r"(code32[15]) : "l"(p)); + } +}; + +template<> +struct LoadCode32<96> { + static inline __device__ void load(unsigned int code32[24], + unsigned char* p, + int offset) { + p += offset * 96; + // FIXME: this is a non-coalesced load + // unfortunately need to reorganize memory layout by warp + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" : + "=r"(code32[0]), "=r"(code32[1]), + "=r"(code32[2]), "=r"(code32[3]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" : + "=r"(code32[4]), "=r"(code32[5]), + "=r"(code32[6]), "=r"(code32[7]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" : + "=r"(code32[8]), "=r"(code32[9]), + "=r"(code32[10]), "=r"(code32[11]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" : + "=r"(code32[12]), "=r"(code32[13]), + "=r"(code32[14]), "=r"(code32[15]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 64];" : + "=r"(code32[16]), "=r"(code32[17]), + "=r"(code32[18]), "=r"(code32[19]) : "l"(p)); + asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 80];" : + "=r"(code32[20]), "=r"(code32[21]), + "=r"(code32[22]), "=r"(code32[23]) : "l"(p)); + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu new file mode 100644 index 0000000000..d885d5f7ba --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu @@ -0,0 +1,587 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { namespace gpu { + +// This must be kept in sync with PQCodeDistances.cu +bool isSupportedNoPrecomputedSubDimSize(int dims) { + switch (dims) { + case 1: + case 2: + case 3: + case 4: + case 6: + case 8: + case 10: + case 12: + case 16: + case 20: + case 24: + case 28: + case 32: + return true; + default: + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + return false; + } +} + +template +struct LoadCodeDistances { + static inline __device__ void load(LookupT* smem, + LookupT* codes, + int numCodes) { + constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); + + // We can only use the vector type if the data is guaranteed to be + // aligned. The codes are innermost, so if it is evenly divisible, + // then any slice will be aligned. + if (numCodes % kWordSize == 0) { + // Load the data by float4 for efficiency, and then handle any remainder + // limitVec is the number of whole vec words we can load, in terms + // of whole blocks performing the load + constexpr int kUnroll = 2; + int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); + limitVec *= kUnroll * blockDim.x; + + LookupVecT* smemV = (LookupVecT*) smem; + LookupVecT* codesV = (LookupVecT*) codes; + + for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { + LookupVecT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = + LoadStore::load(&codesV[i + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); + } + } + + // This is where we start loading the remainder that does not evenly + // fit into kUnroll x blockDim.x + int remainder = limitVec * kWordSize; + + for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } else { + // Potential unaligned load + constexpr int kUnroll = 4; + + int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); + + int i = threadIdx.x; + for (; i < limit; i += kUnroll * blockDim.x) { + LookupT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = codes[i + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + smem[i + j * blockDim.x] = vals[j]; + } + } + + for (; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } + } +}; + +template +__global__ void +pqScanNoPrecomputedMultiPass(Tensor queries, + Tensor pqCentroids, + Tensor topQueryToCentroid, + Tensor codeDistances, + void** listCodes, + int* listLengths, + Tensor prefixSumOffsets, + Tensor distance) { + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // Where the pq code -> residual distance is stored + extern __shared__ char smemCodeDistances[]; + LookupT* codeDist = (LookupT*) smemCodeDistances; + + // Each block handles a single query + auto queryId = blockIdx.y; + auto probeId = blockIdx.x; + + // This is where we start writing out data + // We ensure that before the array (at offset -1), there is a 0 value + int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + float* distanceOut = distance[outBase].data(); + + auto listId = topQueryToCentroid[queryId][probeId]; + // Safety guard in case NaNs in input cause no list ID to be generated + if (listId == -1) { + return; + } + + unsigned char* codeList = (unsigned char*) listCodes[listId]; + int limit = 
listLengths[listId]; + + constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 : + (NumSubQuantizers / 4); + unsigned int code32[kNumCode32]; + unsigned int nextCode32[kNumCode32]; + + // We double-buffer the code loading, which improves memory utilization + if (threadIdx.x < limit) { + LoadCode32::load(code32, codeList, threadIdx.x); + } + + LoadCodeDistances::load( + codeDist, + codeDistances[queryId][probeId].data(), + codeDistances.getSize(2) * codeDistances.getSize(3)); + + // Prevent WAR dependencies + __syncthreads(); + + // Each thread handles one code element in the list, with a + // block-wide stride + for (int codeIndex = threadIdx.x; + codeIndex < limit; + codeIndex += blockDim.x) { + // Prefetch next codes + if (codeIndex + blockDim.x < limit) { + LoadCode32::load( + nextCode32, codeList, codeIndex + blockDim.x); + } + + float dist = 0.0f; + +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + constexpr int kBytesPerCode32 = + NumSubQuantizers < 4 ? NumSubQuantizers : 4; + + if (kBytesPerCode32 == 1) { + auto code = code32[0]; + dist = ConvertTo::to(codeDist[code]); + + } else { +#pragma unroll + for (int byte = 0; byte < kBytesPerCode32; ++byte) { + auto code = getByte(code32[word], byte * 8, 8); + + auto offset = + codesPerSubQuantizer * (word * kBytesPerCode32 + byte); + + dist += ConvertTo::to(codeDist[offset + code]); + } + } + } + + // Write out intermediate distance result + // We do not maintain indices here, in order to reduce global + // memory traffic. Those are recovered in the final selection step. + distanceOut[codeIndex] = dist; + + // Rotate buffers +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + code32[word] = nextCode32[word]; + } + } +} + +void +runMultiPassTile(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + NoTypeTensor<4, true>& codeDistances, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + Tensor& thrustMem, + Tensor& prefixSumOffsets, + Tensor& allDistances, + Tensor& heapDistances, + Tensor& heapIndices, + int k, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + // Calculate offset lengths, so we know where to write out + // intermediate results + runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, + thrustMem, stream); + + // Calculate residual code distances, since this is without + // precomputed codes + runPQCodeDistances(pqCentroidsInnermostCode, + queries, + centroids, + topQueryToCentroid, + codeDistances, + useFloat16Lookup, + stream); + + // Convert all codes to a distance, and write out (distance, + // index) values for all intermediate results + { + auto kThreadsPerBlock = 256; + + auto grid = dim3(topQueryToCentroid.getSize(1), + topQueryToCentroid.getSize(0)); + auto block = dim3(kThreadsPerBlock); + + // pq centroid distances + auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + + smem *= numSubQuantizers * numSubQuantizerCodes; + FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); + +#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ + do { \ + auto codeDistancesT = codeDistances.toTensor(); \ + \ + pqScanNoPrecomputedMultiPass \ + <<>>( \ + queries, \ + pqCentroidsInnermostCode, \ + topQueryToCentroid, \ + codeDistancesT, \ + listCodes.data().get(), \ + listLengths.data().get(), \ + prefixSumOffsets, \ + allDistances); \ + } while (0) + +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + if (useFloat16Lookup) { \ + RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ + } else { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } \ + } while (0) + + switch (bytesPerCode) { + case 1: + RUN_PQ(1); + break; + case 2: + RUN_PQ(2); + break; + case 3: + RUN_PQ(3); + break; + case 4: + RUN_PQ(4); + break; + case 8: + RUN_PQ(8); + break; + case 12: + RUN_PQ(12); + break; + case 16: + RUN_PQ(16); + break; + case 20: + RUN_PQ(20); + break; + case 24: + RUN_PQ(24); + break; + case 28: + RUN_PQ(28); + break; + case 32: + RUN_PQ(32); + break; + case 40: + RUN_PQ(40); + break; + case 48: + RUN_PQ(48); + break; + case 56: + RUN_PQ(56); + break; + case 64: + RUN_PQ(64); + break; + case 96: + RUN_PQ(96); + break; + default: + FAISS_ASSERT(false); + break; + } + +#undef RUN_PQ +#undef RUN_PQ_OPT + } + + CUDA_TEST_ERROR(); + + // k-select the output in chunks, to increase parallelism + runPass1SelectLists(prefixSumOffsets, + allDistances, + topQueryToCentroid.getSize(1), + k, + false, // L2 distance chooses smallest + heapDistances, + heapIndices, + stream); + + // k-select final output + auto flatHeapDistances = heapDistances.downcastInner<2>(); + auto flatHeapIndices = heapIndices.downcastInner<2>(); + + runPass2SelectLists(flatHeapDistances, + flatHeapIndices, + listIndices, + indicesOptions, + prefixSumOffsets, + topQueryToCentroid, + k, + false, // L2 distance chooses smallest + outDistances, + outIndices, + stream); +} + +void runPQScanMultiPassNoPrecomputed(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res) { + constexpr int kMinQueryTileSize = 8; + constexpr int kMaxQueryTileSize = 128; + constexpr int kThrustMemSize = 16384; + + int nprobe = topQueryToCentroid.getSize(1); + + auto& mem = res->getMemoryManagerCurrentDevice(); + auto stream = res->getDefaultStreamCurrentDevice(); + + // Make a reservation for Thrust to do its dirty work (global memory + // cross-block reduction space); hopefully this is large enough. + DeviceTensor thrustMem1( + mem, {kThrustMemSize}, stream); + DeviceTensor thrustMem2( + mem, {kThrustMemSize}, stream); + DeviceTensor* thrustMem[2] = + {&thrustMem1, &thrustMem2}; + + // How much temporary storage is available? + // If possible, we'd like to fit within the space available. 
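+  // Queries are processed in tiles: the per-query scratch computed below
+  // (prefix-sum offsets, per-code distances, the residual code-distance
+  // tables and the first-pass k-select output, double-buffered across two
+  // streams) must fit in the scratch space reported by the memory manager,
+  // and the resulting tile size is clamped to
+  // [kMinQueryTileSize, kMaxQueryTileSize].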
+ size_t sizeAvailable = mem.getSizeAvailable(); + + // We run two passes of heap selection + // This is the size of the first-level heap passes + constexpr int kNProbeSplit = 8; + int pass2Chunks = std::min(nprobe, kNProbeSplit); + + size_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(int)); + + // How much temporary storage we need per each query + size_t sizePerQuery = + 2 * // streams + ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances + // residual distances + nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + + sizeForFirstSelectPass); + + int queryTileSize = (int) (sizeAvailable / sizePerQuery); + + if (queryTileSize < kMinQueryTileSize) { + queryTileSize = kMinQueryTileSize; + } else if (queryTileSize > kMaxQueryTileSize) { + queryTileSize = kMaxQueryTileSize; + } + + // FIXME: we should adjust queryTileSize to deal with this, since + // indexing is in int32 + FAISS_ASSERT(queryTileSize * nprobe * maxListLength < + std::numeric_limits::max()); + + // Temporary memory buffers + // Make sure there is space prior to the start which will be 0, and + // will handle the boundary condition without branches + DeviceTensor prefixSumOffsetSpace1( + mem, {queryTileSize * nprobe + 1}, stream); + DeviceTensor prefixSumOffsetSpace2( + mem, {queryTileSize * nprobe + 1}, stream); + + DeviceTensor prefixSumOffsets1( + prefixSumOffsetSpace1[1].data(), + {queryTileSize, nprobe}); + DeviceTensor prefixSumOffsets2( + prefixSumOffsetSpace2[1].data(), + {queryTileSize, nprobe}); + DeviceTensor* prefixSumOffsets[2] = + {&prefixSumOffsets1, &prefixSumOffsets2}; + + // Make sure the element before prefixSumOffsets is 0, since we + // depend upon simple, boundary-less indexing to get proper results + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), + 0, + sizeof(int), + stream)); + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), + 0, + sizeof(int), + stream)); + + int codeDistanceTypeSize = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + + int totalCodeDistancesSize = + queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * + codeDistanceTypeSize; + + DeviceTensor codeDistances1Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances1( + codeDistances1Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + DeviceTensor codeDistances2Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances2( + codeDistances2Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + NoTypeTensor<4, true>* codeDistances[2] = + {&codeDistances1, &codeDistances2}; + + DeviceTensor allDistances1( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor allDistances2( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor* allDistances[2] = + {&allDistances1, &allDistances2}; + + DeviceTensor heapDistances1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapDistances2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapDistances[2] = + {&heapDistances1, &heapDistances2}; + + DeviceTensor heapIndices1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapIndices2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapIndices[2] = + {&heapIndices1, &heapIndices2}; + + auto streams = res->getAlternateStreamsCurrentDevice(); + streamWait(streams, {stream}); + + int curStream = 0; + + for (int query = 0; query < queries.getSize(0); query += queryTileSize) { + int numQueriesInTile = + std::min(queryTileSize, queries.getSize(0) - query); + + auto prefixSumOffsetsView = + prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); + + auto codeDistancesView = + codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto coarseIndicesView = + topQueryToCentroid.narrowOutermost(query, numQueriesInTile); + auto queryView = + queries.narrowOutermost(query, numQueriesInTile); + + auto heapDistancesView = + heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto heapIndicesView = + heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); + + auto outDistanceView = + outDistances.narrowOutermost(query, numQueriesInTile); + auto outIndicesView = + outIndices.narrowOutermost(query, numQueriesInTile); + + runMultiPassTile(queryView, + centroids, + pqCentroidsInnermostCode, + codeDistancesView, + coarseIndicesView, + useFloat16Lookup, + bytesPerCode, + numSubQuantizers, + numSubQuantizerCodes, + listCodes, + listIndices, + indicesOptions, + listLengths, + *thrustMem[curStream], + prefixSumOffsetsView, + *allDistances[curStream], + heapDistancesView, + heapIndicesView, + k, + outDistanceView, + outIndicesView, + streams[curStream]); + + curStream = (curStream + 1) % 2; + } + + streamWait({stream}, streams); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh new file mode 100644 index 0000000000..3d77a0ff5c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh @@ -0,0 +1,43 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +/// For no precomputed codes, is this a supported number of dimensions +/// per subquantizer? +bool isSupportedNoPrecomputedSubDimSize(int dims); + +void runPQScanMultiPassNoPrecomputed(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu new file mode 100644 index 0000000000..58c2114595 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu @@ -0,0 +1,554 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// For precomputed codes, this calculates and loads code distances +// into smem +template +inline __device__ void +loadPrecomputedTerm(LookupT* smem, + LookupT* term2Start, + LookupT* term3Start, + int numCodes) { + constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); + + // We can only use vector loads if the data is guaranteed to be + // aligned. The codes are innermost, so if it is evenly divisible, + // then any slice will be aligned. 
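+  // Fast path: vectorized LookupVecT loads with a 2-way unroll when numCodes
+  // is a multiple of kWordSize; otherwise fall back to scalar LookupT loads
+  // with a 4-way unroll. Both paths sum term2 (per-list) and term3
+  // (per-query) element-wise into shared memory.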
+ if (numCodes % kWordSize == 0) { + constexpr int kUnroll = 2; + + // Load the data by float4 for efficiency, and then handle any remainder + // limitVec is the number of whole vec words we can load, in terms + // of whole blocks performing the load + int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); + limitVec *= kUnroll * blockDim.x; + + LookupVecT* smemV = (LookupVecT*) smem; + LookupVecT* term2StartV = (LookupVecT*) term2Start; + LookupVecT* term3StartV = (LookupVecT*) term3Start; + + for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { + LookupVecT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = + LoadStore::load(&term2StartV[i + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + LookupVecT q = + LoadStore::load(&term3StartV[i + j * blockDim.x]); + + vals[j] = Math::add(vals[j], q); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); + } + } + + // This is where we start loading the remainder that does not evenly + // fit into kUnroll x blockDim.x + int remainder = limitVec * kWordSize; + + for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { + smem[i] = Math::add(term2Start[i], term3Start[i]); + } + } else { + // Potential unaligned load + constexpr int kUnroll = 4; + + int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); + + int i = threadIdx.x; + for (; i < limit; i += kUnroll * blockDim.x) { + LookupT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = term2Start[i + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = Math::add(vals[j], term3Start[i + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + smem[i + j * blockDim.x] = vals[j]; + } + } + + for (; i < numCodes; i += blockDim.x) { + smem[i] = Math::add(term2Start[i], term3Start[i]); + } + } +} + +template +__global__ void +pqScanPrecomputedMultiPass(Tensor queries, + Tensor precompTerm1, + Tensor precompTerm2, + Tensor precompTerm3, + Tensor topQueryToCentroid, + void** listCodes, + int* listLengths, + Tensor prefixSumOffsets, + Tensor distance) { + // precomputed term 2 + 3 storage + // (sub q)(code id) + extern __shared__ char smemTerm23[]; + LookupT* term23 = (LookupT*) smemTerm23; + + // Each block handles a single query + auto queryId = blockIdx.y; + auto probeId = blockIdx.x; + auto codesPerSubQuantizer = precompTerm2.getSize(2); + auto precompTermSize = precompTerm2.getSize(1) * codesPerSubQuantizer; + + // This is where we start writing out data + // We ensure that before the array (at offset -1), there is a 0 value + int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + float* distanceOut = distance[outBase].data(); + + auto listId = topQueryToCentroid[queryId][probeId]; + // Safety guard in case NaNs in input cause no list ID to be generated + if (listId == -1) { + return; + } + + unsigned char* codeList = (unsigned char*) listCodes[listId]; + int limit = listLengths[listId]; + + constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : + (NumSubQuantizers / 4); + unsigned int code32[kNumCode32]; + unsigned int nextCode32[kNumCode32]; + + // We double-buffer the code loading, which improves memory utilization + if (threadIdx.x < limit) { + LoadCode32::load(code32, codeList, threadIdx.x); + } + + // Load precomputed terms 1, 2, 3 + float term1 = precompTerm1[queryId][probeId]; + loadPrecomputedTerm(term23, + precompTerm2[listId].data(), + precompTerm3[queryId].data(), + precompTermSize); + + // Prevent WAR dependencies + __syncthreads(); + + // Each thread handles one code element in the list, with a + // block-wide stride + for (int codeIndex = threadIdx.x; + codeIndex < limit; + codeIndex += blockDim.x) { + // Prefetch next codes + if (codeIndex + blockDim.x < limit) { + LoadCode32::load( + nextCode32, codeList, codeIndex + blockDim.x); + } + + float dist = term1; + +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + constexpr int kBytesPerCode32 = + NumSubQuantizers < 4 ? NumSubQuantizers : 4; + + if (kBytesPerCode32 == 1) { + auto code = code32[0]; + dist = ConvertTo::to(term23[code]); + + } else { +#pragma unroll + for (int byte = 0; byte < kBytesPerCode32; ++byte) { + auto code = getByte(code32[word], byte * 8, 8); + + auto offset = + codesPerSubQuantizer * (word * kBytesPerCode32 + byte); + + dist += ConvertTo::to(term23[offset + code]); + } + } + } + + // Write out intermediate distance result + // We do not maintain indices here, in order to reduce global + // memory traffic. Those are recovered in the final selection step. + distanceOut[codeIndex] = dist; + + // Rotate buffers +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + code32[word] = nextCode32[word]; + } + } +} + +void +runMultiPassTile(Tensor& queries, + Tensor& precompTerm1, + NoTypeTensor<3, true>& precompTerm2, + NoTypeTensor<3, true>& precompTerm3, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + Tensor& thrustMem, + Tensor& prefixSumOffsets, + Tensor& allDistances, + Tensor& heapDistances, + Tensor& heapIndices, + int k, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + // Calculate offset lengths, so we know where to write out + // intermediate results + runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, + thrustMem, stream); + + // Convert all codes to a distance, and write out (distance, + // index) values for all intermediate results + { + auto kThreadsPerBlock = 256; + + auto grid = dim3(topQueryToCentroid.getSize(1), + topQueryToCentroid.getSize(0)); + auto block = dim3(kThreadsPerBlock); + + // pq precomputed terms (2 + 3) + auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + + smem *= numSubQuantizers * numSubQuantizerCodes; + FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); + +#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ + do { \ + auto precompTerm2T = precompTerm2.toTensor(); \ + auto precompTerm3T = precompTerm3.toTensor(); \ + \ + pqScanPrecomputedMultiPass \ + <<>>( \ + queries, \ + precompTerm1, \ + precompTerm2T, \ + precompTerm3T, \ + topQueryToCentroid, \ + listCodes.data().get(), \ + listLengths.data().get(), \ + prefixSumOffsets, \ + allDistances); \ + } while (0) + +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + if (useFloat16Lookup) { \ + RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ + } else { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } \ + } while (0) + + switch (bytesPerCode) { + case 1: + RUN_PQ(1); + break; + case 2: + RUN_PQ(2); + break; + case 3: + RUN_PQ(3); + break; + case 4: + RUN_PQ(4); + break; + case 8: + RUN_PQ(8); + break; + case 12: + RUN_PQ(12); + break; + case 16: + RUN_PQ(16); + break; + case 20: + RUN_PQ(20); + break; + case 24: + RUN_PQ(24); + break; + case 28: + RUN_PQ(28); + break; + case 32: + RUN_PQ(32); + break; + case 40: + RUN_PQ(40); + break; + case 48: + RUN_PQ(48); + break; + case 56: + RUN_PQ(56); + break; + case 64: + RUN_PQ(64); + break; + case 96: + RUN_PQ(96); + break; + default: + FAISS_ASSERT(false); + break; + } + + CUDA_TEST_ERROR(); + +#undef RUN_PQ +#undef RUN_PQ_OPT + } + + // k-select the output in chunks, to increase parallelism + runPass1SelectLists(prefixSumOffsets, + allDistances, + topQueryToCentroid.getSize(1), + k, + false, // L2 distance chooses smallest + heapDistances, + heapIndices, + stream); + + // k-select final output + auto flatHeapDistances = heapDistances.downcastInner<2>(); + auto flatHeapIndices = heapIndices.downcastInner<2>(); + + runPass2SelectLists(flatHeapDistances, + flatHeapIndices, + listIndices, + indicesOptions, + prefixSumOffsets, + topQueryToCentroid, + k, + false, // L2 distance chooses smallest + outDistances, + outIndices, + stream); + + CUDA_TEST_ERROR(); +} + +void runPQScanMultiPassPrecomputed(Tensor& queries, + Tensor& precompTerm1, + NoTypeTensor<3, true>& precompTerm2, + NoTypeTensor<3, true>& precompTerm3, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res) { + constexpr int kMinQueryTileSize = 8; + constexpr int kMaxQueryTileSize = 128; + constexpr int kThrustMemSize = 16384; + + int nprobe = topQueryToCentroid.getSize(1); + + auto& mem = res->getMemoryManagerCurrentDevice(); + auto stream = res->getDefaultStreamCurrentDevice(); + + // Make a reservation for Thrust to do its dirty work (global memory + // cross-block reduction space); hopefully this is large enough. + DeviceTensor thrustMem1( + mem, {kThrustMemSize}, stream); + DeviceTensor thrustMem2( + mem, {kThrustMemSize}, stream); + DeviceTensor* thrustMem[2] = + {&thrustMem1, &thrustMem2}; + + // How much temporary storage is available? + // If possible, we'd like to fit within the space available. 
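+  // As in the no-precomputed path, queries are tiled so that the
+  // double-buffered temporaries fit in the available scratch memory; here no
+  // residual code-distance tables are needed, so only the prefix-sum
+  // offsets, per-code distances and first-pass k-select output are budgeted
+  // below.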
+ size_t sizeAvailable = mem.getSizeAvailable(); + + // We run two passes of heap selection + // This is the size of the first-level heap passes + constexpr int kNProbeSplit = 8; + int pass2Chunks = std::min(nprobe, kNProbeSplit); + + size_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(int)); + + // How much temporary storage we need per each query + size_t sizePerQuery = + 2 * // # streams + ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances + sizeForFirstSelectPass); + + int queryTileSize = (int) (sizeAvailable / sizePerQuery); + + if (queryTileSize < kMinQueryTileSize) { + queryTileSize = kMinQueryTileSize; + } else if (queryTileSize > kMaxQueryTileSize) { + queryTileSize = kMaxQueryTileSize; + } + + // FIXME: we should adjust queryTileSize to deal with this, since + // indexing is in int32 + FAISS_ASSERT(queryTileSize * nprobe * maxListLength <= + std::numeric_limits::max()); + + // Temporary memory buffers + // Make sure there is space prior to the start which will be 0, and + // will handle the boundary condition without branches + DeviceTensor prefixSumOffsetSpace1( + mem, {queryTileSize * nprobe + 1}, stream); + DeviceTensor prefixSumOffsetSpace2( + mem, {queryTileSize * nprobe + 1}, stream); + + DeviceTensor prefixSumOffsets1( + prefixSumOffsetSpace1[1].data(), + {queryTileSize, nprobe}); + DeviceTensor prefixSumOffsets2( + prefixSumOffsetSpace2[1].data(), + {queryTileSize, nprobe}); + DeviceTensor* prefixSumOffsets[2] = + {&prefixSumOffsets1, &prefixSumOffsets2}; + + // Make sure the element before prefixSumOffsets is 0, since we + // depend upon simple, boundary-less indexing to get proper results + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), + 0, + sizeof(int), + stream)); + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), + 0, + sizeof(int), + stream)); + + DeviceTensor allDistances1( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor allDistances2( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor* allDistances[2] = + {&allDistances1, &allDistances2}; + + DeviceTensor heapDistances1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapDistances2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapDistances[2] = + {&heapDistances1, &heapDistances2}; + + DeviceTensor heapIndices1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapIndices2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapIndices[2] = + {&heapIndices1, &heapIndices2}; + + auto streams = res->getAlternateStreamsCurrentDevice(); + streamWait(streams, {stream}); + + int curStream = 0; + + for (int query = 0; query < queries.getSize(0); query += queryTileSize) { + int numQueriesInTile = + std::min(queryTileSize, queries.getSize(0) - query); + + auto prefixSumOffsetsView = + prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); + + auto coarseIndicesView = + topQueryToCentroid.narrowOutermost(query, numQueriesInTile); + auto queryView = + queries.narrowOutermost(query, numQueriesInTile); + auto term1View = + precompTerm1.narrowOutermost(query, numQueriesInTile); + auto term3View = + precompTerm3.narrowOutermost(query, numQueriesInTile); + + auto heapDistancesView = + heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto heapIndicesView = + heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); + + auto outDistanceView = + 
outDistances.narrowOutermost(query, numQueriesInTile); + auto outIndicesView = + outIndices.narrowOutermost(query, numQueriesInTile); + + runMultiPassTile(queryView, + term1View, + precompTerm2, + term3View, + coarseIndicesView, + useFloat16Lookup, + bytesPerCode, + numSubQuantizers, + numSubQuantizerCodes, + listCodes, + listIndices, + indicesOptions, + listLengths, + *thrustMem[curStream], + prefixSumOffsetsView, + *allDistances[curStream], + heapDistancesView, + heapIndicesView, + k, + outDistanceView, + outIndicesView, + streams[curStream]); + + curStream = (curStream + 1) % 2; + } + + streamWait({stream}, streams); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh new file mode 100644 index 0000000000..ffe548b785 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh @@ -0,0 +1,41 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class GpuResources; + +void runPQScanMultiPassPrecomputed(Tensor& queries, + Tensor& precompTerm1, + NoTypeTensor<3, true>& precompTerm2, + NoTypeTensor<3, true>& precompTerm3, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.cpp b/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.cpp new file mode 100644 index 0000000000..a3df65c91c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.cpp @@ -0,0 +1,43 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include + +namespace faiss { namespace gpu { + +// Utility function to translate (list id, offset) to a user index on +// the CPU. In a cpp in order to use OpenMP +void ivfOffsetToUserIndex( + long* indices, + int numLists, + int queries, + int k, + const std::vector>& listOffsetToUserIndex) { + FAISS_ASSERT(numLists == listOffsetToUserIndex.size()); + +#pragma omp parallel for + for (int q = 0; q < queries; ++q) { + for (int r = 0; r < k; ++r) { + long offsetIndex = indices[q * k + r]; + + if (offsetIndex < 0) continue; + + int listId = (int) (offsetIndex >> 32); + int listOffset = (int) (offsetIndex & 0xffffffff); + + FAISS_ASSERT(listId < numLists); + auto& listIndices = listOffsetToUserIndex[listId]; + + FAISS_ASSERT(listOffset < listIndices.size()); + indices[q * k + r] = listIndices[listOffset]; + } + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.h b/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.h new file mode 100644 index 0000000000..234148451f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/RemapIndices.h @@ -0,0 +1,24 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +/// Utility function to translate (list id, offset) to a user index on +/// the CPU. In a cpp in order to use OpenMP. +void ivfOffsetToUserIndex( + long* indices, + int numLists, + int queries, + int k, + const std::vector>& listOffsetToUserIndex); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu new file mode 100644 index 0000000000..078e660417 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu @@ -0,0 +1,144 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include // in CUDA SDK, for CUDART_NAN_F + +namespace faiss { namespace gpu { + +template +__global__ void calcResidual(Tensor vecs, + Tensor centroids, + Tensor vecToCentroid, + Tensor residuals) { + auto vec = vecs[blockIdx.x]; + auto residual = residuals[blockIdx.x]; + + int centroidId = vecToCentroid[blockIdx.x]; + // Vector could be invalid (containing NaNs), so -1 was the + // classified centroid + if (centroidId == -1) { + if (LargeDim) { + for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + residual[i] = CUDART_NAN_F; + } + } else { + residual[threadIdx.x] = CUDART_NAN_F; + } + + return; + } + + auto centroid = centroids[centroidId]; + + if (LargeDim) { + for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + residual[i] = vec[i] - ConvertTo::to(centroid[i]); + } + } else { + residual[threadIdx.x] = vec[threadIdx.x] - + ConvertTo::to(centroid[threadIdx.x]); + } +} + +template +__global__ void gatherReconstruct(Tensor listIds, + Tensor vecs, + Tensor out) { + auto id = listIds[blockIdx.x]; + auto vec = vecs[id]; + auto outVec = out[blockIdx.x]; + + Convert conv; + + for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + outVec[i] = id == -1 ? 
0.0f : conv(vec[i]); + } +} + +template +void calcResidual(Tensor& vecs, + Tensor& centroids, + Tensor& vecToCentroid, + Tensor& residuals, + cudaStream_t stream) { + FAISS_ASSERT(vecs.getSize(1) == centroids.getSize(1)); + FAISS_ASSERT(vecs.getSize(1) == residuals.getSize(1)); + FAISS_ASSERT(vecs.getSize(0) == vecToCentroid.getSize(0)); + FAISS_ASSERT(vecs.getSize(0) == residuals.getSize(0)); + + dim3 grid(vecs.getSize(0)); + + int maxThreads = getMaxThreadsCurrentDevice(); + bool largeDim = vecs.getSize(1) > maxThreads; + dim3 block(std::min(vecs.getSize(1), maxThreads)); + + if (largeDim) { + calcResidual<<>>( + vecs, centroids, vecToCentroid, residuals); + } else { + calcResidual<<>>( + vecs, centroids, vecToCentroid, residuals); + } + + CUDA_TEST_ERROR(); +} + +template +void gatherReconstruct(Tensor& listIds, + Tensor& vecs, + Tensor& out, + cudaStream_t stream) { + FAISS_ASSERT(listIds.getSize(0) == out.getSize(0)); + FAISS_ASSERT(vecs.getSize(1) == out.getSize(1)); + + dim3 grid(listIds.getSize(0)); + + int maxThreads = getMaxThreadsCurrentDevice(); + dim3 block(std::min(vecs.getSize(1), maxThreads)); + + gatherReconstruct<<>>(listIds, vecs, out); + + CUDA_TEST_ERROR(); +} + +void runCalcResidual(Tensor& vecs, + Tensor& centroids, + Tensor& vecToCentroid, + Tensor& residuals, + cudaStream_t stream) { + calcResidual(vecs, centroids, vecToCentroid, residuals, stream); +} + +void runCalcResidual(Tensor& vecs, + Tensor& centroids, + Tensor& vecToCentroid, + Tensor& residuals, + cudaStream_t stream) { + calcResidual(vecs, centroids, vecToCentroid, residuals, stream); +} + +void runReconstruct(Tensor& listIds, + Tensor& vecs, + Tensor& out, + cudaStream_t stream) { + gatherReconstruct(listIds, vecs, out, stream); +} + +void runReconstruct(Tensor& listIds, + Tensor& vecs, + Tensor& out, + cudaStream_t stream) { + gatherReconstruct(listIds, vecs, out, stream); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh new file mode 100644 index 0000000000..ca7bcaa0b6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh @@ -0,0 +1,39 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +// Calculates residual v_i - c_j for all v_i in vecs where j = vecToCentroid[i] +void runCalcResidual(Tensor& vecs, + Tensor& centroids, + Tensor& vecToCentroid, + Tensor& residuals, + cudaStream_t stream); + +void runCalcResidual(Tensor& vecs, + Tensor& centroids, + Tensor& vecToCentroid, + Tensor& residuals, + cudaStream_t stream); + +// Gather vectors +void runReconstruct(Tensor& listIds, + Tensor& vecs, + Tensor& out, + cudaStream_t stream); + +void runReconstruct(Tensor& listIds, + Tensor& vecs, + Tensor& out, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper-inl.h b/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper-inl.h new file mode 100644 index 0000000000..90eb629509 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper-inl.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include + +namespace faiss { namespace gpu { + +template +IndexWrapper::IndexWrapper( + int numGpus, + std::function(GpuResources*, int)> init) { + FAISS_ASSERT(numGpus <= faiss::gpu::getNumDevices()); + for (int i = 0; i < numGpus; ++i) { + auto res = std::unique_ptr( + new StandardGpuResources); + + subIndex.emplace_back(init(res.get(), i)); + resources.emplace_back(std::move(res)); + } + + if (numGpus > 1) { + // create proxy + replicaIndex = + std::unique_ptr(new faiss::IndexReplicas); + + for (auto& index : subIndex) { + replicaIndex->addIndex(index.get()); + } + } +} + +template +faiss::Index* +IndexWrapper::getIndex() { + if ((bool) replicaIndex) { + return replicaIndex.get(); + } else { + FAISS_ASSERT(!subIndex.empty()); + return subIndex.front().get(); + } +} + +template +void +IndexWrapper::runOnIndices(std::function f) { + + if ((bool) replicaIndex) { + replicaIndex->runOnIndex( + [f](int, faiss::Index* index) { + f(dynamic_cast(index)); + }); + } else { + FAISS_ASSERT(!subIndex.empty()); + f(subIndex.front().get()); + } +} + +template +void +IndexWrapper::setNumProbes(int nprobe) { + runOnIndices([nprobe](GpuIndex* index) { + index->setNumProbes(nprobe); + }); +} + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper.h b/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper.h new file mode 100644 index 0000000000..df36255a26 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/IndexWrapper.h @@ -0,0 +1,39 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// If we want to run multi-GPU, create a proxy to wrap the indices. +// If we don't want multi-GPU, don't involve the proxy, so it doesn't +// affect the timings. +template +struct IndexWrapper { + std::vector> resources; + std::vector> subIndex; + std::unique_ptr replicaIndex; + + IndexWrapper( + int numGpus, + std::function(GpuResources*, int)> init); + faiss::Index* getIndex(); + + void runOnIndices(std::function f); + void setNumProbes(int nprobe); +}; + +} } + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfBinaryFlat.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfBinaryFlat.cu new file mode 100644 index 0000000000..3e921c50da --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfBinaryFlat.cu @@ -0,0 +1,125 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_int32(k, 3, "final number of closest results returned"); +DEFINE_int32(num, 128, "# of vecs"); +DEFINE_int32(dim, 128, "# of dimensions"); +DEFINE_int32(num_queries, 3, "number of query vectors"); +DEFINE_int64(seed, -1, "specify random seed"); +DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); +DEFINE_bool(cpu, true, "run the CPU code for timing and comparison"); +DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index"); + +using namespace faiss::gpu; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + cudaProfilerStop(); + + auto seed = FLAGS_seed != -1L ? 
FLAGS_seed : time(nullptr); + printf("using seed %ld\n", seed); + + auto numQueries = FLAGS_num_queries; + + auto index = std::unique_ptr( + new faiss::IndexBinaryFlat(FLAGS_dim)); + + HostTensor vecs({FLAGS_num, FLAGS_dim / 8}); + faiss::byte_rand(vecs.data(), vecs.numElements(), seed); + + index->add(FLAGS_num, vecs.data()); + + printf("Database: dim %d num vecs %d\n", FLAGS_dim, FLAGS_num); + printf("Hamming lookup: %d queries, total k %d\n", + numQueries, FLAGS_k); + + // Convert to GPU index + printf("Copying index to GPU...\n"); + + GpuIndexBinaryFlatConfig config; + config.memorySpace = FLAGS_use_unified_mem ? + MemorySpace::Unified : MemorySpace::Device; + + faiss::gpu::StandardGpuResources res; + + faiss::gpu::GpuIndexBinaryFlat gpuIndex(&res, + index.get(), + config); + printf("copy done\n"); + + // Build query vectors + HostTensor cpuQuery({numQueries, FLAGS_dim / 8}); + faiss::byte_rand(cpuQuery.data(), cpuQuery.numElements(), seed); + + // Time faiss CPU + HostTensor + cpuDistances({numQueries, FLAGS_k}); + HostTensor + cpuIndices({numQueries, FLAGS_k}); + + if (FLAGS_cpu) { + float cpuTime = 0.0f; + + CpuTimer timer; + index->search(numQueries, + cpuQuery.data(), + FLAGS_k, + cpuDistances.data(), + cpuIndices.data()); + + cpuTime = timer.elapsedMilliseconds(); + printf("CPU time %.3f ms\n", cpuTime); + } + + HostTensor gpuDistances({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); + + CUDA_VERIFY(cudaProfilerStart()); + faiss::gpu::synchronizeAllDevices(); + + float gpuTime = 0.0f; + + // Time GPU + { + CpuTimer timer; + + gpuIndex.search(cpuQuery.getSize(0), + cpuQuery.data(), + FLAGS_k, + gpuDistances.data(), + gpuIndices.data()); + + // There is a device -> host copy above, so no need to time + // additional synchronization with the GPU + gpuTime = timer.elapsedMilliseconds(); + } + + CUDA_VERIFY(cudaProfilerStop()); + printf("GPU time %.3f ms\n", gpuTime); + + CUDA_VERIFY(cudaDeviceSynchronize()); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfClustering.cpp b/core/src/index/thirdparty/faiss/gpu/perf/PerfClustering.cpp new file mode 100644 index 0000000000..6171e77926 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfClustering.cpp @@ -0,0 +1,115 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_int32(num, 10000, "# of vecs"); +DEFINE_int32(k, 100, "# of clusters"); +DEFINE_int32(dim, 128, "# of dimensions"); +DEFINE_int32(niter, 10, "# of iterations"); +DEFINE_bool(L2_metric, true, "If true, use L2 metric. If false, use IP metric"); +DEFINE_bool(use_float16, false, "use float16 vectors and math"); +DEFINE_bool(transposed, false, "transposed vector storage"); +DEFINE_bool(verbose, false, "turn on clustering logging"); +DEFINE_int64(seed, -1, "specify random seed"); +DEFINE_int32(num_gpus, 1, "number of gpus to use"); +DEFINE_int64(min_paging_size, -1, "minimum size to use CPU -> GPU paged copies"); +DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use"); +DEFINE_int32(max_points, -1, "max points per centroid"); + +using namespace faiss::gpu; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + cudaProfilerStop(); + + auto seed = FLAGS_seed != -1L ? 
FLAGS_seed : time(nullptr); + printf("using seed %ld\n", seed); + + std::vector vecs((size_t) FLAGS_num * FLAGS_dim); + faiss::float_rand(vecs.data(), vecs.size(), seed); + + printf("K-means metric %s dim %d centroids %d num train %d niter %d\n", + FLAGS_L2_metric ? "L2" : "IP", + FLAGS_dim, FLAGS_k, FLAGS_num, FLAGS_niter); + printf("float16 math %s\n", FLAGS_use_float16 ? "enabled" : "disabled"); + printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled"); + printf("verbose %s\n", FLAGS_verbose ? "enabled" : "disabled"); + + auto initFn = [](faiss::gpu::GpuResources* res, int dev) -> + std::unique_ptr { + if (FLAGS_pinned_mem >= 0) { + ((faiss::gpu::StandardGpuResources*) res)->setPinnedMemory( + FLAGS_pinned_mem); + } + + GpuIndexFlatConfig config; + config.device = dev; + config.useFloat16 = FLAGS_use_float16; + config.storeTransposed = FLAGS_transposed; + + auto p = std::unique_ptr( + FLAGS_L2_metric ? + (faiss::gpu::GpuIndexFlat*) + new faiss::gpu::GpuIndexFlatL2(res, FLAGS_dim, config) : + (faiss::gpu::GpuIndexFlat*) + new faiss::gpu::GpuIndexFlatIP(res, FLAGS_dim, config)); + + if (FLAGS_min_paging_size >= 0) { + p->setMinPagingSize(FLAGS_min_paging_size); + } + return p; + }; + + IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); + + CUDA_VERIFY(cudaProfilerStart()); + faiss::gpu::synchronizeAllDevices(); + + float gpuTime = 0.0f; + + faiss::ClusteringParameters cp; + cp.niter = FLAGS_niter; + cp.verbose = FLAGS_verbose; + + if (FLAGS_max_points > 0) { + cp.max_points_per_centroid = FLAGS_max_points; + } + + faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp); + + // Time k-means + { + CpuTimer timer; + + kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex())); + + // There is a device -> host copy above, so no need to time + // additional synchronization with the GPU + gpuTime = timer.elapsedMilliseconds(); + } + + CUDA_VERIFY(cudaProfilerStop()); + printf("k-means time %.3f ms\n", gpuTime); + + CUDA_VERIFY(cudaDeviceSynchronize()); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu new file mode 100644 index 0000000000..3b0e36ba13 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu @@ -0,0 +1,150 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_bool(l2, true, "L2 or inner product"); +DEFINE_int32(k, 3, "final number of closest results returned"); +DEFINE_int32(num, 128, "# of vecs"); +DEFINE_int32(dim, 128, "# of dimensions"); +DEFINE_int32(num_queries, 3, "number of query vectors"); +DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); +DEFINE_bool(use_float16, false, "use encodings in float16"); +DEFINE_bool(use_float16_math, false, "perform math in float16"); +DEFINE_bool(transposed, false, "store vectors transposed"); +DEFINE_int64(seed, -1, "specify random seed"); +DEFINE_int32(num_gpus, 1, "number of gpus to use"); +DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); +DEFINE_bool(cpu, true, "run the CPU code for timing and comparison"); +DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index"); + +using namespace faiss::gpu; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + cudaProfilerStop(); + + auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr); + printf("using seed %ld\n", seed); + + auto numQueries = FLAGS_num_queries; + + auto index = std::unique_ptr( + new faiss::IndexFlat(FLAGS_dim, FLAGS_l2 ? + faiss::METRIC_L2 : faiss::METRIC_INNER_PRODUCT)); + + HostTensor vecs({FLAGS_num, FLAGS_dim}); + faiss::float_rand(vecs.data(), vecs.numElements(), seed); + + index->add(FLAGS_num, vecs.data()); + + printf("Database: dim %d num vecs %d\n", FLAGS_dim, FLAGS_num); + printf("%s lookup: %d queries, total k %d\n", + FLAGS_l2 ? "L2" : "IP", + numQueries, FLAGS_k); + printf("float16 encoding %s\n", FLAGS_use_float16 ? "enabled" : "disabled"); + printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled"); + + // Convert to GPU index + printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus); + + auto initFn = [&index](faiss::gpu::GpuResources* res, int dev) -> + std::unique_ptr { + ((faiss::gpu::StandardGpuResources*) res)->setPinnedMemory( + FLAGS_pinned_mem); + + GpuIndexFlatConfig config; + config.device = dev; + config.useFloat16 = FLAGS_use_float16; + config.useFloat16Accumulator = FLAGS_use_float16_math; + config.storeTransposed = FLAGS_transposed; + config.memorySpace = FLAGS_use_unified_mem ? 
+ MemorySpace::Unified : MemorySpace::Device; + + auto p = std::unique_ptr( + new faiss::gpu::GpuIndexFlat(res, index.get(), config)); + return p; + }; + + IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); + printf("copy done\n"); + + // Build query vectors + HostTensor cpuQuery({numQueries, FLAGS_dim}); + faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed); + + // Time faiss CPU + HostTensor cpuDistances({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); + + if (FLAGS_cpu) { + float cpuTime = 0.0f; + + CpuTimer timer; + index->search(numQueries, + cpuQuery.data(), + FLAGS_k, + cpuDistances.data(), + cpuIndices.data()); + + cpuTime = timer.elapsedMilliseconds(); + printf("CPU time %.3f ms\n", cpuTime); + } + + HostTensor gpuDistances({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); + + CUDA_VERIFY(cudaProfilerStart()); + faiss::gpu::synchronizeAllDevices(); + + float gpuTime = 0.0f; + + // Time GPU + { + CpuTimer timer; + + gpuIndex.getIndex()->search(cpuQuery.getSize(0), + cpuQuery.data(), + FLAGS_k, + gpuDistances.data(), + gpuIndices.data()); + + // There is a device -> host copy above, so no need to time + // additional synchronization with the GPU + gpuTime = timer.elapsedMilliseconds(); + } + + CUDA_VERIFY(cudaProfilerStop()); + printf("GPU time %.3f ms\n", gpuTime); + + if (FLAGS_cpu) { + compareLists(cpuDistances.data(), cpuIndices.data(), + gpuDistances.data(), gpuIndices.data(), + numQueries, FLAGS_k, + "", true, FLAGS_diff, false); + } + + CUDA_VERIFY(cudaDeviceSynchronize()); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFFlat.cu new file mode 100644 index 0000000000..8b51b90ecf --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFFlat.cu @@ -0,0 +1,146 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_int32(nprobe, 5, "number of coarse centroids to probe"); +DEFINE_int32(k, 3, "final number of closest results returned"); +DEFINE_int32(num_queries, 3, "number of query vectors"); +DEFINE_string(in, "/home/jhj/local/index.out", "index file for input"); +DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); +DEFINE_bool(use_float16_coarse, false, "coarse quantizer in float16"); +DEFINE_int64(seed, -1, "specify random seed"); +DEFINE_int32(num_gpus, 1, "number of gpus to use"); +DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU"); + +using namespace faiss::gpu; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + cudaProfilerStop(); + + auto seed = FLAGS_seed != -1L ? 
FLAGS_seed : time(nullptr); + printf("using seed %ld\n", seed); + + auto numQueries = FLAGS_num_queries; + + auto index = std::unique_ptr( + dynamic_cast(faiss::read_index(FLAGS_in.c_str()))); + FAISS_ASSERT((bool) index); + index->nprobe = FLAGS_nprobe; + + auto dim = index->d; + + printf("Database: dim %d num vecs %ld\n", dim, index->ntotal); + printf("Coarse centroids: %ld\n", index->quantizer->ntotal); + printf("L2 lookup: %d queries, nprobe %d, total k %d\n", + numQueries, FLAGS_nprobe, FLAGS_k); + printf("float16 coarse quantizer %s\n", + FLAGS_use_float16_coarse ? "enabled" : "disabled"); + + // Convert to GPU index + printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus); + + auto initFn = [&index](faiss::gpu::GpuResources* res, int dev) -> + std::unique_ptr { + GpuIndexIVFFlatConfig config; + config.device = dev; + config.indicesOptions = (faiss::gpu::IndicesOptions) FLAGS_index; + config.flatConfig.useFloat16 = FLAGS_use_float16_coarse; + + auto p = std::unique_ptr( + new faiss::gpu::GpuIndexIVFFlat(res, + index->d, + index->nlist, + index->metric_type, + config)); + p->copyFrom(index.get()); + return p; + }; + + IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); + gpuIndex.setNumProbes(FLAGS_nprobe); + printf("copy done\n"); + + // Build query vectors + HostTensor cpuQuery({numQueries, dim}); + faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed); + + // Time faiss CPU + HostTensor cpuDistances({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); + + float cpuTime = 0.0f; + + { + CpuTimer timer; + index->search(numQueries, + cpuQuery.data(), + FLAGS_k, + cpuDistances.data(), + cpuIndices.data()); + + cpuTime = timer.elapsedMilliseconds(); + } + + printf("CPU time %.3f ms\n", cpuTime); + + HostTensor gpuDistances({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); + + CUDA_VERIFY(cudaProfilerStart()); + faiss::gpu::synchronizeAllDevices(); + + float gpuTime = 0.0f; + + // Time GPU + { + CpuTimer timer; + + gpuIndex.getIndex()->search(cpuQuery.getSize(0), + cpuQuery.data(), + FLAGS_k, + gpuDistances.data(), + gpuIndices.data()); + + // There is a device -> host copy above, so no need to time + // additional synchronization with the GPU + gpuTime = timer.elapsedMilliseconds(); + } + + CUDA_VERIFY(cudaProfilerStop()); + printf("GPU time %.3f ms\n", gpuTime); + + compareLists(cpuDistances.data(), cpuIndices.data(), + gpuDistances.data(), gpuIndices.data(), + numQueries, FLAGS_k, + "", true, FLAGS_diff, false); + + CUDA_VERIFY(cudaDeviceSynchronize()); + // printf("\ncudaMalloc usage %zd\n", + // resources.getMemoryManager().getHighWaterCudaMalloc()); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQ.cu new file mode 100644 index 0000000000..82eb648a1f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQ.cu @@ -0,0 +1,157 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +DEFINE_int32(nprobe, 5, "number of coarse centroids to probe"); +DEFINE_int32(k, 3, "final number of closest results returned"); +DEFINE_int32(num_queries, 3, "number of query vectors"); +DEFINE_string(in, "/home/jhj/local/index.out", "index file for input"); +DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); +DEFINE_bool(use_precomputed, true, "enable or disable precomputed codes"); +DEFINE_bool(float16_lookup, false, "use float16 residual distance tables"); +DEFINE_int64(seed, -1, "specify random seed"); +DEFINE_int32(num_gpus, 1, "number of gpus to use"); +DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU"); + +using namespace faiss::gpu; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + CUDA_VERIFY(cudaProfilerStop()); + + auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr); + printf("using seed %ld\n", seed); + + auto numQueries = FLAGS_num_queries; + + auto index = std::unique_ptr( + dynamic_cast(faiss::read_index(FLAGS_in.c_str()))); + FAISS_ASSERT((bool) index); + index->nprobe = FLAGS_nprobe; + + if (!FLAGS_use_precomputed) { + index->use_precomputed_table = 0; + } + + auto dim = index->d; + auto codes = index->pq.M; + auto bitsPerCode = index->pq.nbits; + + printf("Database: dim %d num vecs %ld\n", dim, index->ntotal); + printf("Coarse centroids: %ld\n", index->quantizer->ntotal); + printf("PQ centroids: codes %ld bits per code %ld\n", codes, bitsPerCode); + printf("L2 lookup: %d queries, nprobe %d, total k %d, " + "precomputed codes %d\n\n", + numQueries, FLAGS_nprobe, FLAGS_k, + FLAGS_use_precomputed); + + // Convert to GPU index + printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus); + + auto precomp = FLAGS_use_precomputed; + auto indicesOpt = (faiss::gpu::IndicesOptions) FLAGS_index; + auto useFloat16Lookup = FLAGS_float16_lookup; + + auto initFn = [precomp, indicesOpt, useFloat16Lookup, &index] + (faiss::gpu::GpuResources* res, int dev) -> + std::unique_ptr { + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = dev; + config.usePrecomputedTables = precomp; + config.indicesOptions = indicesOpt; + config.useFloat16LookupTables = useFloat16Lookup; + + auto p = std::unique_ptr( + new faiss::gpu::GpuIndexIVFPQ(res, index.get(), config)); + + return p; + }; + + IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); + gpuIndex.setNumProbes(FLAGS_nprobe); + printf("copy done\n"); + + // Build query vectors + HostTensor cpuQuery({numQueries, dim}); + faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed); + + // Time faiss CPU + HostTensor cpuDistances({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); + + float cpuTime = 0.0f; + + { + CpuTimer timer; + index->search(numQueries, + cpuQuery.data(), + FLAGS_k, + cpuDistances.data(), + cpuIndices.data()); + + cpuTime = timer.elapsedMilliseconds(); + } + + printf("CPU time %.3f ms\n", cpuTime); + + HostTensor gpuDistances({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); + + CUDA_VERIFY(cudaProfilerStart()); + faiss::gpu::synchronizeAllDevices(); + + float gpuTime = 0.0f; + + // Time GPU + { + CpuTimer timer; + + gpuIndex.getIndex()->search(cpuQuery.getSize(0), + cpuQuery.data(), + FLAGS_k, + gpuDistances.data(), + gpuIndices.data()); + + // There is a device -> host copy above, so no need to time + // additional 
synchronization with the GPU + gpuTime = timer.elapsedMilliseconds(); + } + + CUDA_VERIFY(cudaProfilerStop()); + printf("GPU time %.3f ms\n", gpuTime); + + compareLists(cpuDistances.data(), cpuIndices.data(), + gpuDistances.data(), gpuIndices.data(), + numQueries, FLAGS_k, + "", true, FLAGS_diff, false); + + CUDA_VERIFY(cudaDeviceSynchronize()); + // printf("\ncudaMalloc usage %zd\n", + // resources.getMemoryManager().getHighWaterCudaMalloc()); + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQAdd.cpp b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQAdd.cpp new file mode 100644 index 0000000000..1e45d635a5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfIVFPQAdd.cpp @@ -0,0 +1,139 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_int32(batches, 10, "number of batches of vectors to add"); +DEFINE_int32(batch_size, 10000, "number of vectors in each batch"); +DEFINE_int32(dim, 256, "dimension of vectors"); +DEFINE_int32(centroids, 4096, "num coarse centroids to use"); +DEFINE_int32(bytes_per_vec, 32, "bytes per encoded vector"); +DEFINE_int32(bits_per_code, 8, "bits per PQ code"); +DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU"); +DEFINE_bool(time_gpu, true, "time add to GPU"); +DEFINE_bool(time_cpu, false, "time add to CPU"); +DEFINE_bool(per_batch_time, false, "print per-batch times"); +DEFINE_bool(reserve_memory, false, "whether or not to pre-reserve memory"); + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + cudaProfilerStop(); + + int dim = FLAGS_dim; + int numCentroids = FLAGS_centroids; + int bytesPerVec = FLAGS_bytes_per_vec; + int bitsPerCode = FLAGS_bits_per_code; + + faiss::gpu::StandardGpuResources res; + + // IndexIVFPQ will complain, but just give us enough to get through this + int numTrain = 4 * numCentroids; + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + + faiss::IndexFlatL2 coarseQuantizer(dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, dim, numCentroids, + bytesPerVec, bitsPerCode); + if (FLAGS_time_cpu) { + cpuIndex.train(numTrain, trainVecs.data()); + } + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = 0; + config.indicesOptions = (faiss::gpu::IndicesOptions) FLAGS_index; + + faiss::gpu::GpuIndexIVFPQ gpuIndex( + &res, dim, numCentroids, bytesPerVec, bitsPerCode, + faiss::METRIC_L2, config); + + if (FLAGS_time_gpu) { + gpuIndex.train(numTrain, trainVecs.data()); + if (FLAGS_reserve_memory) { + size_t numVecs = (size_t) FLAGS_batches * (size_t) FLAGS_batch_size; + gpuIndex.reserveMemory(numVecs); + } + } + + cudaDeviceSynchronize(); + CUDA_VERIFY(cudaProfilerStart()); + + float totalGpuTime = 0.0f; + float totalCpuTime = 0.0f; + + for (int i = 0; i < FLAGS_batches; ++i) { + if (!FLAGS_per_batch_time) { + if (i % 10 == 0) { + printf("Adding batch %d\n", i + 1); + } + } + + auto addVecs = faiss::gpu::randVecs(FLAGS_batch_size, dim); + + if (FLAGS_time_gpu) { + faiss::gpu::CpuTimer timer; + gpuIndex.add(FLAGS_batch_size, addVecs.data()); + CUDA_VERIFY(cudaDeviceSynchronize()); + auto time = timer.elapsedMilliseconds(); + + totalGpuTime += time; + + if (FLAGS_per_batch_time) { + printf("Batch %d | GPU time to add %d vecs: %.3f ms (%.5f ms per)\n", + i 
+ 1, FLAGS_batch_size, time, time / (float) FLAGS_batch_size); + } + } + + if (FLAGS_time_cpu) { + faiss::gpu::CpuTimer timer; + cpuIndex.add(FLAGS_batch_size, addVecs.data()); + auto time = timer.elapsedMilliseconds(); + + totalCpuTime += time; + + if (FLAGS_per_batch_time) { + printf("Batch %d | CPU time to add %d vecs: %.3f ms (%.5f ms per)\n", + i + 1, FLAGS_batch_size, time, time / (float) FLAGS_batch_size); + } + } + } + + CUDA_VERIFY(cudaProfilerStop()); + + int total = FLAGS_batch_size * FLAGS_batches; + + if (FLAGS_time_gpu) { + printf("%d dim, %d centroids, %d x %d encoding\n" + "GPU time to add %d vectors (%d batches, %d per batch): " + "%.3f ms (%.3f us per)\n", + dim, numCentroids, bytesPerVec, bitsPerCode, + total, FLAGS_batches, FLAGS_batch_size, + totalGpuTime, totalGpuTime * 1000.0f / (float) total); + } + + if (FLAGS_time_cpu) { + printf("%d dim, %d centroids, %d x %d encoding\n" + "CPU time to add %d vectors (%d batches, %d per batch): " + "%.3f ms (%.3f us per)\n", + dim, numCentroids, bytesPerVec, bitsPerCode, + total, FLAGS_batches, FLAGS_batch_size, + totalCpuTime, totalCpuTime * 1000.0f / (float) total); + } + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu new file mode 100644 index 0000000000..890fe5fb1e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu @@ -0,0 +1,70 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_int32(rows, 10000, "rows in matrix"); +DEFINE_int32(cols, 40000, "cols in matrix"); +DEFINE_int32(k, 100, "k"); +DEFINE_bool(dir, false, "direction of sort"); +DEFINE_bool(warp, false, "warp select"); +DEFINE_int32(iter, 5, "iterations to run"); +DEFINE_bool(k_powers, false, "test k powers of 2 from 1 -> max k"); + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + std::vector v = faiss::gpu::randVecs(FLAGS_rows, FLAGS_cols); + faiss::gpu::HostTensor hostVal({FLAGS_rows, FLAGS_cols}); + + for (int r = 0; r < FLAGS_rows; ++r) { + for (int c = 0; c < FLAGS_cols; ++c) { + hostVal[r][c] = v[r * FLAGS_cols + c]; + } + } + + // Select top-k on GPU + faiss::gpu::DeviceTensor gpuVal(hostVal, 0); + + int startK = FLAGS_k; + int limitK = FLAGS_k; + + if (FLAGS_k_powers) { + startK = 1; + limitK = GPU_MAX_SELECTION_K; + } + + for (int k = startK; k <= limitK; k *= 2) { + faiss::gpu::DeviceTensor gpuOutVal({FLAGS_rows, k}); + faiss::gpu::DeviceTensor gpuOutInd({FLAGS_rows, k}); + + for (int i = 0; i < FLAGS_iter; ++i) { + if (FLAGS_warp) { + faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, + FLAGS_dir, k, 0); + } else { + faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, + FLAGS_dir, k, 0); + } + } + } + + cudaDeviceSynchronize(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/WriteIndex.cpp b/core/src/index/thirdparty/faiss/gpu/perf/WriteIndex.cpp new file mode 100644 index 0000000000..af363787a9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/WriteIndex.cpp @@ -0,0 +1,102 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include + +// For IVFPQ: +DEFINE_bool(ivfpq, false, "use IVFPQ encoding"); +DEFINE_int32(codes, 4, "number of PQ codes per vector"); +DEFINE_int32(bits_per_code, 8, "number of bits per PQ code"); + +// For IVFFlat: +DEFINE_bool(l2, true, "use L2 metric (versus IP metric)"); +DEFINE_bool(ivfflat, false, "use IVF flat encoding"); + +// For both: +DEFINE_string(out, "/home/jhj/local/index.out", "index file for output"); +DEFINE_int32(dim, 128, "vector dimension"); +DEFINE_int32(num_coarse, 100, "number of coarse centroids"); +DEFINE_int32(num, 100000, "total database size"); +DEFINE_int32(num_train, -1, "number of database vecs to train on"); + +template +void fillAndSave(T& index, int numTrain, int num, int dim) { + auto trainVecs = faiss::gpu::randVecs(numTrain, dim); + index.train(numTrain, trainVecs.data()); + + constexpr int kAddChunk = 1000000; + + for (int i = 0; i < num; i += kAddChunk) { + int numRemaining = (num - i) < kAddChunk ? (num - i) : kAddChunk; + auto vecs = faiss::gpu::randVecs(numRemaining, dim); + + printf("adding at %d: %d\n", i, numRemaining); + index.add(numRemaining, vecs.data()); + } + + faiss::write_index(&index, FLAGS_out.c_str()); +} + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Either ivfpq or ivfflat must be set + if ((FLAGS_ivfpq && FLAGS_ivfflat) || + (!FLAGS_ivfpq && !FLAGS_ivfflat)) { + printf("must specify either ivfpq or ivfflat\n"); + return 1; + } + + auto dim = FLAGS_dim; + auto numCentroids = FLAGS_num_coarse; + auto num = FLAGS_num; + auto numTrain = FLAGS_num_train; + numTrain = numTrain == -1 ? std::max((num / 4), 1) : numTrain; + numTrain = std::min(num, numTrain); + + if (FLAGS_ivfpq) { + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFPQ index(&quantizer, dim, numCentroids, + FLAGS_codes, FLAGS_bits_per_code); + index.verbose = true; + + printf("IVFPQ: codes %d bits per code %d\n", + FLAGS_codes, FLAGS_bits_per_code); + printf("Lists: %d\n", numCentroids); + printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain); + printf("output file: %s\n", FLAGS_out.c_str()); + + fillAndSave(index, numTrain, num, dim); + } else if (FLAGS_ivfflat) { + faiss::IndexFlatL2 quantizerL2(dim); + faiss::IndexFlatIP quantizerIP(dim); + + faiss::IndexFlat* quantizer = FLAGS_l2 ? + (faiss::IndexFlat*) &quantizerL2 : + (faiss::IndexFlat*) &quantizerIP; + + faiss::IndexIVFFlat index(quantizer, dim, numCentroids, + FLAGS_l2 ? faiss::METRIC_L2 : + faiss::METRIC_INNER_PRODUCT); + + printf("IVFFlat: metric %s\n", FLAGS_l2 ? "L2" : "IP"); + printf("Lists: %d\n", numCentroids); + printf("Database: dim %d num vecs %d trained on %d\n", dim, num, numTrain); + printf("output file: %s\n", FLAGS_out.c_str()); + + fillAndSave(index, numTrain, num, dim); + } + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/perf/slow.py b/core/src/index/thirdparty/faiss/gpu/perf/slow.py new file mode 100644 index 0000000000..a096311c4e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/perf/slow.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python3 +# this is a slow computation to test whether ctrl-C handling works +import faiss +import numpy as np + +def test_slow(): + d = 256 + index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), + 0, faiss.IndexFlatL2(d)) + x = np.random.rand(10 ** 6, d).astype('float32') + print('add') + index.add(x) + print('search') + index.search(x, 10) + print('done') + + +if __name__ == '__main__': + test_slow() diff --git a/core/src/index/thirdparty/faiss/gpu/test/Makefile b/core/src/index/thirdparty/faiss/gpu/test/Makefile new file mode 100644 index 0000000000..6836314810 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/Makefile @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include ../../makefile.inc + +TESTS_SRC = TestGpuIndexFlat.cpp TestGpuIndexIVFPQ.cpp \ +TestGpuIndexBinaryFlat.cpp TestGpuIndexIVFFlat.cpp TestGpuMemoryException.cpp +CUDA_TESTS_SRC = TestGpuSelect.cu + +TESTS_OBJ = $(TESTS_SRC:.cpp=.o) +CUDA_TESTS_OBJ = $(CUDA_TESTS_SRC:.cu=.o) + +TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=) + + +# test_gpu_index.py test_pytorch_faiss.py + +run: $(TESTS_BIN) $(CUDA_TESTS_BIN) + for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done + +$(CUDA_TESTS_OBJ): %.o: %.cu gtest + $(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include + +$(TESTS_OBJ): %.o: %.cpp gtest + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include + +$(TESTS_BIN): %: %.o TestUtils.o ../../libfaiss.a gtest/make/gtest.a + $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS) + +demo_ivfpq_indexing_gpu: demo_ivfpq_indexing_gpu.o ../../libfaiss.a + $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS) + +demo_ivfpq_indexing_gpu.o: demo_ivfpq_indexing_gpu.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ + +gtest/make/gtest.a: gtest + $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest.a + +gtest: + curl -L https://github.com/google/googletest/archive/release-1.8.0.tar.gz | tar xz && \ + mv googletest-release-1.8.0/googletest gtest && \ + rm -rf googletest-release-1.8.0 + +clean: + rm -f *.o $(TESTS_BIN) + rm -rf gtest + rm -f demo_ivfpq_indexing_gpu + +.PHONY: clean run diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu new file mode 100644 index 0000000000..a287ef8444 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu @@ -0,0 +1,135 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void testTransposition(bool colMajorVecs, + bool colMajorQueries, + faiss::MetricType metric) { + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int dim = faiss::gpu::randVal(20, 150); + int numVecs = faiss::gpu::randVal(10, 30000); + int numQuery = faiss::gpu::randVal(1, 1024); + int k = faiss::gpu::randVal(20, 70); + + // Input data for CPU + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + std::vector queries = faiss::gpu::randVecs(numQuery, dim); + + // The CPU index is our reference for the results + faiss::IndexFlatL2 cpuIndexL2(dim); + cpuIndexL2.add(numVecs, vecs.data()); + + std::vector cpuDistanceL2(numQuery * k, 0); + std::vector cpuIndicesL2(numQuery * k, -1); + + cpuIndexL2.search(numQuery, queries.data(), k, + cpuDistanceL2.data(), cpuIndicesL2.data()); + + faiss::IndexFlatIP cpuIndexIP(dim); + cpuIndexIP.add(numVecs, vecs.data()); + + std::vector cpuDistanceIP(numQuery * k, 0); + std::vector cpuIndicesIP(numQuery * k, -1); + + cpuIndexIP.search(numQuery, queries.data(), k, + cpuDistanceIP.data(), cpuIndicesIP.data()); + + // The transpose and distance code assumes the desired device is already set + faiss::gpu::DeviceScope scope(device); + auto stream = res.getDefaultStream(device); + + // Copy input data to GPU, and pre-transpose both vectors and queries for + // passing + auto gpuVecs = faiss::gpu::toDevice( + nullptr, device, vecs.data(), stream, {numVecs, dim}); + auto gpuQueries = faiss::gpu::toDevice( + nullptr, device, queries.data(), stream, {numQuery, dim}); + + faiss::gpu::DeviceTensor vecsT({dim, numVecs}); + faiss::gpu::runTransposeAny(gpuVecs, 0, 1, vecsT, stream); + + faiss::gpu::DeviceTensor queriesT({dim, numQuery}); + faiss::gpu::runTransposeAny(gpuQueries, 0, 1, queriesT, stream); + + std::vector gpuDistance(numQuery * k, 0); + std::vector gpuIndices(numQuery * k, -1); + + faiss::gpu::bruteForceKnn( + &res, + metric, + colMajorVecs ? vecsT.data() : gpuVecs.data(), + !colMajorVecs, + numVecs, + colMajorQueries ? queriesT.data() : gpuQueries.data(), + !colMajorQueries, + numQuery, + dim, + k, + gpuDistance.data(), + gpuIndices.data()); + + std::stringstream str; + str << "metric " << metric + << " colMajorVecs " << colMajorVecs + << " colMajorQueries " << colMajorQueries; + + faiss::gpu::compareLists(metric == faiss::MetricType::METRIC_L2 ? + cpuDistanceL2.data() : cpuDistanceIP.data(), + metric == faiss::MetricType::METRIC_L2 ? 
+ cpuIndicesL2.data() : cpuIndicesIP.data(), + gpuDistance.data(), + gpuIndices.data(), + numQuery, k, + str.str(), + false, false, true, + 6e-3f, 0.1f, 0.015f); +} + +// Test different memory layouts for brute-force k-NN +TEST(TestGpuDistance, Transposition_RR) { + testTransposition(false, false, faiss::MetricType::METRIC_L2); +// testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT); +} + +TEST(TestGpuDistance, Transposition_RC) { + testTransposition(false, true, faiss::MetricType::METRIC_L2); +// testTransposition(false, true, faiss::MetricType::METRIC_INNER_PRODUCT); +} + +TEST(TestGpuDistance, Transposition_CR) { + testTransposition(true, false, faiss::MetricType::METRIC_L2); +// testTransposition(true, false, faiss::MetricType::METRIC_INNER_PRODUCT); +} + +TEST(TestGpuDistance, Transposition_CC) { + testTransposition(true, true, faiss::MetricType::METRIC_L2); +// testTransposition(true, true, faiss::MetricType::METRIC_INNER_PRODUCT); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp new file mode 100644 index 0000000000..14c28c155a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp @@ -0,0 +1,130 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void compareBinaryDist(const std::vector& cpuDist, + const std::vector& cpuLabels, + const std::vector& gpuDist, + const std::vector& gpuLabels, + int numQuery, + int k) { + for (int i = 0; i < numQuery; ++i) { + // The index order can be permuted within a group that has the same + // distance, since this is based on the order in which the algorithm + // encounters the values. The last set of equivalent distances seen in the + // min-k might be truncated, so we can't check that set, but all others we + // can check. + std::set cpuLabelSet; + std::set gpuLabelSet; + + int curDist = -1; + + for (int j = 0; j < k; ++j) { + int idx = i * k + j; + + if (curDist == -1) { + curDist = cpuDist[idx]; + } + + if (curDist != cpuDist[idx]) { + // Distances must be monotonically increasing + EXPECT_LT(curDist, cpuDist[idx]); + + // This is a new set of distances + EXPECT_EQ(cpuLabelSet, gpuLabelSet); + curDist = cpuDist[idx]; + cpuLabelSet.clear(); + gpuLabelSet.clear(); + } + + cpuLabelSet.insert(cpuLabels[idx]); + gpuLabelSet.insert(gpuLabels[idx]); + + // Because the distances are reproducible, they must be exactly the same + EXPECT_EQ(cpuDist[idx], gpuDist[idx]); + } + } +} + +template +void testGpuIndexBinaryFlat(int kOverride = -1) { + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexBinaryFlatConfig config; + config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + // multiples of 8 and multiples of 32 use different implementations + int dims = faiss::gpu::randVal(1, 20) * DimMultiple; + faiss::gpu::GpuIndexBinaryFlat gpuIndex(&res, dims, config); + + faiss::IndexBinaryFlat cpuIndex(dims); + + int k = kOverride > 0 ? 
+ kOverride : faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()); + int numVecs = faiss::gpu::randVal(k + 1, 20000); + int numQuery = faiss::gpu::randVal(1, 1000); + + auto data = faiss::gpu::randBinaryVecs(numVecs, dims); + gpuIndex.add(numVecs, data.data()); + cpuIndex.add(numVecs, data.data()); + + auto query = faiss::gpu::randBinaryVecs(numQuery, dims); + + std::vector cpuDist(numQuery * k); + std::vector cpuLabels(numQuery * k); + + cpuIndex.search(numQuery, + query.data(), + k, + cpuDist.data(), + cpuLabels.data()); + + std::vector gpuDist(numQuery * k); + std::vector gpuLabels(numQuery * k); + + gpuIndex.search(numQuery, + query.data(), + k, + gpuDist.data(), + gpuLabels.data()); + + compareBinaryDist(cpuDist, cpuLabels, + gpuDist, gpuLabels, + numQuery, k); +} + +TEST(TestGpuIndexBinaryFlat, Test8) { + for (int tries = 0; tries < 4; ++tries) { + testGpuIndexBinaryFlat<8>(); + } +} + +TEST(TestGpuIndexBinaryFlat, Test32) { + for (int tries = 0; tries < 4; ++tries) { + testGpuIndexBinaryFlat<32>(); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp new file mode 100644 index 0000000000..7847b63e21 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -0,0 +1,371 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.07f; +constexpr float kF32MaxRelErr = 6e-3f; + +struct TestFlatOptions { + TestFlatOptions() + : useL2(true), + useFloat16(false), + useTransposed(false), + numVecsOverride(-1), + numQueriesOverride(-1), + kOverride(-1), + dimOverride(-1) { + } + + bool useL2; + bool useFloat16; + bool useTransposed; + int numVecsOverride; + int numQueriesOverride; + int kOverride; + int dimOverride; +}; + +void testFlat(const TestFlatOptions& opt) { + int numVecs = opt.numVecsOverride > 0 ? + opt.numVecsOverride : faiss::gpu::randVal(1000, 20000); + int dim = opt.dimOverride > 0 ? + opt.dimOverride : faiss::gpu::randVal(50, 800); + int numQuery = opt.numQueriesOverride > 0 ? + opt.numQueriesOverride : faiss::gpu::randVal(1, 512); + + // Due to loss of precision in a float16 accumulator, for large k, + // the number of differences is pretty huge. Restrict ourselves to a + // fairly small `k` for float16 + int k = opt.useFloat16 ? + std::min(faiss::gpu::randVal(1, 50), numVecs) : + std::min(faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), numVecs); + if (opt.kOverride > 0) { + k = opt.kOverride; + } + + faiss::IndexFlatIP cpuIndexIP(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); + + faiss::IndexFlat* cpuIndex = + opt.useL2 ? 
(faiss::IndexFlat*) &cpuIndexL2 : + (faiss::IndexFlat*) &cpuIndexIP; + + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = opt.useFloat16; + config.storeTransposed = opt.useTransposed; + + faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + + faiss::gpu::GpuIndexFlat* gpuIndex = + opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : + (faiss::gpu::GpuIndexFlat*) &gpuIndexIP; + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex->add(numVecs, vecs.data()); + gpuIndex->add(numVecs, vecs.data()); + + std::stringstream str; + str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs + << " dim " << dim + << " useFloat16 " << opt.useFloat16 + << " transposed " << opt.useTransposed + << " numQuery " << numQuery + << " k " << k; + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), + opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + opt.useFloat16 ? 0.99f : 0.1f, + opt.useFloat16 ? 0.65f : 0.015f); +} + +TEST(TestGpuIndexFlat, IP_Float32) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = false; + opt.useFloat16 = false; + opt.useTransposed = false; + + testFlat(opt); + + opt.useTransposed = true; + testFlat(opt); + } +} + +TEST(TestGpuIndexFlat, L2_Float32) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = true; + opt.useFloat16 = false; + opt.useTransposed = false; + + testFlat(opt); + + opt.useTransposed = true; + testFlat(opt); + } +} + +// test specialized k == 1 codepath +TEST(TestGpuIndexFlat, L2_Float32_K1) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = true; + opt.useFloat16 = false; + opt.useTransposed = false; + opt.kOverride = 1; + + testFlat(opt); + } +} + +TEST(TestGpuIndexFlat, IP_Float16) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = false; + opt.useFloat16 = true; + opt.useTransposed = false; + + testFlat(opt); + + opt.useTransposed = true; + testFlat(opt); + } +} + +TEST(TestGpuIndexFlat, L2_Float16) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = true; + opt.useFloat16 = true; + opt.useTransposed = false; + + testFlat(opt); + + opt.useTransposed = true; + testFlat(opt); + } +} + +// test specialized k == 1 codepath +TEST(TestGpuIndexFlat, L2_Float16_K1) { + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.useL2 = true; + opt.useFloat16 = true; + opt.useTransposed = false; + opt.kOverride = 1; + + testFlat(opt); + } +} + +// test tiling along a huge vector set +TEST(TestGpuIndexFlat, L2_Tiling) { + for (int tries = 0; tries < 2; ++tries) { + TestFlatOptions opt; + opt.useL2 = true; + opt.useFloat16 = false; + opt.useTransposed = false; + opt.numVecsOverride = 1000000; + + // keep the rest of the problem reasonably small + opt.numQueriesOverride = 4; + opt.dimOverride = 64; + opt.kOverride = 64; + + testFlat(opt); + } +} + +TEST(TestGpuIndexFlat, QueryEmpty) { + faiss::gpu::StandardGpuResources res; + 
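// --- Illustrative aside (editorial sketch, not part of the FAISS sources being added) ---
// The tests call res.noTempMemory() so that StandardGpuResources does not reserve its
// default temporary scratch allocation up front; temporary allocations then fall back to
// plain cudaMalloc, which keeps each unit test's footprint small and predictable. A rough
// sketch of the other knobs an application might set instead (the sizes below are arbitrary
// examples for illustration, not recommendations):
//
//   faiss::gpu::StandardGpuResources res;
//   res.setTempMemory(512 * 1024 * 1024);    // cap the scratch arena at 512 MB
//   res.setPinnedMemory(256 * 1024 * 1024);  // pinned staging buffer for CPU <-> GPU copies
//   res.setDefaultNullStreamAllDevices();    // run work on the default CUDA stream
//
// Everything here is kept in comment form so it does not alter the behavior of the test below.
// ------------------------------------------------------------------------------------------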
res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + config.storeTransposed = false; + + int dim = 128; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + // Querying an empty index should not blow up, and just return + // (FLT_MAX, -1) + int numQuery = 10; + int k = 50; + std::vector queries(numQuery * dim, 1.0f); + + std::vector dist(numQuery * k, 0); + std::vector ind(numQuery * k); + + gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); + + for (auto d : dist) { + EXPECT_EQ(d, std::numeric_limits::max()); + } + + for (auto i : ind) { + EXPECT_EQ(i, -1); + } +} + +TEST(TestGpuIndexFlat, CopyFrom) { + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); + + faiss::IndexFlatL2 cpuIndex(dim); + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // Fill with garbage values + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + config.storeTransposed = false; + + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); + + int idx = faiss::gpu::randVal(0, numVecs - 1); + + std::vector gpuVals(dim); + gpuIndex.reconstruct(idx, gpuVals.data()); + + std::vector cpuVals(dim); + cpuIndex.reconstruct(idx, cpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); +} + +TEST(TestGpuIndexFlat, CopyTo) { + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); + + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = false; + config.storeTransposed = false; + + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + gpuIndex.add(numVecs, vecs.data()); + + // Fill with garbage values + faiss::IndexFlatL2 cpuIndex(2000); + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); + + int idx = faiss::gpu::randVal(0, numVecs - 1); + + std::vector gpuVals(dim); + gpuIndex.reconstruct(idx, gpuVals.data()); + + std::vector cpuVals(dim); + cpuIndex.reconstruct(idx, cpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); +} + +TEST(TestGpuIndexFlat, UnifiedMemory) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } + + int dim = 256; + + // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to + // kernel indexing, so we can't test unified memory for memory + // oversubscription. 
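// --- Illustrative aside (editorial sketch, not part of the FAISS sources being added) ---
// Rough arithmetic behind the FIXME above: with 32-bit kernel indexing the flat index is
// limited to about 2^31 ~= 2.1e9 (vecs * dims) elements. At dim = 256 that is roughly
// 2^31 / 256 = 8,388,608 vectors, i.e. about 8 GiB of float32 data
// (8,388,608 * 256 * 4 bytes), which is generally not enough to oversubscribe a modern
// GPU's memory. That is why this test only checks that a modest 50,000-vector index still
// works when the index data is placed in the unified (managed) memory space.
// ------------------------------------------------------------------------------------------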
+ size_t numVecs = 50000; + int numQuery = 10; + int k = 10; + + faiss::IndexFlatL2 cpuIndexL2(dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndexL2.add(numVecs, vecs.data()); + gpuIndexL2.add(numVecs, vecs.data()); + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices(cpuIndexL2, gpuIndexL2, + numQuery, dim, k, "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFFlat.cpp new file mode 100644 index 0000000000..6304252e6b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -0,0 +1,550 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.3f; +constexpr float kF32MaxRelErr = 0.03f; + + +struct Options { + Options() { + numAdd = 2 * faiss::gpu::randVal(2000, 5000); + dim = faiss::gpu::randVal(64, 200); + + numCentroids = std::sqrt((float) numAdd / 2); + numTrain = numCentroids * 40; + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + numQuery = faiss::gpu::randVal(32, 100); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, only + // use a small k + k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); + indicesOpt = faiss::gpu::randSelect({ + faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFFlat device " << device + << " numVecs " << numAdd + << " dim " << dim + << " numCentroids " << numCentroids + << " nprobe " << nprobe + << " numQuery " << numQuery + << " k " << k + << " indicesOpt " << indicesOpt; + + return str.str(); + } + + int numAdd; + int dim; + int numCentroids; + int numTrain; + int nprobe; + int numQuery; + int k; + int device; + faiss::gpu::IndicesOptions indicesOpt; +}; + +void queryTest(faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + int dimOverride = -1) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.dim = dimOverride != -1 ? dimOverride : opt.dim; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = + metricType == faiss::METRIC_L2 ? 
+ (faiss::Index*) &quantizerL2 : (faiss::Index*) &quantizerIP; + + faiss::IndexIVFFlat cpuIndex(quantizer, + opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.65f : 0.015f); + } +} + +void addTest(faiss::MetricType metricType, + bool useFloat16CoarseQuantizer) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = + metricType == faiss::METRIC_L2 ? + (faiss::Index*) &quantizerL2 : (faiss::Index*) &quantizerIP; + + faiss::IndexIVFFlat cpuIndex(quantizer, + opt.dim, + opt.numCentroids, + metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); + } +} + +void copyToTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + opt.dim, + opt.numCentroids, + faiss::METRIC_L2, + config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.setNumProbes(opt.nprobe); + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +void copyFromTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, + opt.dim, + opt.numCentroids, + faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // use garbage values to see if we overwrite then + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + 1, + 1, + faiss::METRIC_L2, + config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); +} + +TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) { + addTest(faiss::METRIC_L2, false); +} + +TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, false); +} + +TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) { + addTest(faiss::METRIC_L2, true); +} + +TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, true); +} + +// +// General query tests +// + +TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { + queryTest(faiss::METRIC_L2, false); +} + +TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { + queryTest(faiss::METRIC_INNER_PRODUCT, false); +} + +// float16 coarse quantizer + +TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { + queryTest(faiss::METRIC_L2, true); +} + +TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { + queryTest(faiss::METRIC_INNER_PRODUCT, true); +} + +// +// There are IVF list scanning specializations for 64-d and 128-d that we +// make sure we explicitly test here +// + +TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { + queryTest(faiss::METRIC_L2, false, 64); +} + +TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { + queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +} + +TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { + queryTest(faiss::METRIC_L2, false, 128); +} + +TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { + queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +} + +// +// Copy tests +// + +TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { + copyToTest(false); +} + +TEST(TestGpuIndexIVFFlat, Float32_32_CopyFrom) { + copyFromTest(false); +} + +TEST(TestGpuIndexIVFFlat, Float32_negative) { + Options opt; + + auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + // Put all vecs on negative side + for (auto& f : trainVecs) { + f = std::abs(f) * -1.0f; + } + + for (auto& f : addVecs) { + f *= std::abs(f) * -1.0f; + } + + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = (faiss::Index*) &quantizerIP; + + faiss::IndexIVFFlat cpuIndex(quantizer, + opt.dim, opt.numCentroids, + faiss::METRIC_INNER_PRODUCT); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } + + bool compFloat16 = false; + faiss::gpu::compareIndices(queryVecs, + cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 
0.65f : 0.015f); +} + +// +// NaN tests +// + +TEST(TestGpuIndexIVFFlat, QueryNaN) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = faiss::gpu::randBool(); + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + opt.dim, + opt.numCentroids, + faiss::METRIC_L2, + config); + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + int numQuery = 10; + std::vector nans(numQuery * opt.dim, + std::numeric_limits::quiet_NaN()); + + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); + + gpuIndex.search(numQuery, + nans.data(), + opt.k, + distances.data(), + indices.data()); + + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ(distances[q * opt.k + k], std::numeric_limits::max()); + } + } +} + +TEST(TestGpuIndexIVFFlat, AddNaN) { + Options opt; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = faiss::gpu::randBool(); + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, + opt.dim, + opt.numCentroids, + faiss::METRIC_L2, + config); + gpuIndex.setNumProbes(opt.nprobe); + + int numNans = 10; + std::vector nans(numNans * opt.dim, + std::numeric_limits::quiet_NaN()); + + // Make one vector valid, which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[i] = 0.0f; + } + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search(opt.numQuery, queryVecs.data(), opt.k, + distance.data(), indices.data()); +} + +TEST(TestGpuIndexIVFFlat, UnifiedMemory) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } + + int dim = 128; + + int numCentroids = 256; + // Unfortunately it would take forever to add 24 GB in IVFPQ data, + // so just perform a small test with data allocated in the unified + // memory address space + size_t numAdd = 10000; + size_t numTrain = numCentroids * 40; + int numQuery = 10; + int k = 10; + int nprobe = 8; + + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFFlat cpuIndex(&quantizer, dim, numCentroids, faiss::METRIC_L2); + + cpuIndex.train(numTrain, trainVecs.data()); + cpuIndex.add(numAdd, addVecs.data()); + cpuIndex.nprobe = nprobe; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + + faiss::gpu::GpuIndexIVFFlat 
gpuIndex(&res, + dim, + numCentroids, + faiss::METRIC_L2, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(nprobe); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + numQuery, dim, k, "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp new file mode 100644 index 0000000000..0a461b63c3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp @@ -0,0 +1,450 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +void pickEncoding(int& codes, int& dim) { + std::vector codeSizes{ + 3, 4, 8, 12, 16, 20, 24, + 28, 32, 40, 48, 56, 64, 96 + }; + + // Above 32 doesn't work with no precomputed codes + std::vector dimSizes{4, 8, 10, 12, 16, 20, 24, 28, 32}; + + while (true) { + codes = codeSizes[faiss::gpu::randVal(0, codeSizes.size() - 1)]; + dim = codes * dimSizes[faiss::gpu::randVal(0, dimSizes.size() - 1)]; + + // for such a small test, super-low or high dim is more likely to + // generate comparison errors + if (dim < 256 && dim >= 64) { + return; + } + } +} + +struct Options { + Options() { + numAdd = faiss::gpu::randVal(2000, 5000); + numCentroids = std::sqrt((float) numAdd); + numTrain = numCentroids * 40; + + pickEncoding(codes, dim); + + // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we officially + // support non-multiple of 8 subcodes for IVFPQ. + bitsPerCode = 8; + nprobe = std::min(faiss::gpu::randVal(40, 1000), numCentroids); + numQuery = faiss::gpu::randVal(1, 8); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, only + // use a small k + k = std::min(faiss::gpu::randVal(5, 20), numAdd / 40); + usePrecomputed = faiss::gpu::randBool(); + indicesOpt = faiss::gpu::randSelect({ + faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + if (codes > 48) { + // large codes can only fit using float16 + useFloat16 = true; + } else { + useFloat16 = faiss::gpu::randBool(); + } + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFPQ device " << device + << " numVecs " << numAdd + << " dim " << dim + << " numCentroids " << numCentroids + << " codes " << codes + << " bitsPerCode " << bitsPerCode + << " nprobe " << nprobe + << " numQuery " << numQuery + << " k " << k + << " usePrecomputed " << usePrecomputed + << " indicesOpt " << indicesOpt + << " useFloat16 " << useFloat16; + + return str.str(); + } + + float getCompareEpsilon() const { + return 0.03f; + } + + float getPctMaxDiff1() const { + return useFloat16 ? 0.30f : 0.10f; + } + + float getPctMaxDiffN() const { + return useFloat16 ? 
0.05f : 0.02f; + } + + int numAdd; + int numCentroids; + int numTrain; + int codes; + int dim; + int bitsPerCode; + int nprobe; + int numQuery; + int k; + bool usePrecomputed; + faiss::gpu::IndicesOptions indicesOpt; + bool useFloat16; + int device; +}; + +TEST(TestGpuIndexIVFPQ, Query) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); + } +} + +TEST(TestGpuIndexIVFPQ, Add) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); + } +} + +TEST(TestGpuIndexIVFPQ, CopyTo) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, + opt.dim, + opt.numCentroids, + opt.codes, + opt.bitsPerCode, + faiss::METRIC_L2, + config); + gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + // Use garbage values to see if we overwrite them + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFPQ cpuIndex(&cpuQuantizer, 1, 1, 1, 1); + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + 
EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers()); + EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes); + EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode()); + EXPECT_EQ(gpuIndex.getBitsPerCode(), opt.bitsPerCode); + + // Query both objects; results should be equivalent + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, CopyFrom) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // Use garbage values to see if we overwrite them + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ + gpuIndex(&res, 1, 1, 1, 1, faiss::METRIC_L2, config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + // Make sure we are equivalent + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers()); + EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes); + EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode()); + EXPECT_EQ(gpuIndex.getBitsPerCode(), opt.bitsPerCode); + + // Query both objects; results should be equivalent + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, QueryNaN) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, + opt.dim, + opt.numCentroids, + opt.codes, + opt.bitsPerCode, + faiss::METRIC_L2, + config); + + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + int numQuery = 5; + std::vector nans(numQuery * opt.dim, + std::numeric_limits::quiet_NaN()); + + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); + + gpuIndex.search(numQuery, + nans.data(), + opt.k, + distances.data(), + indices.data()); + + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ(distances[q * opt.k + k], std::numeric_limits::max()); + } + } +} + +TEST(TestGpuIndexIVFPQ, 
AddNaN) { + Options opt; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, + opt.dim, + opt.numCentroids, + opt.codes, + opt.bitsPerCode, + faiss::METRIC_L2, + config); + + gpuIndex.setNumProbes(opt.nprobe); + + int numNans = 10; + std::vector nans(numNans * opt.dim, + std::numeric_limits::quiet_NaN()); + + // Make one vector valid, which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[i] = 0.0f; + } + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search(opt.numQuery, queryVecs.data(), opt.k, + distance.data(), indices.data()); +} + +TEST(TestGpuIndexIVFPQ, UnifiedMemory) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } + + int dim = 128; + + int numCentroids = 256; + // Unfortunately it would take forever to add 24 GB in IVFPQ data, + // so just perform a small test with data allocated in the unified + // memory address space + size_t numAdd = 10000; + size_t numTrain = numCentroids * 40; + int numQuery = 10; + int k = 10; + int nprobe = 8; + int codes = 8; + int bitsPerCode = 8; + + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFPQ cpuIndex(&quantizer, dim, numCentroids, codes, bitsPerCode); + + cpuIndex.train(numTrain, trainVecs.data()); + cpuIndex.add(numAdd, addVecs.data()); + cpuIndex.nprobe = nprobe; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, + dim, + numCentroids, + codes, + bitsPerCode, + faiss::METRIC_L2, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(nprobe); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + numQuery, dim, k, "Unified Memory", + 0.015f, + 0.1f, + 0.015f); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuMemoryException.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuMemoryException.cpp new file mode 100644 index 0000000000..e3bca1d86a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuMemoryException.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include + +// Test to see if we can recover after attempting to allocate too much GPU +// memory +TEST(TestGpuMemoryException, AddException) { + size_t numBrokenAdd = std::numeric_limits::max(); + size_t numRealAdd = 10000; + size_t devFree = 0; + size_t devTotal = 0; + + CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal)); + + // Figure out the dimensionality needed to get at least greater than devTotal + size_t brokenAddDims = ((devTotal / sizeof(float)) / numBrokenAdd) + 1; + size_t realAddDims = 128; + + faiss::gpu::StandardGpuResources res; + + faiss::gpu::GpuIndexFlatConfig config; + config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::GpuIndexFlatL2 + gpuIndexL2Broken(&res, (int) brokenAddDims, config); + faiss::gpu::GpuIndexFlatL2 + gpuIndexL2(&res, (int) realAddDims, config); + faiss::IndexFlatL2 + cpuIndex((int) realAddDims); + + // Should throw on attempting to allocate too much data + { + // allocate memory without initialization + auto vecs = + std::unique_ptr(new float[numBrokenAdd * brokenAddDims]); + EXPECT_THROW(gpuIndexL2Broken.add(numBrokenAdd, vecs.get()), + faiss::FaissException); + } + + // Should be able to add a smaller set of data now + { + auto vecs = faiss::gpu::randVecs(numRealAdd, realAddDims); + EXPECT_NO_THROW(gpuIndexL2.add(numRealAdd, vecs.data())); + cpuIndex.add(numRealAdd, vecs.data()); + } + + // Should throw on attempting to allocate too much data + { + // allocate memory without initialization + auto vecs = + std::unique_ptr(new float[numBrokenAdd * brokenAddDims]); + EXPECT_THROW(gpuIndexL2Broken.add(numBrokenAdd, vecs.get()), + faiss::FaissException); + } + + // Should be able to query results from what we had before + { + size_t numQuery = 10; + auto vecs = faiss::gpu::randVecs(numQuery, realAddDims); + EXPECT_NO_THROW(compareIndices(vecs, cpuIndex, gpuIndexL2, + numQuery, realAddDims, 50, "", + 6e-3f, 0.1f, 0.015f)); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu new file mode 100644 index 0000000000..35d5b95505 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu @@ -0,0 +1,190 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void testForSize(int rows, int cols, int k, bool dir, bool warp) { + std::vector v = faiss::gpu::randVecs(rows, cols); + faiss::gpu::HostTensor hostVal({rows, cols}); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + hostVal[r][c] = v[r * cols + c]; + } + } + + // row -> (val -> idx) + std::unordered_map>> hostOutValAndInd; + for (int r = 0; r < rows; ++r) { + std::vector> closest; + + for (int c = 0; c < cols; ++c) { + closest.emplace_back(c, (float) hostVal[r][c]); + } + + auto dirFalseFn = + [](std::pair& a, std::pair& b) { + return a.second < b.second; + }; + auto dirTrueFn = + [](std::pair& a, std::pair& b) { + return a.second > b.second; + }; + + std::sort(closest.begin(), closest.end(), dir ? 
dirTrueFn : dirFalseFn); + hostOutValAndInd.emplace(r, closest); + } + + // Select top-k on GPU + faiss::gpu::DeviceTensor gpuVal(hostVal, 0); + faiss::gpu::DeviceTensor gpuOutVal({rows, k}); + faiss::gpu::DeviceTensor gpuOutInd({rows, k}); + + if (warp) { + faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); + } else { + faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); + } + + // Copy back to CPU + faiss::gpu::HostTensor outVal(gpuOutVal, 0); + faiss::gpu::HostTensor outInd(gpuOutInd, 0); + + for (int r = 0; r < rows; ++r) { + std::unordered_map seenIndices; + + for (int i = 0; i < k; ++i) { + float gpuV = outVal[r][i]; + float cpuV = hostOutValAndInd[r][i].second; + + EXPECT_EQ(gpuV, cpuV) << + "rows " << rows << " cols " << cols << " k " << k << " dir " << dir + << " row " << r << " ind " << i; + + // If there are identical elements in a row that should be + // within the top-k, then it is possible that the index can + // differ, because the order in which the GPU will see the + // equivalent values is different than the CPU (and will remain + // unspecified, since this is affected by the choice of + // k-selection algorithm that we use) + int gpuInd = outInd[r][i]; + int cpuInd = hostOutValAndInd[r][i].first; + + // We should never see duplicate indices, however + auto itSeenIndex = seenIndices.find(gpuInd); + + EXPECT_EQ(itSeenIndex, seenIndices.end()) << + "Row " << r << " user index " << gpuInd << " was seen at both " << + itSeenIndex->second << " and " << i; + + seenIndices[gpuInd] = i; + + if (gpuInd != cpuInd) { + // Gather the values from the original data via index; the + // values should be the same + float gpuGatherV = hostVal[r][gpuInd]; + float cpuGatherV = hostVal[r][cpuInd]; + + EXPECT_EQ(gpuGatherV, cpuGatherV) << + "rows " << rows << " cols " << cols << " k " << k << " dir " << dir + << " row " << r << " ind " << i << " source ind " + << gpuInd << " " << cpuInd; + } + } + } +} + +// General test +TEST(TestGpuSelect, test) { + for (int i = 0; i < 10; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, 30000); + int k = std::min(cols, faiss::gpu::randVal(1, GPU_MAX_SELECTION_K)); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, k, dir, false); + } +} + +// Test for k = 1 +TEST(TestGpuSelect, test1) { + for (int i = 0; i < 5; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, 30000); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, 1, dir, false); + } +} + +// Test for where k = #cols exactly (we are returning all the values, +// just sorted) +TEST(TestGpuSelect, testExact) { + for (int i = 0; i < 5; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, GPU_MAX_SELECTION_K); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, cols, dir, false); + } +} + +// General test +TEST(TestGpuSelect, testWarp) { + for (int i = 0; i < 10; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, 30000); + int k = std::min(cols, faiss::gpu::randVal(1, GPU_MAX_SELECTION_K)); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, k, dir, true); + } +} + +// Test for k = 1 +TEST(TestGpuSelect, test1Warp) { + for (int i = 0; i < 5; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, 30000); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, 1, dir, true); + } +} + +// Test for where k = #cols exactly (we are returning all the 
values, +// just sorted) +TEST(TestGpuSelect, testExactWarp) { + for (int i = 0; i < 5; ++i) { + int rows = faiss::gpu::randVal(10, 100); + int cols = faiss::gpu::randVal(1, GPU_MAX_SELECTION_K); + bool dir = faiss::gpu::randBool(); + + testForSize(rows, cols, cols, dir, true); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestUtils.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestUtils.cpp new file mode 100644 index 0000000000..423d58b87d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestUtils.cpp @@ -0,0 +1,315 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +inline float relativeError(float a, float b) { + return std::abs(a - b) / (0.5f * (std::abs(a) + std::abs(b))); +} + +// This seed is also used for the faiss float_rand API; in a test it +// is all within a single thread, so it is ok +long s_seed = 1; + +void newTestSeed() { + struct timespec t; + clock_gettime(CLOCK_REALTIME, &t); + + setTestSeed(t.tv_nsec); +} + +void setTestSeed(long seed) { + printf("testing with random seed %ld\n", seed); + + srand48(seed); + s_seed = seed; +} + +int randVal(int a, int b) { + EXPECT_GE(a, 0); + EXPECT_LE(a, b); + + return a + (lrand48() % (b + 1 - a)); +} + +bool randBool() { + return randSelect({true, false}); +} + +std::vector randVecs(size_t num, size_t dim) { + std::vector v(num * dim); + + faiss::float_rand(v.data(), v.size(), s_seed); + // unfortunately we generate separate sets of vectors, and don't + // want the same values + ++s_seed; + + return v; +} + +std::vector randBinaryVecs(size_t num, size_t dim) { + std::vector v(num * (dim / 8)); + + faiss::byte_rand(v.data(), v.size(), s_seed); + // unfortunately we generate separate sets of vectors, and don't + // want the same values + ++s_seed; + + return v; +} + +void compareIndices( + const std::vector& queryVecs, + faiss::Index& refIndex, + faiss::Index& testIndex, + int numQuery, + int /*dim*/, + int k, + const std::string& configMsg, + float maxRelativeError, + float pctMaxDiff1, + float pctMaxDiffN) { + // Compare + std::vector refDistance(numQuery * k, 0); + std::vector refIndices(numQuery * k, -1); + refIndex.search(numQuery, queryVecs.data(), + k, refDistance.data(), refIndices.data()); + + std::vector testDistance(numQuery * k, 0); + std::vector testIndices(numQuery * k, -1); + testIndex.search(numQuery, queryVecs.data(), + k, testDistance.data(), testIndices.data()); + + faiss::gpu::compareLists(refDistance.data(), + refIndices.data(), + testDistance.data(), + testIndices.data(), + numQuery, k, + configMsg, + true, false, true, + maxRelativeError, pctMaxDiff1, pctMaxDiffN); +} + +void compareIndices(faiss::Index& refIndex, + faiss::Index& testIndex, + int numQuery, int dim, int k, + const std::string& configMsg, + float maxRelativeError, + float pctMaxDiff1, + float pctMaxDiffN) { + auto queryVecs = faiss::gpu::randVecs(numQuery, dim); + + compareIndices(queryVecs, + refIndex, + testIndex, + numQuery, dim, k, + configMsg, + maxRelativeError, + pctMaxDiff1, + pctMaxDiffN); +} + +template +inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) 
{ + return p[i * dim2 + j]; +} + +void compareLists(const float* refDist, + const faiss::Index::idx_t* refInd, + const float* testDist, + const faiss::Index::idx_t* testInd, + int dim1, int dim2, + const std::string& configMsg, + bool printBasicStats, bool printDiffs, bool assertOnErr, + float maxRelativeError, + float pctMaxDiff1, + float pctMaxDiffN) { + + float maxAbsErr = 0.0f; + for (int i = 0; i < dim1 * dim2; ++i) { + maxAbsErr = std::max(maxAbsErr, std::abs(refDist[i] - testDist[i])); + } + int numResults = dim1 * dim2; + + // query -> {index -> result position} + std::vector> refIndexMap; + + for (int query = 0; query < dim1; ++query) { + std::unordered_map indices; + + for (int result = 0; result < dim2; ++result) { + indices[lookup(refInd, query, result, dim1, dim2)] = result; + } + + refIndexMap.emplace_back(std::move(indices)); + } + + // See how far off the indices are + // Keep track of the difference for each entry + std::vector> indexDiffs; + + int diff1 = 0; // index differs by 1 + int diffN = 0; // index differs by >1 + int diffInf = 0; // index not found in the other + int nonUniqueIndices = 0; + + double avgDiff = 0.0; + int maxDiff = 0; + float maxRelErr = 0.0f; + + for (int query = 0; query < dim1; ++query) { + std::vector diffs; + std::set uniqueIndices; + + auto& indices = refIndexMap[query]; + + for (int result = 0; result < dim2; ++result) { + auto t = lookup(testInd, query, result, dim1, dim2); + + // All indices reported within a query should be unique; this is + // a serious error if is otherwise the case. + // If -1 is reported (no result due to IVF partitioning or not enough + // entries in the index), then duplicates are allowed, but both the + // reference and test must have -1 in the same position. + if (t == -1) { + EXPECT_EQ(lookup(refInd, query, result, dim1, dim2), t); + } else { + bool uniqueIndex = uniqueIndices.count(t) == 0; + if (assertOnErr) { + EXPECT_TRUE(uniqueIndex) << configMsg + << " " << query + << " " << result + << " " << t; + } + + if (!uniqueIndex) { + ++nonUniqueIndices; + } else { + uniqueIndices.insert(t); + } + + auto it = indices.find(t); + if (it != indices.end()) { + int diff = std::abs(result - it->second); + diffs.push_back(diff); + + if (diff == 1) { + ++diff1; + maxDiff = std::max(diff, maxDiff); + } else if (diff > 1) { + ++diffN; + maxDiff = std::max(diff, maxDiff); + } + + avgDiff += (double) diff; + } else { + ++diffInf; + diffs.push_back(-1); + // don't count this for maxDiff + } + } + + auto refD = lookup(refDist, query, result, dim1, dim2); + auto testD = lookup(testDist, query, result, dim1, dim2); + + float relErr = relativeError(refD, testD); + + if (assertOnErr) { + EXPECT_LE(relErr, maxRelativeError) << configMsg + << " (" << query << ", " << result + << ") refD: " << refD + << " testD: " << testD; + } + + maxRelErr = std::max(maxRelErr, relErr); + } + + indexDiffs.emplace_back(std::move(diffs)); + } + + if (assertOnErr) { + EXPECT_LE((float) (diff1 + diffN + diffInf), + (float) numResults * pctMaxDiff1) << configMsg; + + // Don't count diffInf because that could be diff1 as far as we + // know + EXPECT_LE((float) diffN, (float) numResults * pctMaxDiffN) << configMsg; + } + + avgDiff /= (double) numResults; + + if (printBasicStats) { + if (!configMsg.empty()) { + printf("Config\n" + "----------------------------\n" + "%s\n", + configMsg.c_str()); + } + + printf("Result error and differences\n" + "----------------------------\n" + "max abs diff %.7f rel diff %.7f\n" + "idx diff avg: %.5g max: %d\n" + "idx diff of 
1: %d (%.3f%% of queries)\n" + "idx diff of >1: %d (%.3f%% of queries)\n" + "idx diff not found: %d (%.3f%% of queries)" + " [typically a last element inversion]\n" + "non-unique indices: %d (a serious error if >0)\n", + maxAbsErr, maxRelErr, + avgDiff, maxDiff, + diff1, 100.0f * (float) diff1 / (float) numResults, + diffN, 100.0f * (float) diffN / (float) numResults, + diffInf, 100.0f * (float) diffInf / (float) numResults, + nonUniqueIndices); + } + + if (printDiffs) { + printf("differences:\n"); + printf("==================\n"); + for (int query = 0; query < dim1; ++query) { + for (int result = 0; result < dim2; ++result) { + long refI = lookup(refInd, query, result, dim1, dim2); + long testI = lookup(testInd, query, result, dim1, dim2); + + if (refI != testI) { + float refD = lookup(refDist, query, result, dim1, dim2); + float testD = lookup(testDist, query, result, dim1, dim2); + + float maxDist = std::max(refD, testD); + float delta = std::abs(refD - testD); + + float relErr = delta / maxDist; + + if (refD == testD) { + printf("(%d, %d [%d]) (ref %ld tst %ld dist ==)\n", + query, result, + indexDiffs[query][result], + refI, testI); + } else { + printf("(%d, %d [%d]) (ref %ld tst %ld abs %.8f " + "rel %.8f ref %a tst %a)\n", + query, result, + indexDiffs[query][result], + refI, testI, delta, relErr, refD, testD); + } + } + } + } + } +} + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestUtils.h b/core/src/index/thirdparty/faiss/gpu/test/TestUtils.h new file mode 100644 index 0000000000..c59a4ab0ae --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/TestUtils.h @@ -0,0 +1,93 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Generates and displays a new seed for the test +void newTestSeed(); + +/// Uses an explicit seed for the test +void setTestSeed(long seed); + +/// Returns the relative error in difference between a and b +/// (|a - b| / (0.5 * (|a| + |b|)) +float relativeError(float a, float b); + +/// Generates a random integer in the range [a, b] +int randVal(int a, int b); + +/// Generates a random bool +bool randBool(); + +/// Select a random value from the given list of values provided as an +/// initializer_list +template +T randSelect(std::initializer_list vals) { + FAISS_ASSERT(vals.size() > 0); + int sel = randVal(0, vals.size()); + + int i = 0; + for (auto v : vals) { + if (i++ == sel) { + return v; + } + } + + // should not get here + return *vals.begin(); +} + +/// Generates a collection of random vectors in the range [0, 1] +std::vector randVecs(size_t num, size_t dim); + +/// Generates a collection of random bit vectors +std::vector randBinaryVecs(size_t num, size_t dim); + +/// Compare two indices via query for similarity, with a user-specified set of +/// query vectors +void compareIndices(const std::vector& queryVecs, + faiss::Index& refIndex, + faiss::Index& testIndex, + int numQuery, int dim, int k, + const std::string& configMsg, + float maxRelativeError = 6e-5f, + float pctMaxDiff1 = 0.1f, + float pctMaxDiffN = 0.005f); + +/// Compare two indices via query for similarity, generating random query +/// vectors +void compareIndices(faiss::Index& refIndex, + faiss::Index& testIndex, + int numQuery, int dim, int k, + const std::string& configMsg, + float maxRelativeError = 6e-5f, + float pctMaxDiff1 = 0.1f, + float pctMaxDiffN = 0.005f); + +/// Display specific differences in the two (distance, index) lists +void compareLists(const float* refDist, + const faiss::Index::idx_t* refInd, + const float* testDist, + const faiss::Index::idx_t* testInd, + int dim1, int dim2, + const std::string& configMsg, + bool printBasicStats, bool printDiffs, bool assertOnErr, + float maxRelativeError = 6e-5f, + float pctMaxDiff1 = 0.1f, + float pctMaxDiffN = 0.005f); + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp b/core/src/index/thirdparty/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp new file mode 100644 index 0000000000..852a43cbe9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp @@ -0,0 +1,159 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. 
All Rights Reserved + + +#include +#include +#include + +#include + + +#include +#include + +#include +#include + +double elapsed () +{ + struct timeval tv; + gettimeofday (&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + + +int main () +{ + + double t0 = elapsed(); + + // dimension of the vectors to index + int d = 128; + + // size of the database we plan to index + size_t nb = 200 * 1000; + + // make a set of nt training vectors in the unit cube + // (could be the database) + size_t nt = 100 * 1000; + + int dev_no = 0; + /* + printf ("[%.3f s] Begin d=%d nb=%ld nt=%nt dev_no=%d\n", + elapsed() - t0, d, nb, nt, dev_no); + */ + // a reasonable number of centroids to index nb vectors + int ncentroids = int (4 * sqrt (nb)); + + faiss::gpu::StandardGpuResources resources; + + + // the coarse quantizer should not be dealloced before the index + // 4 = nb of bytes per code (d must be a multiple of this) + // 8 = nb of bits per sub-code (almost always 8) + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = dev_no; + + faiss::gpu::GpuIndexIVFPQ index ( + &resources, d, ncentroids, 4, 8, faiss::METRIC_L2, config); + + { // training + printf ("[%.3f s] Generating %ld vectors in %dD for training\n", + elapsed() - t0, nt, d); + + std::vector trainvecs (nt * d); + for (size_t i = 0; i < nt * d; i++) { + trainvecs[i] = drand48(); + } + + printf ("[%.3f s] Training the index\n", + elapsed() - t0); + index.verbose = true; + + index.train (nt, trainvecs.data()); + } + + { // I/O demo + const char *outfilename = "/tmp/index_trained.faissindex"; + printf ("[%.3f s] storing the pre-trained index to %s\n", + elapsed() - t0, outfilename); + + faiss::Index * cpu_index = faiss::gpu::index_gpu_to_cpu (&index); + + write_index (cpu_index, outfilename); + + delete cpu_index; + } + + size_t nq; + std::vector queries; + + { // populating the database + printf ("[%.3f s] Building a dataset of %ld vectors to index\n", + elapsed() - t0, nb); + + std::vector database (nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + + printf ("[%.3f s] Adding the vectors to the index\n", + elapsed() - t0); + + index.add (nb, database.data()); + + printf ("[%.3f s] done\n", elapsed() - t0); + + // remember a few elements from the database as queries + int i0 = 1234; + int i1 = 1243; + + nq = i1 - i0; + queries.resize (nq * d); + for (int i = i0; i < i1; i++) { + for (int j = 0; j < d; j++) { + queries [(i - i0) * d + j] = database [i * d + j]; + } + } + + } + + { // searching the database + int k = 5; + printf ("[%.3f s] Searching the %d nearest neighbors " + "of %ld vectors in the index\n", + elapsed() - t0, k, nq); + + std::vector nns (k * nq); + std::vector dis (k * nq); + + index.search (nq, queries.data(), k, dis.data(), nns.data()); + + printf ("[%.3f s] Query results (vector ids, then distances):\n", + elapsed() - t0); + + for (int i = 0; i < nq; i++) { + printf ("query %2d: ", i); + for (int j = 0; j < k; j++) { + printf ("%7ld ", nns[j + i * k]); + } + printf ("\n dis: "); + for (int j = 0; j < k; j++) { + printf ("%7g ", dis[j + i * k]); + } + printf ("\n"); + } + + printf ("note that the nearest neighbor is not at " + "distance 0 due to quantization errors\n"); + } + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py new file mode 100644 index 0000000000..4b291febcb --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py @@ -0,0 +1,274 @@ +# Copyright (c) 
Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +from __future__ import print_function +import time +import unittest +import numpy as np +import faiss + + +class EvalIVFPQAccuracy(unittest.TestCase): + + def get_dataset(self, small_one=False): + if not small_one: + d = 128 + nb = 100000 + nt = 15000 + nq = 2000 + else: + d = 32 + nb = 1000 + nt = 1000 + nq = 200 + np.random.seed(123) + + # generate points in a low-dim subspace to make the resutls + # look better :-) + d1 = 16 + q, r = np.linalg.qr(np.random.randn(d, d)) + qc = q[:d1, :] + def make_mat(n): + return np.dot( + np.random.random(size=(nb, d1)), qc).astype('float32') + + return (make_mat(nt), make_mat(nb), make_mat(nq)) + + + def test_mm(self): + # trouble with MKL+fbmake that appears only at runtime. Check it here + x = np.random.random(size=(100, 20)).astype('float32') + mat = faiss.PCAMatrix(20, 10) + mat.train(x) + mat.apply_py(x) + + def do_cpu_to_gpu(self, index_key): + ts = [] + ts.append(time.time()) + (xt, xb, xq) = self.get_dataset(small_one=True) + nb, d = xb.shape + + index = faiss.index_factory(d, index_key) + if index.__class__ == faiss.IndexIVFPQ: + # speed up test + index.pq.cp.niter = 2 + index.do_polysemous_training = False + ts.append(time.time()) + + index.train(xt) + ts.append(time.time()) + + # adding some ids because there was a bug in this case + index.add_with_ids(xb, np.arange(nb) * 3 + 12345) + ts.append(time.time()) + + index.nprobe = 4 + D, Iref = index.search(xq, 10) + ts.append(time.time()) + + res = faiss.StandardGpuResources() + gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + ts.append(time.time()) + + gpu_index.setNumProbes(4) + + D, Inew = gpu_index.search(xq, 10) + ts.append(time.time()) + print('times:', [t - ts[0] for t in ts]) + + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size) + + if faiss.get_num_gpus() == 1: + return + + for shard in False, True: + + # test on just 2 GPUs + res = [faiss.StandardGpuResources() for i in range(2)] + co = faiss.GpuMultipleClonerOptions() + co.shard = shard + + gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co) + + faiss.GpuParameterSpace().set_index_parameter( + gpu_index, 'nprobe', 4) + + D, Inew = gpu_index.search(xq, 10) + + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size) + + def test_cpu_to_gpu_IVFPQ(self): + self.do_cpu_to_gpu('IVF128,PQ4') + + def test_cpu_to_gpu_IVFFlat(self): + self.do_cpu_to_gpu('IVF128,Flat') + + def test_set_gpu_param(self): + index = faiss.index_factory(12, "PCAR8,IVF10,PQ4") + res = faiss.StandardGpuResources() + gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3) + + +class ReferencedObject(unittest.TestCase): + + d = 16 + xb = np.random.rand(256, d).astype('float32') + nlist = 128 + + d_bin = 256 + xb_bin = np.random.randint(256, size=(10000, d_bin // 8)).astype('uint8') + xq_bin = np.random.randint(256, size=(1000, d_bin // 8)).astype('uint8') + + def test_proxy(self): + index = faiss.IndexReplicas() + for i in range(3): + sub_index = faiss.IndexFlatL2(self.d) + sub_index.add(self.xb) + index.addIndex(sub_index) + assert index.d == self.d + index.search(self.xb, 10) + + def test_resources(self): + # this used to crash! 
+ index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, + faiss.IndexFlatL2(self.d)) + index.add(self.xb) + + def test_flat(self): + index = faiss.GpuIndexFlat(faiss.StandardGpuResources(), + self.d, faiss.METRIC_L2) + index.add(self.xb) + + def test_ivfflat(self): + index = faiss.GpuIndexIVFFlat( + faiss.StandardGpuResources(), + self.d, self.nlist, faiss.METRIC_L2) + index.train(self.xb) + + def test_ivfpq(self): + index_cpu = faiss.IndexIVFPQ( + faiss.IndexFlatL2(self.d), + self.d, self.nlist, 2, 8) + # speed up test + index_cpu.pq.cp.niter = 2 + index_cpu.do_polysemous_training = False + index_cpu.train(self.xb) + + index = faiss.GpuIndexIVFPQ( + faiss.StandardGpuResources(), index_cpu) + index.add(self.xb) + + def test_binary_flat(self): + k = 10 + + index_ref = faiss.IndexBinaryFlat(self.d_bin) + index_ref.add(self.xb_bin) + D_ref, I_ref = index_ref.search(self.xq_bin, k) + + index = faiss.GpuIndexBinaryFlat(faiss.StandardGpuResources(), + self.d_bin) + index.add(self.xb_bin) + D, I = index.search(self.xq_bin, k) + + for d_ref, i_ref, d_new, i_new in zip(D_ref, I_ref, D, I): + # exclude max distance + assert d_ref.max() == d_new.max() + dmax = d_ref.max() + + # sort by (distance, id) pairs to be reproducible + ref = [(d, i) for d, i in zip(d_ref, i_ref) if d < dmax] + ref.sort() + + new = [(d, i) for d, i in zip(d_new, i_new) if d < dmax] + new.sort() + + assert ref == new + + def test_stress(self): + # a mixture of the above, from issue #631 + target = np.random.rand(50, 16).astype('float32') + + index = faiss.IndexReplicas() + size, dim = target.shape + num_gpu = 4 + for i in range(num_gpu): + config = faiss.GpuIndexFlatConfig() + config.device = 0 # simulate on a single GPU + sub_index = faiss.GpuIndexFlatIP(faiss.StandardGpuResources(), dim, config) + index.addIndex(sub_index) + + index = faiss.IndexIDMap(index) + ids = np.arange(size) + index.add_with_ids(target, ids) + + + +class TestShardedFlat(unittest.TestCase): + + def test_sharded(self): + d = 32 + nb = 1000 + nq = 200 + k = 10 + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + xq = rs.rand(nq, d).astype('float32') + + index_cpu = faiss.IndexFlatL2(d) + + assert faiss.get_num_gpus() > 1 + + co = faiss.GpuMultipleClonerOptions() + co.shard = True + index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) + + index.add(xb) + D, I = index.search(xq, k) + + index_cpu.add(xb) + D_ref, I_ref = index_cpu.search(xq, k) + + assert np.all(I == I_ref) + + del index + index2 = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) + D2, I2 = index2.search(xq, k) + + assert np.all(I2 == I_ref) + + try: + index2.add(xb) + except RuntimeError: + pass + else: + assert False, "this call should fail!" + + +class TestGPUKmeans(unittest.TestCase): + + def test_kmeans(self): + d = 32 + nb = 1000 + k = 10 + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + + km1 = faiss.Kmeans(d, k) + obj1 = km1.train(xb) + + km2 = faiss.Kmeans(d, k, gpu=True) + obj2 = km2.train(xb) + + print(obj1, obj2) + assert np.allclose(obj1, obj2) + + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index_ivfsq.py b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index_ivfsq.py new file mode 100644 index 0000000000..6c312af3e6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index_ivfsq.py @@ -0,0 +1,229 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python3 + +from __future__ import print_function +import unittest +import numpy as np +import faiss + +def make_t(num, d, clamp=False): + rs = np.random.RandomState(123) + x = rs.rand(num, d).astype('float32') + if clamp: + x = (x * 255).astype('uint8').astype('float32') + return x + +def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp): + to_train = make_t(10000, d, clamp) + + quantizer_cp = faiss.IndexFlat(d, metric) + idx_cpu = faiss.IndexIVFScalarQuantizer(quantizer_cp, d, nlist, + qtype, metric, by_residual) + + idx_cpu.train(to_train) + idx_cpu.add(to_train) + + res = faiss.StandardGpuResources() + res.noTempMemory() + idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu) + + return idx_cpu, idx_gpu + + +def make_indices_copy_from_gpu(nlist, d, qtype, by_residual, metric, clamp): + to_train = make_t(10000, d, clamp) + + res = faiss.StandardGpuResources() + res.noTempMemory() + idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, + qtype, metric, by_residual) + idx_gpu.train(to_train) + idx_gpu.add(to_train) + + quantizer_cp = faiss.IndexFlat(d, metric) + idx_cpu = faiss.IndexIVFScalarQuantizer(quantizer_cp, d, nlist, + qtype, metric, by_residual) + idx_gpu.copyTo(idx_cpu) + + return idx_cpu, idx_gpu + + +def make_indices_train(nlist, d, qtype, by_residual, metric, clamp): + to_train = make_t(10000, d, clamp) + + quantizer_cp = faiss.IndexFlat(d, metric) + idx_cpu = faiss.IndexIVFScalarQuantizer(quantizer_cp, d, nlist, + qtype, metric, by_residual) + assert(by_residual == idx_cpu.by_residual) + + idx_cpu.train(to_train) + idx_cpu.add(to_train) + + res = faiss.StandardGpuResources() + res.noTempMemory() + idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, + qtype, metric, by_residual) + assert(by_residual == idx_gpu.by_residual) + + idx_gpu.train(to_train) + idx_gpu.add(to_train) + + return idx_cpu, idx_gpu + +# +# Testing functions +# + +def summarize_results(dist, idx): + valid = [] + invalid = [] + for query in range(dist.shape[0]): + valid_sub = {} + invalid_sub = [] + + for order, (d, i) in enumerate(zip(dist[query], idx[query])): + if i == -1: + invalid_sub.append(order) + else: + valid_sub[i] = [order, d] + + valid.append(valid_sub) + invalid.append(invalid_sub) + + return valid, invalid + +def compare_results(d1, i1, d2, i2): + # Count number of index differences + idx_diffs = {} + idx_diffs_inf = 0 + idx_invalid = 0 + + valid1, invalid1 = summarize_results(d1, i1) + valid2, invalid2 = summarize_results(d2, i2) + + # Invalid results should be the same for both + # (except if we happen to hit different centroids) + for inv1, inv2 in zip(invalid1, invalid2): + if (len(inv1) != len(inv2)): + print('mismatch ', len(inv1), len(inv2), inv2[0]) + + assert(len(inv1) == len(inv2)) + idx_invalid += len(inv2) + for x1, x2 in zip(inv1, inv2): + assert(x1 == x2) + + for _, (query1, query2) in enumerate(zip(valid1, valid2)): + for idx1, order_d1 in query1.items(): + order_d2 = query2.get(idx1, None) + if order_d2: + idx_diff = order_d1[0] - order_d2[0] + + if idx_diff not in idx_diffs: + idx_diffs[idx_diff] = 1 + else: + idx_diffs[idx_diff] += 1 + else: + idx_diffs_inf += 1 + + return idx_diffs, idx_diffs_inf, idx_invalid + +def check_diffs(total_num, in_window_thresh, diffs, diff_inf, invalid): + # We require a certain fraction of results to be within +/- diff_window + # index differences + diff_window = 4 + 
in_window = 0 + + for diff in sorted(diffs): + if abs(diff) <= diff_window: + in_window += diffs[diff] / total_num + + if (in_window < in_window_thresh): + print('error {} {}'.format(in_window, in_window_thresh)) + assert(in_window >= in_window_thresh) + +def do_test_with_index(ci, gi, nprobe, k, clamp, in_window_thresh): + num_query = 11 + to_query = make_t(num_query, ci.d, clamp) + + ci.nprobe = ci.nprobe + gi.nprobe = gi.nprobe + + total_num = num_query * k + check_diffs(total_num, in_window_thresh, + *compare_results(*ci.search(to_query, k), + *gi.search(to_query, k))) + +def do_test(nlist, d, qtype, by_residual, metric, nprobe, k): + clamp = (qtype == faiss.ScalarQuantizer.QT_8bit_direct) + ci, gi = make_indices_copy_from_cpu(nlist, d, qtype, + by_residual, metric, clamp) + # A direct copy should be much more closely in agreement + # (except for fp accumulation order differences) + do_test_with_index(ci, gi, nprobe, k, clamp, 0.99) + + ci, gi = make_indices_copy_from_gpu(nlist, d, qtype, + by_residual, metric, clamp) + # A direct copy should be much more closely in agreement + # (except for fp accumulation order differences) + do_test_with_index(ci, gi, nprobe, k, clamp, 0.99) + + ci, gi = make_indices_train(nlist, d, qtype, + by_residual, metric, clamp) + # Separate training can produce a slightly different coarse quantizer + # and residuals + do_test_with_index(ci, gi, nprobe, k, clamp, 0.8) + +def do_multi_test(qtype): + nlist = 100 + nprobe = 10 + k = 50 + + for d in [11, 64]: + if (qtype != faiss.ScalarQuantizer.QT_8bit_direct): + # residual doesn't make sense here + do_test(nlist, d, qtype, True, + faiss.METRIC_L2, nprobe, k) + do_test(nlist, d, qtype, True, + faiss.METRIC_INNER_PRODUCT, nprobe, k) + do_test(nlist, d, qtype, False, faiss.METRIC_L2, nprobe, k) + do_test(nlist, d, qtype, False, faiss.METRIC_INNER_PRODUCT, nprobe, k) + +# +# Test +# + +class TestSQ(unittest.TestCase): + def test_fp16(self): + do_multi_test(faiss.ScalarQuantizer.QT_fp16) + + def test_8bit(self): + do_multi_test(faiss.ScalarQuantizer.QT_8bit) + + def test_8bit_uniform(self): + do_multi_test(faiss.ScalarQuantizer.QT_8bit_uniform) + + def test_6bit(self): + try: + do_multi_test(faiss.ScalarQuantizer.QT_6bit) + # should not reach here; QT_6bit is unimplemented + except: + print('QT_6bit exception thrown (is expected)') + else: + assert(False) + + def test_4bit(self): + do_multi_test(faiss.ScalarQuantizer.QT_4bit) + + def test_4bit_uniform(self): + do_multi_test(faiss.ScalarQuantizer.QT_4bit_uniform) + + def test_8bit_direct(self): + do_multi_test(faiss.ScalarQuantizer.QT_8bit_direct) + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py b/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py new file mode 100644 index 0000000000..3348e104b2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py @@ -0,0 +1,209 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +import numpy as np +import unittest +import faiss +import torch + +def swig_ptr_from_FloatTensor(x): + assert x.is_contiguous() + assert x.dtype == torch.float32 + return faiss.cast_integer_to_float_ptr( + x.storage().data_ptr() + x.storage_offset() * 4) + +def swig_ptr_from_LongTensor(x): + assert x.is_contiguous() + assert x.dtype == torch.int64, 'dtype=%s' % x.dtype + return faiss.cast_integer_to_long_ptr( + x.storage().data_ptr() + x.storage_offset() * 8) + + + +def search_index_pytorch(index, x, k, D=None, I=None): + """call the search function of an index with pytorch tensor I/O (CPU + and GPU supported)""" + assert x.is_contiguous() + n, d = x.size() + assert d == index.d + + if D is None: + D = torch.empty((n, k), dtype=torch.float32, device=x.device) + else: + assert D.size() == (n, k) + + if I is None: + I = torch.empty((n, k), dtype=torch.int64, device=x.device) + else: + assert I.size() == (n, k) + torch.cuda.synchronize() + xptr = swig_ptr_from_FloatTensor(x) + Iptr = swig_ptr_from_LongTensor(I) + Dptr = swig_ptr_from_FloatTensor(D) + index.search_c(n, xptr, + k, Dptr, Iptr) + torch.cuda.synchronize() + return D, I + + +def search_raw_array_pytorch(res, xb, xq, k, D=None, I=None, + metric=faiss.METRIC_L2): + assert xb.device == xq.device + + nq, d = xq.size() + if xq.is_contiguous(): + xq_row_major = True + elif xq.t().is_contiguous(): + xq = xq.t() # I initially wrote xq:t(), Lua is still haunting me :-) + xq_row_major = False + else: + raise TypeError('matrix should be row or column-major') + + xq_ptr = swig_ptr_from_FloatTensor(xq) + + nb, d2 = xb.size() + assert d2 == d + if xb.is_contiguous(): + xb_row_major = True + elif xb.t().is_contiguous(): + xb = xb.t() + xb_row_major = False + else: + raise TypeError('matrix should be row or column-major') + xb_ptr = swig_ptr_from_FloatTensor(xb) + + if D is None: + D = torch.empty(nq, k, device=xb.device, dtype=torch.float32) + else: + assert D.shape == (nq, k) + assert D.device == xb.device + + if I is None: + I = torch.empty(nq, k, device=xb.device, dtype=torch.int64) + else: + assert I.shape == (nq, k) + assert I.device == xb.device + + D_ptr = swig_ptr_from_FloatTensor(D) + I_ptr = swig_ptr_from_LongTensor(I) + + faiss.bruteForceKnn(res, metric, + xb_ptr, xb_row_major, nb, + xq_ptr, xq_row_major, nq, + d, k, D_ptr, I_ptr) + + return D, I + + +class PytorchFaissInterop(unittest.TestCase): + + def test_interop(self): + + d = 16 + nq = 5 + nb = 20 + + xq = faiss.randn(nq * d, 1234).reshape(nq, d) + xb = faiss.randn(nb * d, 1235).reshape(nb, d) + + res = faiss.StandardGpuResources() + index = faiss.GpuIndexFlatIP(res, d) + index.add(xb) + + # reference CPU result + Dref, Iref = index.search(xq, 5) + + # query is pytorch tensor (CPU) + xq_torch = torch.FloatTensor(xq) + + D2, I2 = search_index_pytorch(index, xq_torch, 5) + + assert np.all(Iref == I2.numpy()) + + # query is pytorch tensor (GPU) + xq_torch = xq_torch.cuda() + # no need for a sync here + + D3, I3 = search_index_pytorch(index, xq_torch, 5) + + # D3 and I3 are on torch tensors on GPU as well. + # this does a sync, which is useful because faiss and + # pytorch use different Cuda streams. 
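+ # A related option (illustrative sketch only; this test keeps the explicit
+ # sync) is to place faiss on the same stream as pytorch up front, as
+ # test_raw_array_search below does via:
+ #     res.setDefaultNullStreamAllDevices()
+ # which avoids cross-stream synchronization for subsequent searches.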
+ res.syncDefaultStreamCurrentDevice() + + assert np.all(Iref == I3.cpu().numpy()) + + def test_raw_array_search(self): + d = 32 + nb = 1024 + nq = 128 + k = 10 + + # make GT on Faiss CPU + + xq = faiss.randn(nq * d, 1234).reshape(nq, d) + xb = faiss.randn(nb * d, 1235).reshape(nb, d) + + index = faiss.IndexFlatL2(d) + index.add(xb) + gt_D, gt_I = index.search(xq, k) + + # resource object, can be re-used over calls + res = faiss.StandardGpuResources() + # put on same stream as pytorch to avoid synchronizing streams + res.setDefaultNullStreamAllDevices() + + for xq_row_major in True, False: + for xb_row_major in True, False: + + # move to pytorch & GPU + xq_t = torch.from_numpy(xq).cuda() + xb_t = torch.from_numpy(xb).cuda() + + if not xq_row_major: + xq_t = xq_t.t().clone().t() + assert not xq_t.is_contiguous() + + if not xb_row_major: + xb_t = xb_t.t().clone().t() + assert not xb_t.is_contiguous() + + D, I = search_raw_array_pytorch(res, xb_t, xq_t, k) + + # back to CPU for verification + D = D.cpu().numpy() + I = I.cpu().numpy() + + assert np.all(I == gt_I) + assert np.all(np.abs(D - gt_D).max() < 1e-4) + + + + # test on subset + try: + D, I = search_raw_array_pytorch(res, xb_t, xq_t[60:80], k) + except TypeError: + if not xq_row_major: + # then it is expected + continue + # otherwise it is an error + raise + + # back to CPU for verification + D = D.cpu().numpy() + I = I.cpu().numpy() + + assert np.all(I == gt_I[60:80]) + assert np.all(np.abs(D - gt_D[60:80]).max() < 1e-4) + + + + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu new file mode 100644 index 0000000000..47617fbe85 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu @@ -0,0 +1,144 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace faiss { namespace gpu { + +// warp Q to thread Q: +// 1, 1 +// 32, 2 +// 64, 3 +// 128, 3 +// 256, 4 +// 512, 8 +// 1024, 8 +// 2048, 8 + +BLOCK_SELECT_DECL(float, true, 1); +BLOCK_SELECT_DECL(float, true, 32); +BLOCK_SELECT_DECL(float, true, 64); +BLOCK_SELECT_DECL(float, true, 128); +BLOCK_SELECT_DECL(float, true, 256); +BLOCK_SELECT_DECL(float, true, 512); +BLOCK_SELECT_DECL(float, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_DECL(float, true, 2048); +#endif + +BLOCK_SELECT_DECL(float, false, 1); +BLOCK_SELECT_DECL(float, false, 32); +BLOCK_SELECT_DECL(float, false, 64); +BLOCK_SELECT_DECL(float, false, 128); +BLOCK_SELECT_DECL(float, false, 256); +BLOCK_SELECT_DECL(float, false, 512); +BLOCK_SELECT_DECL(float, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_DECL(float, false, 2048); +#endif + +void runBlockSelect(Tensor& in, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + if (dir) { + if (k == 1) { + BLOCK_SELECT_CALL(float, true, 1); + } else if (k <= 32) { + BLOCK_SELECT_CALL(float, true, 32); + } else if (k <= 64) { + BLOCK_SELECT_CALL(float, true, 64); + } else if (k <= 128) { + BLOCK_SELECT_CALL(float, true, 128); + } else if (k <= 256) { + BLOCK_SELECT_CALL(float, true, 256); + } else if (k <= 512) { + BLOCK_SELECT_CALL(float, true, 512); + } else if (k <= 1024) { + BLOCK_SELECT_CALL(float, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_CALL(float, true, 2048); +#endif + } + } else { + if (k == 1) { + BLOCK_SELECT_CALL(float, false, 1); + } else if (k <= 32) { + BLOCK_SELECT_CALL(float, false, 32); + } else if (k <= 64) { + BLOCK_SELECT_CALL(float, false, 64); + } else if (k <= 128) { + BLOCK_SELECT_CALL(float, false, 128); + } else if (k <= 256) { + BLOCK_SELECT_CALL(float, false, 256); + } else if (k <= 512) { + BLOCK_SELECT_CALL(float, false, 512); + } else if (k <= 1024) { + BLOCK_SELECT_CALL(float, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_CALL(float, false, 2048); +#endif + } + } +} + +void runBlockSelectPair(Tensor& inK, + Tensor& inV, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + if (dir) { + if (k == 1) { + BLOCK_SELECT_PAIR_CALL(float, true, 1); + } else if (k <= 32) { + BLOCK_SELECT_PAIR_CALL(float, true, 32); + } else if (k <= 64) { + BLOCK_SELECT_PAIR_CALL(float, true, 64); + } else if (k <= 128) { + BLOCK_SELECT_PAIR_CALL(float, true, 128); + } else if (k <= 256) { + BLOCK_SELECT_PAIR_CALL(float, true, 256); + } else if (k <= 512) { + BLOCK_SELECT_PAIR_CALL(float, true, 512); + } else if (k <= 1024) { + BLOCK_SELECT_PAIR_CALL(float, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_PAIR_CALL(float, true, 2048); +#endif + } + } else { + if (k == 1) { + BLOCK_SELECT_PAIR_CALL(float, false, 1); + } else if (k <= 32) { + BLOCK_SELECT_PAIR_CALL(float, false, 32); + } else if (k <= 64) { + BLOCK_SELECT_PAIR_CALL(float, false, 64); + } else if (k <= 128) { + BLOCK_SELECT_PAIR_CALL(float, false, 128); + } else if (k <= 256) { + BLOCK_SELECT_PAIR_CALL(float, false, 256); + } else if (k <= 512) { + BLOCK_SELECT_PAIR_CALL(float, false, 512); + } else if (k <= 1024) { + BLOCK_SELECT_PAIR_CALL(float, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_PAIR_CALL(float, false, 2048); +#endif + } + } +} + +} } // 
namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu new file mode 100644 index 0000000000..bc05e1485f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu @@ -0,0 +1,144 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +// warp Q to thread Q: +// 1, 1 +// 32, 2 +// 64, 3 +// 128, 3 +// 256, 4 +// 512, 8 +// 1024, 8 +// 2048, 8 + +BLOCK_SELECT_DECL(half, true, 1); +BLOCK_SELECT_DECL(half, true, 32); +BLOCK_SELECT_DECL(half, true, 64); +BLOCK_SELECT_DECL(half, true, 128); +BLOCK_SELECT_DECL(half, true, 256); +BLOCK_SELECT_DECL(half, true, 512); +BLOCK_SELECT_DECL(half, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_DECL(half, true, 2048); +#endif + +BLOCK_SELECT_DECL(half, false, 1); +BLOCK_SELECT_DECL(half, false, 32); +BLOCK_SELECT_DECL(half, false, 64); +BLOCK_SELECT_DECL(half, false, 128); +BLOCK_SELECT_DECL(half, false, 256); +BLOCK_SELECT_DECL(half, false, 512); +BLOCK_SELECT_DECL(half, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_DECL(half, false, 2048); +#endif + +void runBlockSelect(Tensor& in, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + if (dir) { + if (k == 1) { + BLOCK_SELECT_CALL(half, true, 1); + } else if (k <= 32) { + BLOCK_SELECT_CALL(half, true, 32); + } else if (k <= 64) { + BLOCK_SELECT_CALL(half, true, 64); + } else if (k <= 128) { + BLOCK_SELECT_CALL(half, true, 128); + } else if (k <= 256) { + BLOCK_SELECT_CALL(half, true, 256); + } else if (k <= 512) { + BLOCK_SELECT_CALL(half, true, 512); + } else if (k <= 1024) { + BLOCK_SELECT_CALL(half, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_CALL(half, true, 2048); +#endif + } + } else { + if (k == 1) { + BLOCK_SELECT_CALL(half, false, 1); + } else if (k <= 32) { + BLOCK_SELECT_CALL(half, false, 32); + } else if (k <= 64) { + BLOCK_SELECT_CALL(half, false, 64); + } else if (k <= 128) { + BLOCK_SELECT_CALL(half, false, 128); + } else if (k <= 256) { + BLOCK_SELECT_CALL(half, false, 256); + } else if (k <= 512) { + BLOCK_SELECT_CALL(half, false, 512); + } else if (k <= 1024) { + BLOCK_SELECT_CALL(half, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_CALL(half, false, 2048); +#endif + } + } +} + +void runBlockSelectPair(Tensor& inK, + Tensor& inV, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + + if (dir) { + if (k == 1) { + BLOCK_SELECT_PAIR_CALL(half, true, 1); + } else if (k <= 32) { + BLOCK_SELECT_PAIR_CALL(half, true, 32); + } else if (k <= 64) { + BLOCK_SELECT_PAIR_CALL(half, true, 64); + } else if (k <= 128) { + BLOCK_SELECT_PAIR_CALL(half, true, 128); + } else if (k <= 256) { + BLOCK_SELECT_PAIR_CALL(half, true, 256); + } else if (k <= 512) { + BLOCK_SELECT_PAIR_CALL(half, true, 512); + } else if (k <= 1024) { + BLOCK_SELECT_PAIR_CALL(half, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_PAIR_CALL(half, true, 2048); +#endif + } + } else { + if (k == 1) { + BLOCK_SELECT_PAIR_CALL(half, false, 1); + } else if (k <= 32) { + BLOCK_SELECT_PAIR_CALL(half, false, 32); + } else if (k <= 64) { + 
BLOCK_SELECT_PAIR_CALL(half, false, 64); + } else if (k <= 128) { + BLOCK_SELECT_PAIR_CALL(half, false, 128); + } else if (k <= 256) { + BLOCK_SELECT_PAIR_CALL(half, false, 256); + } else if (k <= 512) { + BLOCK_SELECT_PAIR_CALL(half, false, 512); + } else if (k <= 1024) { + BLOCK_SELECT_PAIR_CALL(half, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + BLOCK_SELECT_PAIR_CALL(half, false, 2048); +#endif + } + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh new file mode 100644 index 0000000000..04e76541de --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh @@ -0,0 +1,135 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { namespace gpu { + +template +__global__ void blockSelect(Tensor in, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inStart = in[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(in.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + heap.add(*inStart, (IndexType) i); + inStart += ThreadsPerBlock; + } + + // Handle last remainder fraction of a warp of elements + if (i < in.getSize(1)) { + heap.addThreadQ(*inStart, (IndexType) i); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = smemV[i]; + } +} + +template +__global__ void blockSelectPair(Tensor inK, + Tensor inV, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inKStart = inK[row][i].data(); + IndexType* inVStart = inV[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(inK.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + heap.add(*inKStart, *inVStart); + inKStart += ThreadsPerBlock; + inVStart += ThreadsPerBlock; + } + + // Handle last remainder fraction of a warp of elements + if (i < inK.getSize(1)) { + heap.addThreadQ(*inKStart, *inVStart); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = smemV[i]; + } +} + +void runBlockSelect(Tensor& in, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +void runBlockSelectPair(Tensor& inKeys, + Tensor& inIndices, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +void runBlockSelect(Tensor& in, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +void runBlockSelectPair(Tensor& inKeys, + 
Tensor& inIndices, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Comparators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Comparators.cuh new file mode 100644 index 0000000000..5abfab6af5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Comparators.cuh @@ -0,0 +1,46 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +template +struct Comparator { + __device__ static inline bool lt(T a, T b) { + return a < b; + } + + __device__ static inline bool gt(T a, T b) { + return a > b; + } +}; + +template <> +struct Comparator { + __device__ static inline bool lt(half a, half b) { +#if FAISS_USE_FULL_FLOAT16 + return __hlt(a, b); +#else + return __half2float(a) < __half2float(b); +#endif // FAISS_USE_FULL_FLOAT16 + } + + __device__ static inline bool gt(half a, half b) { +#if FAISS_USE_FULL_FLOAT16 + return __hgt(a, b); +#else + return __half2float(a) > __half2float(b); +#endif // FAISS_USE_FULL_FLOAT16 + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh new file mode 100644 index 0000000000..a53e6fc2ed --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh @@ -0,0 +1,124 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace faiss { namespace gpu { + +// +// Conversion utilities +// + +template +struct Convert { + inline __device__ To operator()(From v) const { + return (To) v; + } +}; + +template <> +struct Convert { + inline __device__ half operator()(float v) const { + return __float2half(v); + } +}; + +template <> +struct Convert { + inline __device__ float operator()(half v) const { + return __half2float(v); + } +}; + +template +struct ConvertTo { +}; + +template <> +struct ConvertTo { + static inline __device__ float to(float v) { return v; } + static inline __device__ float to(half v) { return __half2float(v); } +}; + +template <> +struct ConvertTo { + static inline __device__ float2 to(float2 v) { return v; } + static inline __device__ float2 to(half2 v) { return __half22float2(v); } +}; + +template <> +struct ConvertTo { + static inline __device__ float4 to(float4 v) { return v; } + static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); } +}; + +template <> +struct ConvertTo { + static inline __device__ half to(float v) { return __float2half(v); } + static inline __device__ half to(half v) { return v; } +}; + +template <> +struct ConvertTo { + static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); } + static inline __device__ half2 to(half2 v) { return v; } +}; + +template <> +struct ConvertTo { + static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); } + static inline __device__ Half4 to(Half4 v) { return v; } +}; + +// Tensor conversion +template +void runConvert(const From* in, + To* out, + size_t num, + cudaStream_t stream) { + thrust::transform(thrust::cuda::par.on(stream), + in, in + num, out, Convert()); +} + +template +void convertTensor(cudaStream_t stream, + Tensor& in, + Tensor& out) { + FAISS_ASSERT(in.numElements() == out.numElements()); + + runConvert(in.data(), out.data(), in.numElements(), stream); +} + +template +DeviceTensor convertTensor(GpuResources* res, + cudaStream_t stream, + Tensor& in) { + DeviceTensor out; + + if (res) { + out = std::move(DeviceTensor( + res->getMemoryManagerCurrentDevice(), + in.sizes(), + stream)); + } else { + out = std::move(DeviceTensor(in.sizes())); + } + + convertTensor(stream, in, out); + return out; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/CopyUtils.cuh b/core/src/index/thirdparty/faiss/gpu/utils/CopyUtils.cuh new file mode 100644 index 0000000000..922ca4ed0e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/CopyUtils.cuh @@ -0,0 +1,107 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +/// Ensure the memory at `p` is either on the given device, or copy it +/// to the device in a new allocation. +/// If `resources` is provided, then we will perform a temporary +/// memory allocation if needed. Otherwise, we will call cudaMalloc if +/// needed. 
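///
/// Illustrative sketch (editorial, not part of the upstream FAISS patch;
/// `res`, `stream`, `queries`, `numQueries` and `dim` are assumed to be in
/// scope, and the <float, 2> template arguments are reconstructed from the
/// declaration below): staging a host query buffer onto the current device.
///
///   // given float* queries (numQueries x dim) resident on the host:
///   auto queriesDev = toDevice<float, 2>(res, getCurrentDevice(),
///                                        queries, stream,
///                                        {numQueries, dim});
///   // If `queries` already lives on the destination device, queriesDev
///   // merely wraps it; otherwise a device copy is allocated and filled.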
+template +DeviceTensor toDevice(GpuResources* resources, + int dstDevice, + T* src, + cudaStream_t stream, + std::initializer_list sizes) { + int dev = getDeviceForAddress(src); + + if (dev == dstDevice) { + // On device we expect + return DeviceTensor(src, sizes); + } else { + // On different device or on host + DeviceScope scope(dstDevice); + + Tensor oldT(src, sizes); + + if (resources) { + DeviceTensor newT(resources->getMemoryManager(dstDevice), + sizes, + stream); + + newT.copyFrom(oldT, stream); + return newT; + } else { + DeviceTensor newT(sizes); + + newT.copyFrom(oldT, stream); + return newT; + } + } +} + +/// Copies data to the CPU, if it is not already on the CPU +template +HostTensor toHost(T* src, + cudaStream_t stream, + std::initializer_list sizes) { + int dev = getDeviceForAddress(src); + + if (dev == -1) { + // Already on the CPU, just wrap in a HostTensor that doesn't own this + // memory + return HostTensor(src, sizes); + } else { + HostTensor out(sizes); + Tensor devData(src, sizes); + out.copyFrom(devData, stream); + + return out; + } +} + +/// Copies a device array's allocation to an address, if necessary +template +inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) { + // It is possible that the array already represents memory at `p`, + // in which case no copy is needed + if (src == dst) { + return; + } + + int dev = getDeviceForAddress(dst); + + if (dev == -1) { + CUDA_VERIFY(cudaMemcpyAsync(dst, + src, + num * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + } else { + CUDA_VERIFY(cudaMemcpyAsync(dst, + src, + num * sizeof(T), + cudaMemcpyDeviceToDevice, + stream)); + } +} + +/// Copies a device array's allocation to an address, if necessary +template +void fromDevice(Tensor& src, T* dst, cudaStream_t stream) { + FAISS_ASSERT(src.isContiguous()); + fromDevice(src.data(), dst, src.numElements(), stream); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceDefs.cuh b/core/src/index/thirdparty/faiss/gpu/utils/DeviceDefs.cuh new file mode 100644 index 0000000000..89d3dda289 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceDefs.cuh @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ <= 750 +constexpr int kWarpSize = 32; +#else +#error Unknown __CUDA_ARCH__; please define parameters for compute capability +#endif // __CUDA_ARCH__ types +#endif // __CUDA_ARCH__ + +#ifndef __CUDA_ARCH__ +// dummy value for host compiler +constexpr int kWarpSize = 32; +#endif // !__CUDA_ARCH__ + +// This is a memory barrier for intra-warp writes to shared memory. +__forceinline__ __device__ void warpFence() { + +#if CUDA_VERSION >= 9000 + __syncwarp(); +#else + // For the time being, assume synchronicity. 
+ // __threadfence_block(); +#endif +} + +#if CUDA_VERSION > 9000 +// Based on the CUDA version (we assume what version of nvcc/ptxas we were +// compiled with), the register allocation algorithm is much better, so only +// enable the 2048 selection code if we are above 9.0 (9.2 seems to be ok) +#define GPU_MAX_SELECTION_K 2048 +#else +#define GPU_MAX_SELECTION_K 1024 +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.cpp b/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.cpp new file mode 100644 index 0000000000..df00892e3b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.cpp @@ -0,0 +1,77 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include + +namespace faiss { namespace gpu { + +DeviceMemoryReservation::DeviceMemoryReservation() + : state_(NULL), + device_(0), + data_(NULL), + size_(0), + stream_(0) { +} + +DeviceMemoryReservation::DeviceMemoryReservation(DeviceMemory* state, + int device, + void* p, + size_t size, + cudaStream_t stream) + : state_(state), + device_(device), + data_(p), + size_(size), + stream_(stream) { +} + +DeviceMemoryReservation::DeviceMemoryReservation( + DeviceMemoryReservation&& m) noexcept { + + state_ = m.state_; + device_ = m.device_; + data_ = m.data_; + size_ = m.size_; + stream_ = m.stream_; + + m.data_ = NULL; +} + +DeviceMemoryReservation::~DeviceMemoryReservation() { + if (data_) { + FAISS_ASSERT(state_); + state_->returnAllocation(*this); + } + + data_ = NULL; +} + +DeviceMemoryReservation& +DeviceMemoryReservation::operator=(DeviceMemoryReservation&& m) { + if (data_) { + FAISS_ASSERT(state_); + state_->returnAllocation(*this); + } + + state_ = m.state_; + device_ = m.device_; + data_ = m.data_; + size_ = m.size_; + stream_ = m.stream_; + + m.data_ = NULL; + + return *this; +} + +DeviceMemory::~DeviceMemory() { +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.h b/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.h new file mode 100644 index 0000000000..1bffdc00ac --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceMemory.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class DeviceMemory; + +class DeviceMemoryReservation { + public: + DeviceMemoryReservation(); + DeviceMemoryReservation(DeviceMemory* state, + int device, void* p, size_t size, + cudaStream_t stream); + DeviceMemoryReservation(DeviceMemoryReservation&& m) noexcept; + ~DeviceMemoryReservation(); + + DeviceMemoryReservation& operator=(DeviceMemoryReservation&& m); + + int device() { return device_; } + void* get() { return data_; } + size_t size() { return size_; } + cudaStream_t stream() { return stream_; } + + private: + DeviceMemory* state_; + + int device_; + void* data_; + size_t size_; + cudaStream_t stream_; +}; + +/// Manages temporary memory allocations on a GPU device +class DeviceMemory { + public: + virtual ~DeviceMemory(); + + /// Returns the device we are managing memory for + virtual int getDevice() const = 0; + + /// Obtains a temporary memory allocation for our device, + /// whose usage is ordered with respect to the given stream. + virtual DeviceMemoryReservation getMemory(cudaStream_t stream, + size_t size) = 0; + + /// Returns the current size available without calling cudaMalloc + virtual size_t getSizeAvailable() const = 0; + + /// Returns a string containing our current memory manager state + virtual std::string toString() const = 0; + + /// Returns the high-water mark of cudaMalloc allocations for our + /// device + virtual size_t getHighWaterCudaMalloc() const = 0; + + protected: + friend class DeviceMemoryReservation; + virtual void returnAllocation(DeviceMemoryReservation& m) = 0; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor-inl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor-inl.cuh new file mode 100644 index 0000000000..cff5452989 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor-inl.cuh @@ -0,0 +1,228 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include // std::move + +namespace faiss { namespace gpu { + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor() : + Tensor(), + state_(AllocState::NotOwner), + space_(MemorySpace::Device) { +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DeviceTensor&& t) : + Tensor(), + state_(AllocState::NotOwner), + space_(MemorySpace::Device) { + this->operator=(std::move(t)); +} + +template class PtrTraits> +__host__ +DeviceTensor& +DeviceTensor::operator=( + DeviceTensor&& t) { + if (this->state_ == AllocState::Owner) { + CUDA_VERIFY(cudaFree(this->data_)); + } + + this->Tensor::operator=( + std::move(t)); + + this->state_ = t.state_; t.state_ = AllocState::NotOwner; + this->space_ = t.space_; + this->reservation_ = std::move(t.reservation_); + + return *this; +} + +template class PtrTraits> +__host__ +DeviceTensor::~DeviceTensor() { + if (state_ == AllocState::Owner) { + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); + CUDA_VERIFY(cudaFree(this->data_)); + this->data_ = nullptr; + } + + // Otherwise, if we have a temporary memory reservation, then its + // destructor will return the reservation +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + const IndexT sizes[Dim], + MemorySpace space) : + Tensor(nullptr, sizes), + state_(AllocState::Owner), + space_(space) { + + allocMemorySpace(space, &this->data_, this->getSizeInBytes()); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + std::initializer_list sizes, + MemorySpace space) : + Tensor(nullptr, sizes), + state_(AllocState::Owner), + space_(space) { + + allocMemorySpace(space, &this->data_, this->getSizeInBytes()); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); +} + +// memory reservation constructor +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DeviceMemory& m, + const IndexT sizes[Dim], + cudaStream_t stream, + MemorySpace space) : + Tensor(nullptr, sizes), + state_(AllocState::Reservation), + space_(space) { + + // FIXME: add MemorySpace to DeviceMemory + auto memory = m.getMemory(stream, this->getSizeInBytes()); + + this->data_ = (T*) memory.get(); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); + reservation_ = std::move(memory); +} + +// memory reservation constructor +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DeviceMemory& m, + std::initializer_list sizes, + cudaStream_t stream, + MemorySpace space) : + Tensor(nullptr, sizes), + state_(AllocState::Reservation), + space_(space) { + + // FIXME: add MemorySpace to DeviceMemory + auto memory = m.getMemory(stream, this->getSizeInBytes()); + + this->data_ = (T*) memory.get(); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); + reservation_ = std::move(memory); +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DataPtrType data, + const IndexT sizes[Dim], + MemorySpace space) : + Tensor(data, sizes), + state_(AllocState::NotOwner), + space_(space) { +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DataPtrType data, + std::initializer_list sizes, + MemorySpace space) : + Tensor(data, sizes), + state_(AllocState::NotOwner), + space_(space) { +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DataPtrType data, + const IndexT sizes[Dim], + const IndexT strides[Dim], + MemorySpace space) : + Tensor(data, sizes, strides), + state_(AllocState::NotOwner), + space_(space) { +} + 
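// Illustrative sketch (editorial, not part of the upstream FAISS patch;
// `stream`, `n` and `k` are assumed, and the <float, 2, true> arguments
// fill in the template parameters elided above): an owning, locally
// allocated 2-D buffer, zeroed before a kernel writes into it.
//
//   DeviceTensor<float, 2, true> distances({n, k});
//   distances.zero(stream);
//   // state_ is AllocState::Owner, so the backing allocation is released
//   // via cudaFree in ~DeviceTensor().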
+template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + Tensor& t, + cudaStream_t stream, + MemorySpace space) : + Tensor(nullptr, t.sizes(), t.strides()), + state_(AllocState::Owner), + space_(space) { + + allocMemorySpace(space_, &this->data_, this->getSizeInBytes()); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); + this->copyFrom(t, stream); +} + +template class PtrTraits> +__host__ +DeviceTensor::DeviceTensor( + DeviceMemory& m, + Tensor& t, + cudaStream_t stream, + MemorySpace space) : + Tensor(nullptr, t.sizes(), t.strides()), + state_(AllocState::Reservation), + space_(space) { + + // FIXME: add MemorySpace to DeviceMemory + auto memory = m.getMemory(stream, this->getSizeInBytes()); + + this->data_ = (T*) memory.get(); + FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); + reservation_ = std::move(memory); + + this->copyFrom(t, stream); +} + +template class PtrTraits> +__host__ DeviceTensor& +DeviceTensor::zero( + cudaStream_t stream) { + if (this->data_) { + // Region must be contiguous + FAISS_ASSERT(this->isContiguous()); + + CUDA_VERIFY(cudaMemsetAsync( + this->data_, 0, this->getSizeInBytes(), stream)); + } + + return *this; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor.cuh b/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor.cuh new file mode 100644 index 0000000000..78039969c5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceTensor.cuh @@ -0,0 +1,113 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +template class PtrTraits = traits::DefaultPtrTraits> +class DeviceTensor : public Tensor { + public: + typedef IndexT IndexType; + typedef typename PtrTraits::PtrType DataPtrType; + + /// Default constructor + __host__ DeviceTensor(); + + /// Destructor + __host__ ~DeviceTensor(); + + /// Move constructor + __host__ DeviceTensor(DeviceTensor&& t); + + /// Move assignment + __host__ DeviceTensor& + operator=(DeviceTensor&& t); + + /// Constructs a tensor of the given size, allocating memory for it + /// locally + __host__ DeviceTensor(const IndexT sizes[Dim], + MemorySpace space = MemorySpace::Device); + __host__ DeviceTensor(std::initializer_list sizes, + MemorySpace space = MemorySpace::Device); + + /// Constructs a tensor of the given size, reserving a temporary + /// memory reservation via a memory manager. + /// The memory reservation should be ordered with respect to the + /// given stream. 
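  ///
  /// Sketch (editorial; assumes a GpuResources* `res`, a `stream`, and sizes
  /// `n`, `k` in scope): temporary scratch space drawn from the device's
  /// memory manager instead of a fresh cudaMalloc, handed back automatically
  /// when the reservation-backed tensor is destroyed.
  ///
  ///   DeviceTensor<float, 2, true> scratch(
  ///       res->getMemoryManagerCurrentDevice(), {n, k}, stream);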
+ __host__ DeviceTensor(DeviceMemory& m, + const IndexT sizes[Dim], + cudaStream_t stream, + MemorySpace space = MemorySpace::Device); + __host__ DeviceTensor(DeviceMemory& m, + std::initializer_list sizes, + cudaStream_t stream, + MemorySpace space = MemorySpace::Device); + + /// Constructs a tensor of the given size and stride, referencing a + /// memory region we do not own + __host__ DeviceTensor(DataPtrType data, + const IndexT sizes[Dim], + MemorySpace space = MemorySpace::Device); + __host__ DeviceTensor(DataPtrType data, + std::initializer_list sizes, + MemorySpace space = MemorySpace::Device); + + /// Constructs a tensor of the given size and stride, referencing a + /// memory region we do not own + __host__ DeviceTensor(DataPtrType data, + const IndexT sizes[Dim], + const IndexT strides[Dim], + MemorySpace space = MemorySpace::Device); + + /// Copies a tensor into ourselves, allocating memory for it locally + __host__ DeviceTensor(Tensor& t, + cudaStream_t stream, + MemorySpace space = MemorySpace::Device); + + /// Copies a tensor into ourselves, reserving a temporary + /// memory reservation via a memory manager. + __host__ DeviceTensor(DeviceMemory& m, + Tensor& t, + cudaStream_t stream, + MemorySpace space = MemorySpace::Device); + + /// Call to zero out memory + __host__ DeviceTensor& + zero(cudaStream_t stream); + + private: + enum AllocState { + /// This tensor itself owns the memory, which must be freed via + /// cudaFree + Owner, + + /// This tensor itself is not an owner of the memory; there is + /// nothing to free + NotOwner, + + /// This tensor has the memory via a temporary memory reservation + Reservation + }; + + AllocState state_; + MemorySpace space_; + DeviceMemoryReservation reservation_; +}; + +} } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu new file mode 100644 index 0000000000..5d8254a09b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu @@ -0,0 +1,195 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +int getCurrentDevice() { + int dev = -1; + CUDA_VERIFY(cudaGetDevice(&dev)); + FAISS_ASSERT(dev != -1); + + return dev; +} + +void setCurrentDevice(int device) { + CUDA_VERIFY(cudaSetDevice(device)); +} + +int getNumDevices() { + int numDev = -1; + cudaError_t err = cudaGetDeviceCount(&numDev); + if (cudaErrorNoDevice == err) { + numDev = 0; + } else { + CUDA_VERIFY(err); + } + FAISS_ASSERT(numDev != -1); + + return numDev; +} + +void profilerStart() { + CUDA_VERIFY(cudaProfilerStart()); +} + +void profilerStop() { + CUDA_VERIFY(cudaProfilerStop()); +} + +void synchronizeAllDevices() { + for (int i = 0; i < getNumDevices(); ++i) { + DeviceScope scope(i); + + CUDA_VERIFY(cudaDeviceSynchronize()); + } +} + +const cudaDeviceProp& getDeviceProperties(int device) { + static std::mutex mutex; + static std::unordered_map properties; + + std::lock_guard guard(mutex); + + auto it = properties.find(device); + if (it == properties.end()) { + cudaDeviceProp prop; + CUDA_VERIFY(cudaGetDeviceProperties(&prop, device)); + + properties[device] = prop; + it = properties.find(device); + } + + return it->second; +} + +const cudaDeviceProp& getCurrentDeviceProperties() { + return getDeviceProperties(getCurrentDevice()); +} + +int getMaxThreads(int device) { + return getDeviceProperties(device).maxThreadsPerBlock; +} + +int getMaxThreadsCurrentDevice() { + return getMaxThreads(getCurrentDevice()); +} + +size_t getMaxSharedMemPerBlock(int device) { + return getDeviceProperties(device).sharedMemPerBlock; +} + +size_t getMaxSharedMemPerBlockCurrentDevice() { + return getMaxSharedMemPerBlock(getCurrentDevice()); +} + +int getDeviceForAddress(const void* p) { + if (!p) { + return -1; + } + + cudaPointerAttributes att; + cudaError_t err = cudaPointerGetAttributes(&att, p); + FAISS_ASSERT(err == cudaSuccess || + err == cudaErrorInvalidValue); + + if (err == cudaErrorInvalidValue) { + // Make sure the current thread error status has been reset + err = cudaGetLastError(); + FAISS_ASSERT(err == cudaErrorInvalidValue); + return -1; + } else if (att.memoryType == cudaMemoryTypeHost) { + return -1; + } else { + return att.device; + } +} + +bool getFullUnifiedMemSupport(int device) { + const auto& prop = getDeviceProperties(device); + return (prop.major >= 6); +} + +bool getFullUnifiedMemSupportCurrentDevice() { + return getFullUnifiedMemSupport(getCurrentDevice()); +} + +int getMaxKSelection() { + // Don't use the device at the moment, just base this based on the CUDA SDK + // that we were compiled with + return GPU_MAX_SELECTION_K; +} + +DeviceScope::DeviceScope(int device) { + prevDevice_ = getCurrentDevice(); + + if (prevDevice_ != device) { + setCurrentDevice(device); + } else { + prevDevice_ = -1; + } +} + +DeviceScope::~DeviceScope() { + if (prevDevice_ != -1) { + setCurrentDevice(prevDevice_); + } +} + +CublasHandleScope::CublasHandleScope() { + auto blasStatus = cublasCreate(&blasHandle_); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); +} + +CublasHandleScope::~CublasHandleScope() { + auto blasStatus = cublasDestroy(blasHandle_); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); +} + +CudaEvent::CudaEvent(cudaStream_t stream) + : event_(0) { + CUDA_VERIFY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + CUDA_VERIFY(cudaEventRecord(event_, stream)); +} + +CudaEvent::CudaEvent(CudaEvent&& event) noexcept + : event_(std::move(event.event_)) { + event.event_ = 0; +} + 
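// Illustrative sketch (editorial, not part of the upstream FAISS patch;
// `producerStream` and `consumerStream` are assumed): ordering two streams
// with a CudaEvent without stalling the host.
//
//   CudaEvent produced(producerStream);          // records on construction
//   produced.streamWaitOnEvent(consumerStream);  // consumer waits on the GPU
//   // produced.cpuWaitOnEvent();                // or block the host instead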
+CudaEvent::~CudaEvent() { + if (event_) { + CUDA_VERIFY(cudaEventDestroy(event_)); + } +} + +CudaEvent& +CudaEvent::operator=(CudaEvent&& event) noexcept { + event_ = std::move(event.event_); + event.event_ = 0; + + return *this; +} + +void +CudaEvent::streamWaitOnEvent(cudaStream_t stream) { + CUDA_VERIFY(cudaStreamWaitEvent(stream, event_, 0)); +} + +void +CudaEvent::cpuWaitOnEvent() { + CUDA_VERIFY(cudaEventSynchronize(event_)); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h new file mode 100644 index 0000000000..02fccfc6bb --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h @@ -0,0 +1,185 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Returns the current thread-local GPU device +int getCurrentDevice(); + +/// Sets the current thread-local GPU device +void setCurrentDevice(int device); + +/// Returns the number of available GPU devices +int getNumDevices(); + +/// Starts the CUDA profiler (exposed via SWIG) +void profilerStart(); + +/// Stops the CUDA profiler (exposed via SWIG) +void profilerStop(); + +/// Synchronizes the CPU against all devices (equivalent to +/// cudaDeviceSynchronize for each device) +void synchronizeAllDevices(); + +/// Returns a cached cudaDeviceProp for the given device +const cudaDeviceProp& getDeviceProperties(int device); + +/// Returns the cached cudaDeviceProp for the current device +const cudaDeviceProp& getCurrentDeviceProperties(); + +/// Returns the maximum number of threads available for the given GPU +/// device +int getMaxThreads(int device); + +/// Equivalent to getMaxThreads(getCurrentDevice()) +int getMaxThreadsCurrentDevice(); + +/// Returns the maximum smem available for the given GPU device +size_t getMaxSharedMemPerBlock(int device); + +/// Equivalent to getMaxSharedMemPerBlock(getCurrentDevice()) +size_t getMaxSharedMemPerBlockCurrentDevice(); + +/// For a given pointer, returns whether or not it is located on +/// a device (deviceId >= 0) or the host (-1). +int getDeviceForAddress(const void* p); + +/// Does the given device support full unified memory sharing host +/// memory? +bool getFullUnifiedMemSupport(int device); + +/// Equivalent to getFullUnifiedMemSupport(getCurrentDevice()) +bool getFullUnifiedMemSupportCurrentDevice(); + +/// Returns the maximum k-selection value supported based on the CUDA SDK that +/// we were compiled with. 
.cu files can use DeviceDefs.cuh, but this is for +/// non-CUDA files +int getMaxKSelection(); + +/// RAII object to set the current device, and restore the previous +/// device upon destruction +class DeviceScope { + public: + explicit DeviceScope(int device); + ~DeviceScope(); + + private: + int prevDevice_; +}; + +/// RAII object to manage a cublasHandle_t +class CublasHandleScope { + public: + CublasHandleScope(); + ~CublasHandleScope(); + + cublasHandle_t get() { return blasHandle_; } + + private: + cublasHandle_t blasHandle_; +}; + +// RAII object to manage a cudaEvent_t +class CudaEvent { + public: + /// Creates an event and records it in this stream + explicit CudaEvent(cudaStream_t stream); + CudaEvent(const CudaEvent& event) = delete; + CudaEvent(CudaEvent&& event) noexcept; + ~CudaEvent(); + + inline cudaEvent_t get() { return event_; } + + /// Wait on this event in this stream + void streamWaitOnEvent(cudaStream_t stream); + + /// Have the CPU wait for the completion of this event + void cpuWaitOnEvent(); + + CudaEvent& operator=(CudaEvent&& event) noexcept; + CudaEvent& operator=(CudaEvent& event) = delete; + + private: + cudaEvent_t event_; +}; + +/// Wrapper to test return status of CUDA functions +#define CUDA_VERIFY(X) \ + do { \ + auto err__ = (X); \ + FAISS_ASSERT_FMT(err__ == cudaSuccess, "CUDA error %d %s", \ + (int) err__, cudaGetErrorString(err__)); \ + } while (0) + +/// Wrapper to synchronously probe for CUDA errors +// #define FAISS_GPU_SYNC_ERROR 1 + +#ifdef FAISS_GPU_SYNC_ERROR +#define CUDA_TEST_ERROR() \ + do { \ + CUDA_VERIFY(cudaDeviceSynchronize()); \ + } while (0) +#else +#define CUDA_TEST_ERROR() \ + do { \ + CUDA_VERIFY(cudaGetLastError()); \ + } while (0) +#endif + +/// Call for a collection of streams to wait on +template +void streamWaitBase(const L1& listWaiting, const L2& listWaitOn) { + // For all the streams we are waiting on, create an event + std::vector events; + for (auto& stream : listWaitOn) { + cudaEvent_t event; + CUDA_VERIFY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + CUDA_VERIFY(cudaEventRecord(event, stream)); + events.push_back(event); + } + + // For all the streams that are waiting, issue a wait + for (auto& stream : listWaiting) { + for (auto& event : events) { + CUDA_VERIFY(cudaStreamWaitEvent(stream, event, 0)); + } + } + + for (auto& event : events) { + CUDA_VERIFY(cudaEventDestroy(event)); + } +} + +/// These versions allow usage of initializer_list as arguments, since +/// otherwise {...} doesn't have a type +template +void streamWait(const L1& a, + const std::initializer_list& b) { + streamWaitBase(a, b); +} + +template +void streamWait(const std::initializer_list& a, + const L2& b) { + streamWaitBase(a, b); +} + +inline void streamWait(const std::initializer_list& a, + const std::initializer_list& b) { + streamWaitBase(a, b); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh b/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh new file mode 100644 index 0000000000..041db76510 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh @@ -0,0 +1,190 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// A simple version of thrust::device_vector, but has more control +/// over whether resize() initializes new space with T() (which we +/// don't want), and control on how much the reserved space grows by +/// upon resize/reserve. It is also meant for POD types only. +template +class DeviceVector { + public: + DeviceVector(MemorySpace space = MemorySpace::Device) + : data_(nullptr), + num_(0), + capacity_(0), + owner(true), + space_(space) { + } + + ~DeviceVector() { + clear(); + } + + void reset(T* data, size_t num, size_t capacity, MemorySpace space = MemorySpace::Device) { + FAISS_ASSERT(data != nullptr); + FAISS_ASSERT(capacity >= num); + clear(); + owner = false; + data_ = data; + num_ = num; + capacity_ = capacity_; + } + // Clear all allocated memory; reset to zero size + void clear() { + if(owner) { + freeMemorySpace(space_, data_); + } + data_ = nullptr; + num_ = 0; + capacity_ = 0; + owner = true; + } + + size_t size() const { return num_; } + size_t capacity() const { return capacity_; } + T* data() { return data_; } + const T* data() const { return data_; } + + template + std::vector copyToHost(cudaStream_t stream) const { + FAISS_ASSERT(num_ * sizeof(T) % sizeof(OutT) == 0); + + std::vector out((num_ * sizeof(T)) / sizeof(OutT)); + CUDA_VERIFY(cudaMemcpyAsync(out.data(), data_, num_ * sizeof(T), + cudaMemcpyDeviceToHost, stream)); + + return out; + } + + // Returns true if we actually reallocated memory + // If `reserveExact` is true, then we reserve only the memory that + // we need for what we're appending + bool append(const T* d, + size_t n, + cudaStream_t stream, + bool reserveExact = false) { + bool mem = false; + + if (n > 0) { + size_t reserveSize = num_ + n; + if (!reserveExact) { + reserveSize = getNewCapacity_(reserveSize); + } + + mem = reserve(reserveSize, stream); + + int dev = getDeviceForAddress(d); + if (dev == -1) { + CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T), + cudaMemcpyHostToDevice, stream)); + } else { + CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T), + cudaMemcpyDeviceToDevice, stream)); + } + num_ += n; + } + + return mem; + } + + // Returns true if we actually reallocated memory + bool resize(size_t newSize, cudaStream_t stream) { + bool mem = false; + + if (num_ < newSize) { + mem = reserve(getNewCapacity_(newSize), stream); + } + + // Don't bother zero initializing the newly accessible memory + // (unlike thrust::device_vector) + num_ = newSize; + + return mem; + } + + // Clean up after oversized allocations, while leaving some space to + // remain for subsequent allocations (if `exact` false) or to + // exactly the space we need (if `exact` true); returns space + // reclaimed in bytes + size_t reclaim(bool exact, cudaStream_t stream) { + size_t free = capacity_ - num_; + + if (exact) { + realloc_(num_, stream); + return free * sizeof(T); + } + + // If more than 1/4th of the space is free, then we want to + // truncate to only having 1/8th of the space free; this still + // preserves some space for new elements, but won't force us to + // double our size right away + if (free > (capacity_ / 4)) { + size_t newFree = capacity_ / 8; + size_t newCapacity = num_ + newFree; + + size_t oldCapacity = capacity_; + FAISS_ASSERT(newCapacity < oldCapacity); + + realloc_(newCapacity, stream); + + return (oldCapacity - newCapacity) * sizeof(T); + } + + return 0; + } + + // Returns true if we actually 
reallocated memory + bool reserve(size_t newCapacity, cudaStream_t stream) { + if (newCapacity <= capacity_) { + return false; + } + + // Otherwise, we need new space. + realloc_(newCapacity, stream); + return true; + } + + private: + void realloc_(size_t newCapacity, cudaStream_t stream) { + FAISS_ASSERT(num_ <= newCapacity); + FAISS_ASSERT_MSG(owner, "Cannot realloc due to no ownership of mem"); + + T* newData = nullptr; + allocMemorySpace(space_, &newData, newCapacity * sizeof(T)); + CUDA_VERIFY(cudaMemcpyAsync(newData, data_, num_ * sizeof(T), + cudaMemcpyDeviceToDevice, stream)); + freeMemorySpace(space_, data_); + + data_ = newData; + capacity_ = newCapacity; + } + + size_t getNewCapacity_(size_t preferredSize) { + return utils::nextHighestPowerOf2(preferredSize); + } + + T* data_; + size_t num_; + size_t capacity_; + MemorySpace space_; + bool owner = true; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu new file mode 100644 index 0000000000..bcfa5a7ed0 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu @@ -0,0 +1,35 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +bool getDeviceSupportsFloat16Math(int device) { + const auto& prop = getDeviceProperties(device); + + return (prop.major >= 6 || + (prop.major == 5 && prop.minor >= 3)); +} + +__half hostFloat2Half(float a) { +#if CUDA_VERSION >= 9000 + __half_raw raw; + raw.x = cpu_float2half_rn(a).x; + return __half(raw); +#else + __half h; + h.x = cpu_float2half_rn(a).x; + return h; +#endif +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh new file mode 100644 index 0000000000..4954f27b64 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh @@ -0,0 +1,75 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include + +// We require at least CUDA 7.5 for compilation +#if CUDA_VERSION < 7050 +#error "CUDA >= 7.5 is required" +#endif + +// Some compute capabilities have full float16 ALUs. 
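// (Compute capability 5.3 and above: the same boundary that
// getDeviceSupportsFloat16Math() in Float16.cu checks at runtime via
// `prop.major >= 6 || (prop.major == 5 && prop.minor >= 3)`, and the reason
// the FAISS_USE_FULL_FLOAT16 macro below is gated on __CUDA_ARCH__ >= 530.)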
+#if __CUDA_ARCH__ >= 530 +#define FAISS_USE_FULL_FLOAT16 1 +#endif // __CUDA_ARCH__ types + +#include + +namespace faiss { namespace gpu { + +// 64 bytes containing 4 half (float16) values +struct Half4 { + half2 a; + half2 b; +}; + +inline __device__ float4 half4ToFloat4(Half4 v) { + float2 a = __half22float2(v.a); + float2 b = __half22float2(v.b); + + float4 out; + out.x = a.x; + out.y = a.y; + out.z = b.x; + out.w = b.y; + + return out; +} + +inline __device__ Half4 float4ToHalf4(float4 v) { + float2 a; + a.x = v.x; + a.y = v.y; + + float2 b; + b.x = v.z; + b.y = v.w; + + Half4 out; + out.a = __float22half2_rn(a); + out.b = __float22half2_rn(b); + + return out; +} + +// 128 bytes containing 8 half (float16) values +struct Half8 { + Half4 a; + Half4 b; +}; + +/// Returns true if the given device supports native float16 math +bool getDeviceSupportsFloat16Math(int device); + +__half hostFloat2Half(float v); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/HostTensor-inl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/HostTensor-inl.cuh new file mode 100644 index 0000000000..37149fc936 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/HostTensor-inl.cuh @@ -0,0 +1,180 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +namespace faiss { namespace gpu { + +template class PtrTraits> +__host__ +HostTensor::HostTensor() : + Tensor(), + state_(AllocState::NotOwner) { +} + +template class PtrTraits> +__host__ +HostTensor::~HostTensor() { + if (state_ == AllocState::Owner) { + FAISS_ASSERT(this->data_ != nullptr); + delete[] this->data_; + this->data_ = nullptr; + } +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + HostTensor&& t) : + Tensor(), + state_(AllocState::NotOwner) { + this->operator=(std::move(t)); +} + +template class PtrTraits> +__host__ +HostTensor& +HostTensor::operator=( + HostTensor&& t) { + if (this->state_ == AllocState::Owner) { + FAISS_ASSERT(this->data_ != nullptr); + delete[] this->data_; + this->data_ = nullptr; + } + + this->Tensor::operator=( + std::move(t)); + + this->state_ = t.state_; t.state_ = AllocState::NotOwner; + + return *this; +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + const IndexT sizes[Dim]) : + Tensor(nullptr, sizes), + state_(AllocState::Owner) { + + this->data_ = new T[this->numElements()]; + FAISS_ASSERT(this->data_ != nullptr); +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + std::initializer_list sizes) : + Tensor(nullptr, sizes), + state_(AllocState::Owner) { + this->data_ = new T[this->numElements()]; + FAISS_ASSERT(this->data_ != nullptr); +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + DataPtrType data, + const IndexT sizes[Dim]) : + Tensor(data, sizes), + state_(AllocState::NotOwner) { +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + DataPtrType data, + std::initializer_list sizes) : + Tensor(data, sizes), + state_(AllocState::NotOwner) { +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + DataPtrType data, + const IndexT sizes[Dim], + const IndexT strides[Dim]) : + Tensor(data, sizes, strides), + state_(AllocState::NotOwner) { +} + +template class PtrTraits> +__host__ +HostTensor::HostTensor( + Tensor& t, + cudaStream_t stream) : + Tensor(nullptr, t.sizes(), t.strides()), + state_(AllocState::Owner) { + // Only contiguous arrays handled for 
now + FAISS_ASSERT(t.isContiguous()); + + this->data_ = new T[t.numElements()]; + this->copyFrom(t, stream); +} + +/// Call to zero out memory +template class PtrTraits> +__host__ HostTensor& +HostTensor::zero() { + // Region must be contiguous + FAISS_ASSERT(this->isContiguous()); + + if (this->data_ != nullptr) { + memset(this->data_, 0, this->getSizeInBytes()); + } + + return *this; +} + +template class PtrTraits> +__host__ T +HostTensor::maxDiff( + const HostTensor& t) const { + auto size = this->numElements(); + + FAISS_ASSERT(size == t.numElements()); + FAISS_ASSERT(size > 0); + + if (InnerContig) { + auto a = this->data(); + auto b = t.data(); + + T maxDiff = a[0] - b[0]; + // FIXME: type-specific abs() + maxDiff = maxDiff < 0 ? maxDiff * (T) -1 : maxDiff; + + for (IndexT i = 1; i < size; ++i) { + auto diff = a[i] - b[i]; + // FIXME: type-specific abs + diff = diff < 0 ? diff * (T) -1 : diff; + if (diff > maxDiff) { + maxDiff = diff; + } + } + + return maxDiff; + } else { + // non-contiguous + // FIXME + FAISS_ASSERT(false); + return (T) 0; + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/HostTensor.cuh b/core/src/index/thirdparty/faiss/gpu/utils/HostTensor.cuh new file mode 100644 index 0000000000..5b8758a8ce --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/HostTensor.cuh @@ -0,0 +1,91 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +template class PtrTraits = traits::DefaultPtrTraits> +class HostTensor : public Tensor { + public: + typedef IndexT IndexType; + typedef typename PtrTraits::PtrType DataPtrType; + + /// Default constructor + __host__ HostTensor(); + + /// Destructor + __host__ ~HostTensor(); + + /// Move constructor + __host__ HostTensor(HostTensor&& t); + + /// Move assignment + __host__ HostTensor& + operator=(HostTensor&& t); + + /// Constructs a tensor of the given size, allocating memory for it + /// locally + __host__ HostTensor(const IndexT sizes[Dim]); + __host__ HostTensor(std::initializer_list sizes); + + /// Constructs a tensor of the given size and stride, referencing a + /// memory region we do not own + __host__ HostTensor(DataPtrType data, + const IndexT sizes[Dim]); + __host__ HostTensor(DataPtrType data, + std::initializer_list sizes); + + /// Constructs a tensor of the given size and stride, referencing a + /// memory region we do not own + __host__ HostTensor(DataPtrType data, + const IndexT sizes[Dim], + const IndexT strides[Dim]); + + /// Copies a tensor into ourselves, allocating memory for it + /// locally. If the tensor is on the GPU, then we will copy it to + /// ourselves wrt the given stream. + __host__ HostTensor(Tensor& t, + cudaStream_t stream); + + /// Call to zero out memory + __host__ HostTensor& zero(); + + /// Returns the maximum difference seen between two tensors + __host__ T + maxDiff(const HostTensor& t) const; + + /// Are the two tensors exactly equal? 
+ __host__ bool + equal(const HostTensor& t) const { + return (maxDiff(t) == (T) 0); + } + + private: + enum AllocState { + /// This tensor itself owns the memory, which must be freed via + /// cudaFree + Owner, + + /// This tensor itself is not an owner of the memory; there is + /// nothing to free + NotOwner, + }; + + AllocState state_; +}; + +} } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Limits.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Limits.cuh new file mode 100644 index 0000000000..7dfaa2e2ce --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Limits.cuh @@ -0,0 +1,82 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +template +struct Limits { +}; + +// Unfortunately we can't use constexpr because there is no +// constexpr constructor for half +// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity +constexpr float kFloatMax = std::numeric_limits::max(); +constexpr float kFloatMin = std::numeric_limits::lowest(); + +template <> +struct Limits { + static __device__ __host__ inline float getMin() { + return kFloatMin; + } + static __device__ __host__ inline float getMax() { + return kFloatMax; + } +}; + +inline __device__ __host__ half kGetHalf(unsigned short v) { +#if CUDA_VERSION >= 9000 + __half_raw h; + h.x = v; + return __half(h); +#else + half h; + h.x = v; + return h; +#endif +} + +template <> +struct Limits { + static __device__ __host__ inline half getMin() { + return kGetHalf(0xfbffU); + } + static __device__ __host__ inline half getMax() { + return kGetHalf(0x7bffU); + } +}; + +constexpr int kIntMax = std::numeric_limits::max(); +constexpr int kIntMin = std::numeric_limits::lowest(); + +template <> +struct Limits { + static __device__ __host__ inline int getMin() { + return kIntMin; + } + static __device__ __host__ inline int getMax() { + return kIntMax; + } +}; + +template +struct Limits> { + static __device__ __host__ inline Pair getMin() { + return Pair(Limits::getMin(), Limits::getMin()); + } + + static __device__ __host__ inline Pair getMax() { + return Pair(Limits::getMax(), Limits::getMax()); + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh new file mode 100644 index 0000000000..b0bb8b5330 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh @@ -0,0 +1,90 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include + +#ifndef __HALF2_TO_UI +// cuda_fp16.hpp doesn't export this +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#endif + + +// +// Templated wrappers to express load/store for different scalar and vector +// types, so kernels can have the same written form but can operate +// over half and float, and on vector types transparently +// + +namespace faiss { namespace gpu { + +template +struct LoadStore { + static inline __device__ T load(void* p) { + return *((T*) p); + } + + static inline __device__ void store(void* p, const T& v) { + *((T*) p) = v; + } +}; + +template <> +struct LoadStore { + static inline __device__ Half4 load(void* p) { + Half4 out; +#if CUDA_VERSION >= 9000 + asm("ld.global.v2.u32 {%0, %1}, [%2];" : + "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p)); +#else + asm("ld.global.v2.u32 {%0, %1}, [%2];" : + "=r"(out.a.x), "=r"(out.b.x) : "l"(p)); +#endif + return out; + } + + static inline __device__ void store(void* p, Half4& v) { +#if CUDA_VERSION >= 9000 + asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), + "r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b))); +#else + asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x)); +#endif + } +}; + +template <> +struct LoadStore { + static inline __device__ Half8 load(void* p) { + Half8 out; +#if CUDA_VERSION >= 9000 + asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : + "=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)), + "=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p)); +#else + asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : + "=r"(out.a.a.x), "=r"(out.a.b.x), + "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p)); +#endif + return out; + } + + static inline __device__ void store(void* p, Half8& v) { +#if CUDA_VERSION >= 9000 + asm("st.v4.u32 [%0], {%1, %2, %3, %4};" + : : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)), + "r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b))); +#else + asm("st.v4.u32 [%0], {%1, %2, %3, %4};" + : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x)); +#endif + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh new file mode 100644 index 0000000000..f62971bdd3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh @@ -0,0 +1,565 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include + +// +// Templated wrappers to express math for different scalar and vector +// types, so kernels can have the same written form but can operate +// over half and float, and on vector types transparently +// + +namespace faiss { namespace gpu { + +template +struct Math { + typedef T ScalarType; + + static inline __device__ T add(T a, T b) { + return a + b; + } + + static inline __device__ T sub(T a, T b) { + return a - b; + } + + static inline __device__ T mul(T a, T b) { + return a * b; + } + + static inline __device__ T neg(T v) { + return -v; + } + + /// For a vector type, this is a horizontal add, returning sum(v_i) + static inline __device__ T reduceAdd(T v) { + return v; + } + + static inline __device__ bool lt(T a, T b) { + return a < b; + } + + static inline __device__ bool gt(T a, T b) { + return a > b; + } + + static inline __device__ bool eq(T a, T b) { + return a == b; + } + + static inline __device__ T zero() { + return (T) 0; + } +}; + +template <> +struct Math { + typedef float ScalarType; + + static inline __device__ float2 add(float2 a, float2 b) { + float2 v; + v.x = a.x + b.x; + v.y = a.y + b.y; + return v; + } + + static inline __device__ float2 sub(float2 a, float2 b) { + float2 v; + v.x = a.x - b.x; + v.y = a.y - b.y; + return v; + } + + static inline __device__ float2 add(float2 a, float b) { + float2 v; + v.x = a.x + b; + v.y = a.y + b; + return v; + } + + static inline __device__ float2 sub(float2 a, float b) { + float2 v; + v.x = a.x - b; + v.y = a.y - b; + return v; + } + + static inline __device__ float2 mul(float2 a, float2 b) { + float2 v; + v.x = a.x * b.x; + v.y = a.y * b.y; + return v; + } + + static inline __device__ float2 mul(float2 a, float b) { + float2 v; + v.x = a.x * b; + v.y = a.y * b; + return v; + } + + static inline __device__ float2 neg(float2 v) { + v.x = -v.x; + v.y = -v.y; + return v; + } + + /// For a vector type, this is a horizontal add, returning sum(v_i) + static inline __device__ float reduceAdd(float2 v) { + return v.x + v.y; + } + + // not implemented for vector types + // static inline __device__ bool lt(float2 a, float2 b); + // static inline __device__ bool gt(float2 a, float2 b); + // static inline __device__ bool eq(float2 a, float2 b); + + static inline __device__ float2 zero() { + float2 v; + v.x = 0.0f; + v.y = 0.0f; + return v; + } +}; + +template <> +struct Math { + typedef float ScalarType; + + static inline __device__ float4 add(float4 a, float4 b) { + float4 v; + v.x = a.x + b.x; + v.y = a.y + b.y; + v.z = a.z + b.z; + v.w = a.w + b.w; + return v; + } + + static inline __device__ float4 sub(float4 a, float4 b) { + float4 v; + v.x = a.x - b.x; + v.y = a.y - b.y; + v.z = a.z - b.z; + v.w = a.w - b.w; + return v; + } + + static inline __device__ float4 add(float4 a, float b) { + float4 v; + v.x = a.x + b; + v.y = a.y + b; + v.z = a.z + b; + v.w = a.w + b; + return v; + } + + static inline __device__ float4 sub(float4 a, float b) { + float4 v; + v.x = a.x - b; + v.y = a.y - b; + v.z = a.z - b; + v.w = a.w - b; + return v; + } + + static inline __device__ float4 mul(float4 a, float4 b) { + float4 v; + v.x = a.x * b.x; + v.y = a.y * b.y; + v.z = a.z * b.z; + v.w = a.w * b.w; + return v; + } + + static inline __device__ float4 mul(float4 a, float b) { + float4 v; + v.x = a.x * b; + v.y = a.y * b; + v.z = a.z * b; + v.w = a.w * b; + return v; + } + + static inline __device__ float4 neg(float4 v) { + v.x = -v.x; + v.y = -v.y; + v.z = -v.z; + v.w = -v.w; + return v; + } + + /// For a vector 
type, this is a horizontal add, returning sum(v_i) + static inline __device__ float reduceAdd(float4 v) { + return v.x + v.y + v.z + v.w; + } + + // not implemented for vector types + // static inline __device__ bool lt(float4 a, float4 b); + // static inline __device__ bool gt(float4 a, float4 b); + // static inline __device__ bool eq(float4 a, float4 b); + + static inline __device__ float4 zero() { + float4 v; + v.x = 0.0f; + v.y = 0.0f; + v.z = 0.0f; + v.w = 0.0f; + return v; + } +}; + +template <> +struct Math { + typedef half ScalarType; + + static inline __device__ half add(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hadd(a, b); +#else + return __float2half(__half2float(a) + __half2float(b)); +#endif + } + + static inline __device__ half sub(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hsub(a, b); +#else + return __float2half(__half2float(a) - __half2float(b)); +#endif + } + + static inline __device__ half mul(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hmul(a, b); +#else + return __float2half(__half2float(a) * __half2float(b)); +#endif + } + + static inline __device__ half neg(half v) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif + } + + static inline __device__ half reduceAdd(half v) { + return v; + } + + static inline __device__ bool lt(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hlt(a, b); +#else + return __half2float(a) < __half2float(b); +#endif + } + + static inline __device__ bool gt(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hgt(a, b); +#else + return __half2float(a) > __half2float(b); +#endif + } + + static inline __device__ bool eq(half a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __heq(a, b); +#else + return __half2float(a) == __half2float(b); +#endif + } + + static inline __device__ half zero() { +#if CUDA_VERSION >= 9000 + return 0; +#else + half h; + h.x = 0; + return h; +#endif + } +}; + +template <> +struct Math { + typedef half ScalarType; + + static inline __device__ half2 add(half2 a, half2 b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hadd2(a, b); +#else + float2 af = __half22float2(a); + float2 bf = __half22float2(b); + + af.x += bf.x; + af.y += bf.y; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 sub(half2 a, half2 b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hsub2(a, b); +#else + float2 af = __half22float2(a); + float2 bf = __half22float2(b); + + af.x -= bf.x; + af.y -= bf.y; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 add(half2 a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + half2 b2 = __half2half2(b); + return __hadd2(a, b2); +#else + float2 af = __half22float2(a); + float bf = __half2float(b); + + af.x += bf; + af.y += bf; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 sub(half2 a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + half2 b2 = __half2half2(b); + return __hsub2(a, b2); +#else + float2 af = __half22float2(a); + float bf = __half2float(b); + + af.x -= bf; + af.y -= bf; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 mul(half2 a, half2 b) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hmul2(a, b); +#else + float2 af = __half22float2(a); + float2 bf = __half22float2(b); + + af.x *= bf.x; + af.y *= bf.y; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 mul(half2 a, half b) { +#ifdef FAISS_USE_FULL_FLOAT16 + half2 b2 = __half2half2(b); + 
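+    // __half2half2 broadcasts the scalar b into both halves of a half2, so
+    // the packed __hmul2 below multiplies each component of a by b.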
return __hmul2(a, b2); +#else + float2 af = __half22float2(a); + float bf = __half2float(b); + + af.x *= bf; + af.y *= bf; + + return __float22half2_rn(af); +#endif + } + + static inline __device__ half2 neg(half2 v) { +#ifdef FAISS_USE_FULL_FLOAT16 + return __hneg2(v); +#else + float2 vf = __half22float2(v); + vf.x = -vf.x; + vf.y = -vf.y; + + return __float22half2_rn(vf); +#endif + } + + static inline __device__ half reduceAdd(half2 v) { +#ifdef FAISS_USE_FULL_FLOAT16 + half hv = __high2half(v); + half lv = __low2half(v); + + return __hadd(hv, lv); +#else + float2 vf = __half22float2(v); + vf.x += vf.y; + + return __float2half(vf.x); +#endif + } + + // not implemented for vector types + // static inline __device__ bool lt(half2 a, half2 b); + // static inline __device__ bool gt(half2 a, half2 b); + // static inline __device__ bool eq(half2 a, half2 b); + + static inline __device__ half2 zero() { + return __half2half2(Math::zero()); + } +}; + +template <> +struct Math { + typedef half ScalarType; + + static inline __device__ Half4 add(Half4 a, Half4 b) { + Half4 h; + h.a = Math::add(a.a, b.a); + h.b = Math::add(a.b, b.b); + return h; + } + + static inline __device__ Half4 sub(Half4 a, Half4 b) { + Half4 h; + h.a = Math::sub(a.a, b.a); + h.b = Math::sub(a.b, b.b); + return h; + } + + static inline __device__ Half4 add(Half4 a, half b) { + Half4 h; + h.a = Math::add(a.a, b); + h.b = Math::add(a.b, b); + return h; + } + + static inline __device__ Half4 sub(Half4 a, half b) { + Half4 h; + h.a = Math::sub(a.a, b); + h.b = Math::sub(a.b, b); + return h; + } + + static inline __device__ Half4 mul(Half4 a, Half4 b) { + Half4 h; + h.a = Math::mul(a.a, b.a); + h.b = Math::mul(a.b, b.b); + return h; + } + + static inline __device__ Half4 mul(Half4 a, half b) { + Half4 h; + h.a = Math::mul(a.a, b); + h.b = Math::mul(a.b, b); + return h; + } + + static inline __device__ Half4 neg(Half4 v) { + Half4 h; + h.a = Math::neg(v.a); + h.b = Math::neg(v.b); + return h; + } + + static inline __device__ half reduceAdd(Half4 v) { + half hx = Math::reduceAdd(v.a); + half hy = Math::reduceAdd(v.b); + return Math::add(hx, hy); + } + + // not implemented for vector types + // static inline __device__ bool lt(Half4 a, Half4 b); + // static inline __device__ bool gt(Half4 a, Half4 b); + // static inline __device__ bool eq(Half4 a, Half4 b); + + static inline __device__ Half4 zero() { + Half4 h; + h.a = Math::zero(); + h.b = Math::zero(); + return h; + } +}; + +template <> +struct Math { + typedef half ScalarType; + + static inline __device__ Half8 add(Half8 a, Half8 b) { + Half8 h; + h.a = Math::add(a.a, b.a); + h.b = Math::add(a.b, b.b); + return h; + } + + static inline __device__ Half8 sub(Half8 a, Half8 b) { + Half8 h; + h.a = Math::sub(a.a, b.a); + h.b = Math::sub(a.b, b.b); + return h; + } + + static inline __device__ Half8 add(Half8 a, half b) { + Half8 h; + h.a = Math::add(a.a, b); + h.b = Math::add(a.b, b); + return h; + } + + static inline __device__ Half8 sub(Half8 a, half b) { + Half8 h; + h.a = Math::sub(a.a, b); + h.b = Math::sub(a.b, b); + return h; + } + + static inline __device__ Half8 mul(Half8 a, Half8 b) { + Half8 h; + h.a = Math::mul(a.a, b.a); + h.b = Math::mul(a.b, b.b); + return h; + } + + static inline __device__ Half8 mul(Half8 a, half b) { + Half8 h; + h.a = Math::mul(a.a, b); + h.b = Math::mul(a.b, b); + return h; + } + + static inline __device__ Half8 neg(Half8 v) { + Half8 h; + h.a = Math::neg(v.a); + h.b = Math::neg(v.b); + return h; + } + + static inline __device__ half reduceAdd(Half8 
v) { + half hx = Math::reduceAdd(v.a); + half hy = Math::reduceAdd(v.b); + return Math::add(hx, hy); + } + + // not implemented for vector types + // static inline __device__ bool lt(Half8 a, Half8 b); + // static inline __device__ bool gt(Half8 a, Half8 b); + // static inline __device__ bool eq(Half8 a, Half8 b); + + static inline __device__ Half8 zero() { + Half8 h; + h.a = Math::zero(); + h.b = Math::zero(); + return h; + } +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu new file mode 100644 index 0000000000..42c031119e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu @@ -0,0 +1,281 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +struct CublasGemm { +}; + +template <> +struct CublasGemm { + static cublasStatus_t gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + float fAlpha, + const float *A, + int lda, + const float *B, + int ldb, + float fBeta, + float *C, + int ldc, + bool useHgemm) { + return cublasSgemm(handle, transa, transb, m, n, k, + &fAlpha, A, lda, B, ldb, &fBeta, C, ldc); + } +}; + +template <> +struct CublasGemm { + static cublasStatus_t gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float fAlpha, + const half *A, + int lda, + const half *B, + int ldb, + const float fBeta, + half *C, + int ldc, + bool useHgemm) { + if (getDeviceSupportsFloat16Math(getCurrentDevice()) && useHgemm) { + half hAlpha = hostFloat2Half(fAlpha); + half hBeta = hostFloat2Half(fBeta); + + return cublasHgemm(handle, transa, transb, m, n, k, + &hAlpha, A, lda, B, ldb, &hBeta, C, ldc); + } + + // CUDA 8.0 changes the half datatype specifier +#if CUDA_VERSION == 7050 + auto halfType = CUBLAS_DATA_HALF; +#else + auto halfType = CUDA_R_16F; +#endif // CUDA_VERSION + + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, halfType, lda, + B, halfType, ldb, + &fBeta, + C, halfType, ldc); + } +}; + +template +void +runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + bool useHgemm, + cublasHandle_t handle, + cudaStream_t stream) { + cublasSetStream(handle, stream); + + // Check that we have (m x k) * (k x n) = (m x n) + // using the input row-major layout + int aM = transA ? a.getSize(1) : a.getSize(0); + int aK = transA ? a.getSize(0) : a.getSize(1); + + int bK = transB ? b.getSize(1) : b.getSize(0); + int bN = transB ? b.getSize(0) : b.getSize(1); + + int cM = transC ? c.getSize(1) : c.getSize(0); + int cN = transC ? c.getSize(0) : c.getSize(1); + + FAISS_ASSERT(aM == cM); + FAISS_ASSERT(aK == bK); + FAISS_ASSERT(bN == cN); + + FAISS_ASSERT(a.getStride(1) == 1); + FAISS_ASSERT(b.getStride(1) == 1); + FAISS_ASSERT(c.getStride(1) == 1); + + // Now, we have to represent the matrix multiplication in + // column-major layout + T* pA = transC ? a.data() : b.data(); + T* pB = transC ? b.data() : a.data(); + T* pC = c.data(); + + int m = c.getSize(1); // stride 1 size + int n = c.getSize(0); // other size + int k = transA ? a.getSize(0) : a.getSize(1); + + int lda = transC ? 
a.getStride(0) : b.getStride(0); + int ldb = transC ? b.getStride(0) : a.getStride(0); + int ldc = c.getStride(0); + + auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + + if (transC) { + gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; + gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T; + } + + auto err = CublasGemm::gemm(handle, + gemmTrA, gemmTrB, + m, n, k, alpha, + pA, lda, pB, ldb, beta, + pC, ldc, useHgemm); + + FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, + "cublas failed (%d): %s " + "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s", + (int) err, + useHgemm ? "Hgemm" : "Sgemm", + a.getSize(0), a.getSize(1), transA ? "'" : "", + b.getSize(0), b.getSize(1), transB ? "'" : "", + c.getSize(0), c.getSize(1), transC ? "'" : ""); + CUDA_TEST_ERROR(); +} + +void runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + bool useHgemm, + cublasHandle_t handle, + cudaStream_t stream) { + return runMatrixMult(c, transC, a, transA, b, transB, + alpha, beta, useHgemm, handle, stream); +} + +void runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + bool useHgemm, + cublasHandle_t handle, + cudaStream_t stream) { + return runMatrixMult(c, transC, a, transA, b, transB, + alpha, beta, useHgemm, handle, stream); +} + +void +runIteratedMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream) { + FAISS_ASSERT(c.getSize(0) == a.getSize(0)); + FAISS_ASSERT(a.getSize(0) == b.getSize(0)); + + for (int i = 0; i < a.getSize(0); ++i) { + auto cView = c[i].view(); + auto aView = a[i].view(); + auto bView = b[i].view(); + + runMatrixMult(cView, transC, + aView, transA, + bView, transB, + alpha, beta, false, handle, stream); + } +} + +void +runBatchMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream) { + FAISS_ASSERT(c.getSize(0) == a.getSize(0)); + FAISS_ASSERT(a.getSize(0) == b.getSize(0)); + cublasSetStream(handle, stream); + + // Check that we have (m x k) * (k x n) = (m x n) + // using the input row-major layout + int aM = transA ? a.getSize(2) : a.getSize(1); + int aK = transA ? a.getSize(1) : a.getSize(2); + + int bK = transB ? b.getSize(2) : b.getSize(1); + int bN = transB ? b.getSize(1) : b.getSize(2); + + int cM = transC ? c.getSize(2) : c.getSize(1); + int cN = transC ? c.getSize(1) : c.getSize(2); + + FAISS_ASSERT(aM == cM); + FAISS_ASSERT(aK == bK); + FAISS_ASSERT(bN == cN); + + // Now, we have to represent the matrix multiplication in + // column-major layout + float* pA = transC ? a.data() : b.data(); + float* pB = transC ? b.data() : a.data(); + float* pC = c.data(); + + int m = c.getSize(2); // stride 1 size + int n = c.getSize(1); // other size + int k = transA ? a.getSize(1) : a.getSize(2); + + int lda = transC ? a.getStride(1) : b.getStride(1); + int ldb = transC ? b.getStride(1) : a.getStride(1); + int ldc = c.getStride(1); + + auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + + if (transC) { + gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; + gemmTrB = transB ? 
CUBLAS_OP_N : CUBLAS_OP_T; + } + + HostTensor hostA({a.getSize(0)}); + HostTensor hostB({b.getSize(0)}); + HostTensor hostC({c.getSize(0)}); + + size_t aOffset = a.getStride(0); + size_t bOffset = b.getStride(0); + size_t cOffset = c.getStride(0); + + for (int i = 0; i < a.getSize(0); ++i) { + hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; + hostB[i] = transC ? b.data() + i * bOffset : a.data() + i * aOffset; + hostC[i] = c.data() + i * cOffset; + } + + DeviceTensor deviceA(mem, hostA, stream); + DeviceTensor deviceB(mem, hostB, stream); + DeviceTensor deviceC(mem, hostC, stream); + + auto err = + cublasSgemmBatched(handle, + gemmTrA, gemmTrB, + m, n, k, &alpha, + (const float**) deviceA.data(), lda, + (const float**) deviceB.data(), ldb, &beta, + deviceC.data(), ldc, a.getSize(0)); + FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, + "cublasSgemmBatched failed (%d)", (int) err); + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh new file mode 100644 index 0000000000..1175ac213a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh @@ -0,0 +1,63 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +class DeviceMemory; + +/// C = alpha * A * B + beta * C +/// Expects row major layout, not fortran/blas column major! +void runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + bool useHgemm, // ignored for float32 + cublasHandle_t handle, + cudaStream_t stream); + +/// C = alpha * A * B + beta * C +/// Expects row major layout, not fortran/blas column major! +void runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + bool useHgemm, + cublasHandle_t handle, + cudaStream_t stream); + +/// C_i = alpha * A_i * B_i + beta * C_i +/// where `i` is the outermost dimension, via iterated gemm +/// Expects row major layout, not fortran/blas column major! +void runIteratedMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream); + +/// C_i = alpha * A_i * B_i + beta * C_i +/// where `i` is the outermost dimension, via batched gemm +/// Expects row major layout, not fortran/blas column major! +void runBatchMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.cpp b/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.cpp new file mode 100644 index 0000000000..282f835784 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.cpp @@ -0,0 +1,89 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Allocates CUDA memory for a given memory space +void allocMemorySpaceV(MemorySpace space, void** p, size_t size) { + switch (space) { + case MemorySpace::Device: + { + auto err = cudaMalloc(p, size); + + // Throw if we fail to allocate + FAISS_THROW_IF_NOT_FMT( + err == cudaSuccess, + "failed to cudaMalloc %zu bytes (error %d %s)", + size, (int) err, cudaGetErrorString(err)); + } + break; + case MemorySpace::Unified: + { +#ifdef FAISS_UNIFIED_MEM + auto err = cudaMallocManaged(p, size); + + // Throw if we fail to allocate + FAISS_THROW_IF_NOT_FMT( + err == cudaSuccess, + "failed to cudaMallocManaged %zu bytes (error %d %s)", + size, (int) err, cudaGetErrorString(err)); +#else + FAISS_THROW_MSG("Attempting to allocate via cudaMallocManaged " + "without CUDA 8+ support"); +#endif + } + break; + case MemorySpace::HostPinned: + { + auto err = cudaHostAlloc(p, size, cudaHostAllocDefault); + + // Throw if we fail to allocate + FAISS_THROW_IF_NOT_FMT( + err == cudaSuccess, + "failed to cudaHostAlloc %zu bytes (error %d %s)", + size, (int) err, cudaGetErrorString(err)); + } + break; + default: + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) space); + break; + } +} + +// We'll allow allocation to fail, but free should always succeed and be a +// fatal error if it doesn't free +void freeMemorySpace(MemorySpace space, void* p) { + switch (space) { + case MemorySpace::Device: + case MemorySpace::Unified: + { + auto err = cudaFree(p); + FAISS_ASSERT_FMT(err == cudaSuccess, + "Failed to cudaFree pointer %p (error %d %s)", + p, (int) err, cudaGetErrorString(err)); + } + break; + case MemorySpace::HostPinned: + { + auto err = cudaFreeHost(p); + FAISS_ASSERT_FMT(err == cudaSuccess, + "Failed to cudaFreeHost pointer %p (error %d %s)", + p, (int) err, cudaGetErrorString(err)); + } + break; + default: + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) space); + break; + } +} + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.h b/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.h new file mode 100644 index 0000000000..f269f06a39 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MemorySpace.h @@ -0,0 +1,44 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include + +#if CUDA_VERSION >= 8000 +// Whether or not we enable usage of CUDA Unified Memory +#define FAISS_UNIFIED_MEM 1 +#endif + +namespace faiss { namespace gpu { + +enum MemorySpace { + /// Managed using cudaMalloc/cudaFree + Device = 1, + /// Managed using cudaMallocManaged/cudaFree + Unified = 2, + /// Managed using cudaHostAlloc/cudaFreeHost + HostPinned = 3, +}; + +/// All memory allocations and de-allocations come through these functions + +/// Allocates CUDA memory for a given memory space (void pointer) +/// Throws a FaissException if we are unable to allocate the memory +void allocMemorySpaceV(MemorySpace space, void** p, size_t size); + +template +inline void allocMemorySpace(MemorySpace space, T** p, size_t size) { + allocMemorySpaceV(space, (void**)(void*) p, size); +} + +/// Frees CUDA memory for a given memory space +/// Asserts if we are unable to free the region +void freeMemorySpace(MemorySpace space, void* p); + +} } diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkBlock.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkBlock.cuh new file mode 100644 index 0000000000..2776258b57 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkBlock.cuh @@ -0,0 +1,289 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Merge pairs of lists smaller than blockDim.x (NumThreads) +template +inline __device__ void blockMergeSmall(K* listK, V* listV) { + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(utils::isPowerOf2(NumThreads), + "NumThreads must be a power-of-2"); + static_assert(L <= NumThreads, "merge list size must be <= NumThreads"); + + // Which pair of lists we are merging + int mergeId = threadIdx.x / L; + + // Which thread we are within the merge + int tid = threadIdx.x % L; + + // listK points to a region of size N * 2 * L + listK += 2 * L * mergeId; + listV += 2 * L * mergeId; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { + int pos = 2 * tid - (tid & (stride - 1)); + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? 
va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +// Merge pairs of sorted lists larger than blockDim.x (NumThreads) +template +inline __device__ void blockMergeLarge(K* listK, V* listV) { + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(L >= kWarpSize, "merge list size must be >= 32"); + static_assert(utils::isPowerOf2(NumThreads), + "NumThreads must be a power-of-2"); + static_assert(L >= NumThreads, "merge list size must be >= NumThreads"); + + // For L > NumThreads, each thread has to perform more work + // per each stride. + constexpr int kLoopPerThread = L / NumThreads; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed +#pragma unroll + for (int loop = 0; loop < kLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + + constexpr int kSecondLoopPerThread = + FullMerge ? kLoopPerThread : kLoopPerThread / 2; + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { +#pragma unroll + for (int loop = 0; loop < kSecondLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = 2 * tid - (tid & (stride - 1)); + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +/// Class template to prevent static_assert from firing for +/// mixing smaller/larger than block cases +template +struct BlockMerge { +}; + +/// Merging lists smaller than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) { + constexpr int kNumParallelMerges = NumThreads / L; + constexpr int kNumIterations = N / kNumParallelMerges; + + static_assert(L <= NumThreads, "list must be <= NumThreads"); + static_assert((N < kNumParallelMerges) || + (kNumIterations * kNumParallelMerges == N), + "improper selection of N and L"); + + if (N < kNumParallelMerges) { + // We only need L threads per each list to perform the merge + blockMergeSmall( + listK, listV); + } else { + // All threads participate +#pragma unroll + for (int i = 0; i < kNumIterations; ++i) { + int start = i * kNumParallelMerges * 2 * L; + + blockMergeSmall( + listK + start, listV + start); + } + } + } +}; + +/// Merging lists larger than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) { + // Each pair of lists is merged sequentially +#pragma unroll + for (int i = 0; i < N; ++i) { + int start = i * 2 * L; + + blockMergeLarge( + listK + start, listV + start); + } + } +}; + +template +inline __device__ void blockMerge(K* listK, V* listV) { + constexpr bool kSmallerThanBlock = (L <= NumThreads); + + BlockMerge:: + merge(listK, listV); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkUtils.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkUtils.cuh new file mode 100644 index 0000000000..6810345226 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkUtils.cuh @@ -0,0 +1,24 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +namespace faiss { namespace gpu { + +template +inline __device__ void swap(bool swap, T& x, T& y) { + T tmp = x; + x = swap ? y : x; + y = swap ? tmp : y; +} + +template +inline __device__ void assign(bool assign, T& x, T y) { + x = assign ? y : x; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkWarp.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkWarp.cuh new file mode 100644 index 0000000000..4e486b025f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MergeNetworkWarp.cuh @@ -0,0 +1,510 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// +// This file contains functions to: +// +// -perform bitonic merges on pairs of sorted lists, held in +// registers. Each list contains N * kWarpSize (multiple of 32) +// elements for some N. +// The bitonic merge is implemented for arbitrary sizes; +// sorted list A of size N1 * kWarpSize registers +// sorted list B of size N2 * kWarpSize registers => +// sorted list C if size (N1 + N2) * kWarpSize registers. N1 and N2 +// are >= 1 and don't have to be powers of 2. 
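+// For example (purely illustrative): merging a sorted list held in N1 = 3
+// registers per lane with one held in N2 = 2 registers per lane yields a
+// sorted list of (3 + 2) * kWarpSize = 160 elements across the warp.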
+// +// -perform bitonic sorts on a set of N * kWarpSize key/value pairs +// held in registers, by using the above bitonic merge as a +// primitive. +// N can be an arbitrary N >= 1; i.e., the bitonic sort here supports +// odd sizes and doesn't require the input to be a power of 2. +// +// The sort or merge network is completely statically instantiated via +// template specialization / expansion and constexpr, and it uses warp +// shuffles to exchange values between warp lanes. +// +// A note about comparsions: +// +// For a sorting network of keys only, we only need one +// comparison (a < b). However, what we really need to know is +// if one lane chooses to exchange a value, then the +// corresponding lane should also do the exchange. +// Thus, if one just uses the negation !(x < y) in the higher +// lane, this will also include the case where (x == y). Thus, one +// lane in fact performs an exchange and the other doesn't, but +// because the only value being exchanged is equivalent, nothing has +// changed. +// So, you can get away with just one comparison and its negation. +// +// If we're sorting keys and values, where equivalent keys can +// exist, then this is a problem, since we want to treat (x, v1) +// as not equivalent to (x, v2). +// +// To remedy this, you can either compare with a lexicographic +// ordering (a.k < b.k || (a.k == b.k && a.v < b.v)), which since +// we're predicating all of the choices results in 3 comparisons +// being executed, or we can invert the selection so that there is no +// middle choice of equality; the other lane will likewise +// check that (b.k > a.k) (the higher lane has the values +// swapped). Then, the first lane swaps if and only if the +// second lane swaps; if both lanes have equivalent keys, no +// swap will be performed. This results in only two comparisons +// being executed. +// +// If you don't consider values as well, then this does not produce a +// consistent ordering among (k, v) pairs with equivalent keys but +// different values; for us, we don't really care about ordering or +// stability here. +// +// I have tried both re-arranging the order in the higher lane to get +// away with one comparison or adding the value to the check; both +// result in greater register consumption or lower speed than just +// perfoming both < and > comparisons with the variables, so I just +// stick with this. + +// This function merges kWarpSize / 2L lists in parallel using warp +// shuffles. +// It works on at most size-16 lists, as we need 32 threads for this +// shuffle merge. +// +// If IsBitonic is false, the first stage is reversed, so we don't +// need to sort directionally. It's still technically a bitonic sort. +template +inline __device__ void warpBitonicMergeLE16(K& k, V& v) { + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(L <= kWarpSize / 2, "merge list size must be <= 16"); + + int laneId = getLaneId(); + + if (!IsBitonic) { + // Reverse the first comparison stage. + // For example, merging a list of size 8 has the exchanges: + // 0 <-> 15, 1 <-> 14, ... + K otherK = shfl_xor(k, 2 * L - 1); + V otherV = shfl_xor(v, 2 * L - 1); + + // Whether we are the lesser thread in the exchange + bool small = !(laneId & L); + + if (Dir) { + // See the comment above how performing both of these + // comparisons in the warp seems to win out over the + // alternatives in practice + bool s = small ? 
Comp::gt(k, otherK) : Comp::lt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + + } else { + bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + } + } + +#pragma unroll + for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { + K otherK = shfl_xor(k, stride); + V otherV = shfl_xor(v, stride); + + // Whether we are the lesser thread in the exchange + bool small = !(laneId & stride); + + if (Dir) { + bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + + } else { + bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + } + } +} + +// Template for performing a bitonic merge of an arbitrary set of +// registers +template +struct BitonicMergeStep { +}; + +// +// Power-of-2 merge specialization +// + +// All merges eventually call this +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[1], V v[1]) { + // Use warp shuffles + warpBitonicMergeLE16(k[0], v[0]); + } +}; + +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { + static_assert(utils::isPowerOf2(N), "must be power of 2"); + static_assert(N > 1, "must be N > 1"); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + N / 2]; + V& vb = v[i + N / 2]; + + bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + { + K newK[N / 2]; + V newV[N / 2]; + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[N / 2]; + V newV[N / 2]; + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + newK[i] = k[i + N / 2]; + newV[i] = v[i + N / 2]; + } + + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + k[i + N / 2] = newK[i]; + v[i + N / 2] = newV[i]; + } + } + } +}; + +// +// Non-power-of-2 merge specialization +// + +// Low recursion +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { + static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); + static_assert(N >= 3, "must be N >= 3"); + + constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); + +#pragma unroll + for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; + + bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; + constexpr int kHighSize = kNextHighestPowerOf2 / 2; + { + K newK[kLowSize]; + V newV[kLowSize]; + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + constexpr bool kLowIsPowerOf2 = + utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? 
+// constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[kHighSize]; + V newV[kHighSize]; + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; + } + + constexpr bool kHighIsPowerOf2 = + utils::isPowerOf2(kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? +// constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; + } + } + } +}; + +// High recursion +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { + static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); + static_assert(N >= 3, "must be N >= 3"); + + constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); + +#pragma unroll + for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; + + bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + constexpr int kLowSize = kNextHighestPowerOf2 / 2; + constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; + { + K newK[kLowSize]; + V newV[kLowSize]; + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + constexpr bool kLowIsPowerOf2 = + utils::isPowerOf2(kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? +// constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[kHighSize]; + V newV[kHighSize]; + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; + } + + constexpr bool kHighIsPowerOf2 = + utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? +// constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; + } + } + } +}; + +/// Merges two sets of registers across the warp of any size; +/// i.e., merges a sorted k/v list of size kWarpSize * N1 with a +/// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any +/// value >= 1 +template +inline __device__ void warpMergeAnyRegisters(K k1[N1], V v1[N1], + K k2[N2], V v2[N2]) { + constexpr int kSmallestN = N1 < N2 ? N1 : N2; + +#pragma unroll + for (int i = 0; i < kSmallestN; ++i) { + K& ka = k1[N1 - 1 - i]; + V& va = v1[N1 - 1 - i]; + + K& kb = k2[i]; + V& vb = v2[i]; + + K otherKa; + V otherVa; + + if (FullMerge) { + // We need the other values + otherKa = shfl_xor(ka, kWarpSize - 1); + otherVa = shfl_xor(va, kWarpSize - 1); + } + + K otherKb = shfl_xor(kb, kWarpSize - 1); + V otherVb = shfl_xor(vb, kWarpSize - 1); + + // ka is always first in the list, so we needn't use our lane + // in this comparison + bool swapa = Dir ? 
Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb); + assign(swapa, ka, otherKb); + assign(swapa, va, otherVb); + + // kb is always second in the list, so we needn't use our lane + // in this comparison + if (FullMerge) { + bool swapb = Dir ? Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa); + assign(swapb, kb, otherKa); + assign(swapb, vb, otherVa); + + } else { + // We don't care about updating elements in the second list + } + } + + BitonicMergeStep::merge(k1, v1); + if (FullMerge) { + // Only if we care about N2 do we need to bother merging it fully + BitonicMergeStep::merge(k2, v2); + } +} + +// Recursive template that uses the above bitonic merge to perform a +// bitonic sort +template +struct BitonicSortStep { + static inline __device__ void sort(K k[N], V v[N]) { + static_assert(N > 1, "did not hit specialized case"); + + // Sort recursively + constexpr int kSizeA = N / 2; + constexpr int kSizeB = N - kSizeA; + + K aK[kSizeA]; + V aV[kSizeA]; + +#pragma unroll + for (int i = 0; i < kSizeA; ++i) { + aK[i] = k[i]; + aV[i] = v[i]; + } + + BitonicSortStep::sort(aK, aV); + + K bK[kSizeB]; + V bV[kSizeB]; + +#pragma unroll + for (int i = 0; i < kSizeB; ++i) { + bK[i] = k[i + kSizeA]; + bV[i] = v[i + kSizeA]; + } + + BitonicSortStep::sort(bK, bV); + + // Merge halves + warpMergeAnyRegisters(aK, aV, bK, bV); + +#pragma unroll + for (int i = 0; i < kSizeA; ++i) { + k[i] = aK[i]; + v[i] = aV[i]; + } + +#pragma unroll + for (int i = 0; i < kSizeB; ++i) { + k[i + kSizeA] = bK[i]; + v[i + kSizeA] = bV[i]; + } + } +}; + +// Single warp (N == 1) sorting specialization +template +struct BitonicSortStep { + static inline __device__ void sort(K k[1], V v[1]) { + // Update this code if this changes + // should go from 1 -> kWarpSize in multiples of 2 + static_assert(kWarpSize == 32, "unexpected warp size"); + + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + } +}; + +/// Sort a list of kWarpSize * N elements in registers, where N is an +/// arbitrary >= 1 +template +inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) { + BitonicSortStep::sort(k, v); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/NoTypeTensor.cuh b/core/src/index/thirdparty/faiss/gpu/utils/NoTypeTensor.cuh new file mode 100644 index 0000000000..fdbc879f35 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/NoTypeTensor.cuh @@ -0,0 +1,123 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +template +class NoTypeTensor { + public: + NoTypeTensor() + : mem_(nullptr), + typeSize_(0) { + } + + template + NoTypeTensor(Tensor& t) + : mem_(t.data()), + typeSize_(sizeof(T)) { + for (int i = 0; i < Dim; ++i) { + size_[i] = t.getSize(i); + stride_[i] = t.getStride(i); + } + } + + NoTypeTensor(void* mem, int typeSize, std::initializer_list sizes) + : mem_(mem), + typeSize_(typeSize) { + + int i = 0; + for (auto s : sizes) { + size_[i++] = s; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int j = Dim - 2; j >= 0; --j) { + stride_[j] = stride_[j + 1] * size_[j + 1]; + } + } + + NoTypeTensor(void* mem, int typeSize, int sizes[Dim]) + : mem_(mem), + typeSize_(typeSize) { + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int i = Dim - 2; i >= 0; --i) { + stride_[i] = stride_[i + 1] * sizes[i + 1]; + } + } + + NoTypeTensor(void* mem, int typeSize, + IndexT sizes[Dim], IndexT strides[Dim]) + : mem_(mem), + typeSize_(typeSize) { + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + stride_[i] = strides[i]; + } + } + + int getTypeSize() const { + return typeSize_; + } + + IndexT getSize(int dim) const { + FAISS_ASSERT(dim < Dim); + return size_[dim]; + } + + IndexT getStride(int dim) const { + FAISS_ASSERT(dim < Dim); + return stride_[dim]; + } + + template + Tensor toTensor() { + FAISS_ASSERT(sizeof(T) == typeSize_); + + return Tensor((T*) mem_, size_, stride_); + } + + NoTypeTensor narrowOutermost(IndexT start, + IndexT size) { + char* newPtr = (char*) mem_; + + if (start > 0) { + newPtr += typeSize_ * start * stride_[0]; + } + + IndexT newSize[Dim]; + for (int i = 0; i < Dim; ++i) { + if (i == 0) { + assert(start + size <= size_[0]); + newSize[i] = size; + } else { + newSize[i] = size_[i]; + } + } + + return NoTypeTensor( + newPtr, typeSize_, newSize, stride_); + } + + private: + void* mem_; + int typeSize_; + IndexT size_[Dim]; + IndexT stride_[Dim]; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Pair.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Pair.cuh new file mode 100644 index 0000000000..0162c91a70 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Pair.cuh @@ -0,0 +1,69 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +/// A simple pair type for CUDA device usage +template +struct Pair { + constexpr __device__ inline Pair() { + } + + constexpr __device__ inline Pair(K key, V value) + : k(key), v(value) { + } + + __device__ inline bool + operator==(const Pair& rhs) const { + return Math::eq(k, rhs.k) && Math::eq(v, rhs.v); + } + + __device__ inline bool + operator!=(const Pair& rhs) const { + return !operator==(rhs); + } + + __device__ inline bool + operator<(const Pair& rhs) const { + return Math::lt(k, rhs.k) || + (Math::eq(k, rhs.k) && Math::lt(v, rhs.v)); + } + + __device__ inline bool + operator>(const Pair& rhs) const { + return Math::gt(k, rhs.k) || + (Math::eq(k, rhs.k) && Math::gt(v, rhs.v)); + } + + K k; + V v; +}; + +template +inline __device__ Pair shfl_up(const Pair& pair, + unsigned int delta, + int width = kWarpSize) { + return Pair(shfl_up(pair.k, delta, width), + shfl_up(pair.v, delta, width)); +} + +template +inline __device__ Pair shfl_xor(const Pair& pair, + int laneMask, + int width = kWarpSize) { + return Pair(shfl_xor(pair.k, laneMask, width), + shfl_xor(pair.v, laneMask, width)); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/PtxUtils.cuh b/core/src/index/thirdparty/faiss/gpu/utils/PtxUtils.cuh new file mode 100644 index 0000000000..d1fad3905f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/PtxUtils.cuh @@ -0,0 +1,76 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { + +__device__ __forceinline__ +unsigned int getBitfield(unsigned int val, int pos, int len) { + unsigned int ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); + return ret; +} + +__device__ __forceinline__ +unsigned long getBitfield(unsigned long val, int pos, int len) { + unsigned long ret; + asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); + return ret; +} + +__device__ __forceinline__ +unsigned int setBitfield(unsigned int val, + unsigned int toInsert, int pos, int len) { + unsigned int ret; + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); + return ret; +} + +__device__ __forceinline__ int getLaneId() { + int laneId; + asm("mov.u32 %0, %laneid;" : "=r"(laneId) ); + return laneId; +} + +__device__ __forceinline__ unsigned getLaneMaskLt() { + unsigned mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +} + +__device__ __forceinline__ unsigned getLaneMaskLe() { + unsigned mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +} + +__device__ __forceinline__ unsigned getLaneMaskGt() { + unsigned mask; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); + return mask; +} + +__device__ __forceinline__ unsigned getLaneMaskGe() { + unsigned mask; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); + return mask; +} + +__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) { + asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); +} + +__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { + asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory"); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/ReductionOperators.cuh 
b/core/src/index/thirdparty/faiss/gpu/utils/ReductionOperators.cuh new file mode 100644 index 0000000000..b810fc66ea --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/ReductionOperators.cuh @@ -0,0 +1,73 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +struct Sum { + __device__ inline T operator()(T a, T b) const { + return Math::add(a, b); + } + + inline __device__ T identity() const { + return Math::zero(); + } +}; + +template +struct Min { + __device__ inline T operator()(T a, T b) const { + return Math::lt(a, b) ? a : b; + } + + inline __device__ T identity() const { + return Limits::getMax(); + } +}; + +template +struct Max { + __device__ inline T operator()(T a, T b) const { + return Math::gt(a, b) ? a : b; + } + + inline __device__ T identity() const { + return Limits::getMin(); + } +}; + +/// Used for producing segmented prefix scans; the value of the Pair +/// denotes the start of a new segment for the scan +template +struct SegmentedReduce { + inline __device__ SegmentedReduce(const ReduceOp& o) + : op(o) { + } + + __device__ + inline Pair + operator()(const Pair& a, const Pair& b) const { + return Pair(b.v ? b.k : op(a.k, b.k), + a.v || b.v); + } + + inline __device__ Pair identity() const { + return Pair(op.identity(), false); + } + + ReduceOp op; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Reductions.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Reductions.cuh new file mode 100644 index 0000000000..e99b518630 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Reductions.cuh @@ -0,0 +1,142 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +__device__ inline T warpReduceAll(T val, Op op) { +#pragma unroll + for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) { + val = op(val, shfl_xor(val, mask)); + } + + return val; +} + +/// Sums a register value across all warp threads +template +__device__ inline T warpReduceAllSum(T val) { + return warpReduceAll, ReduceWidth>(val, Sum()); +} + +/// Performs a block-wide reduction +template +__device__ inline T blockReduceAll(T val, Op op, T* smem) { + int laneId = getLaneId(); + int warpId = threadIdx.x / kWarpSize; + + val = warpReduceAll(val, op); + if (laneId == 0) { + smem[warpId] = val; + } + __syncthreads(); + + if (warpId == 0) { + val = laneId < utils::divUp(blockDim.x, kWarpSize) ? 
smem[laneId] : + op.identity(); + val = warpReduceAll(val, op); + + if (BroadcastAll) { + __threadfence_block(); + + if (laneId == 0) { + smem[0] = val; + } + } + } + + if (BroadcastAll) { + __syncthreads(); + val = smem[0]; + } + + if (KillWARDependency) { + __syncthreads(); + } + + return val; +} + +/// Performs a block-wide reduction of multiple values simultaneously +template +__device__ inline void blockReduceAll(T val[Num], Op op, T* smem) { + int laneId = getLaneId(); + int warpId = threadIdx.x / kWarpSize; + +#pragma unroll + for (int i = 0; i < Num; ++i) { + val[i] = warpReduceAll(val[i], op); + } + + if (laneId == 0) { +#pragma unroll + for (int i = 0; i < Num; ++i) { + smem[warpId * Num + i] = val[i]; + } + } + + __syncthreads(); + + if (warpId == 0) { +#pragma unroll + for (int i = 0; i < Num; ++i) { + val[i] = + laneId < utils::divUp(blockDim.x, kWarpSize) ? smem[laneId * Num + i] : + op.identity(); + val[i] = warpReduceAll(val[i], op); + } + + if (BroadcastAll) { + __threadfence_block(); + + if (laneId == 0) { +#pragma unroll + for (int i = 0; i < Num; ++i) { + smem[i] = val[i]; + } + } + } + } + + if (BroadcastAll) { + __syncthreads(); +#pragma unroll + for (int i = 0; i < Num; ++i) { + val[i] = smem[i]; + } + } + + if (KillWARDependency) { + __syncthreads(); + } +} + + +/// Sums a register value across the entire block +template +__device__ inline T blockReduceAllSum(T val, T* smem) { + return blockReduceAll, BroadcastAll, KillWARDependency>( + val, Sum(), smem); +} + +template +__device__ inline void blockReduceAllSum(T vals[Num], T* smem) { + return blockReduceAll, BroadcastAll, KillWARDependency>( + vals, Sum(), smem); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Select.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Select.cuh new file mode 100644 index 0000000000..43a1cc1893 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Select.cuh @@ -0,0 +1,571 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Specialization for block-wide monotonic merges producing a merge sort +// since what we really want is a constexpr loop expansion +template +struct FinalBlockMerge { +}; + +template +struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) { + // no merge required; single warp + } +}; + +template +struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) { + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, sharedV); + } +}; + +template +struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) { + blockMerge(sharedK, sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, sharedV); + } +}; + +template +struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) { + blockMerge(sharedK, sharedV); + blockMerge(sharedK, sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, sharedV); + } +}; + +// `Dir` true, produce largest values. 
+// `Dir` false, produce smallest values. +template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + static constexpr int kTotalWarpSortSize = NumWarpQ; + + __device__ inline BlockSelect(K initKVal, + V initVVal, + K* smemK, + V* smemV, + int k) : + initK(initKVal), + initV(initVVal), + numVals(0), + warpKTop(initKVal), + sharedK(smemK), + sharedV(smemV), + kMinus1(k - 1) { + static_assert(utils::isPowerOf2(ThreadsPerBlock), + "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), + "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + int laneId = getLaneId(); + int warpId = threadIdx.x / kWarpSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; + + // Fill warp queue (only the actual queue space is fine, not where + // we write the per-thread queues for merging) + for (int i = laneId; i < NumWarpQ; i += kWarpSize) { + warpK[i] = initK; + warpV[i] = initV; + } + + warpFence(); + } + + __device__ inline void addThreadQ(K k, V v) { + if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + // This has a trailing warpFence + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = warpK[kMinus1]; + + warpFence(); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() { + int laneId = getLaneId(); + + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); + + constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize; + K warpKRegisters[kNumWarpQRegisters]; + V warpVRegisters[kNumWarpQRegisters]; + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpKRegisters[i] = warpK[i * kWarpSize + laneId]; + warpVRegisters[i] = warpV[i * kWarpSize + laneId]; + } + + warpFence(); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpKRegisters, warpVRegisters, threadK, threadV); + + // Write back out the warp queue +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i * kWarpSize + laneId] = warpKRegisters[i]; + warpV[i * kWarpSize + laneId] = warpVRegisters[i]; + } + + warpFence(); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. 
+ __device__ inline void add(K k, V v) { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + + // block-wide dep; thus far, all warps have been completely + // independent + __syncthreads(); + + // All warp queues are contiguous in smem. + // Now, we have kNumWarps lists of NumWarpQ elements. + // This is a power of 2. + FinalBlockMerge:: + merge(sharedK, sharedV); + + // The block-wide merge has a trailing syncthreads + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // Queues for all warps + K* sharedK; + V* sharedV; + + // Our warp's queue (points into sharedK/sharedV) + // warpK[0] is highest (Dir) or lowest (!Dir) + K* warpK; + V* warpV; + + // This is a cached k-1 value + int kMinus1; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) : + sharedK(smemK), + sharedV(smemV), + threadK(initK), + threadV(initV) { + } + + __device__ inline void addThreadQ(K k, V v) { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + __device__ inline void checkThreadQ() { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { + addThreadQ(k, v); + } + + __device__ inline void reduce() { + // Reduce within the warp + Pair pair(threadK, threadV); + + if (Dir) { + pair = + warpReduceAll, Max>>(pair, Max>()); + } else { + pair = + warpReduceAll, Min>>(pair, Min>()); + } + + // Each warp writes out a single value + int laneId = getLaneId(); + int warpId = threadIdx.x / kWarpSize; + + if (laneId == 0) { + sharedK[warpId] = pair.k; + sharedV[warpId] = pair.v; + } + + __syncthreads(); + + // We typically use this for small blocks (<= 128), just having the first + // thread in the block perform the reduction across warps is + // faster + if (threadIdx.x == 0) { + threadK = sharedK[0]; + threadV = sharedV[0]; + +#pragma unroll + for (int i = 1; i < kNumWarps; ++i) { + K k = sharedK[i]; + V v = sharedV[i]; + + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + // Hopefully a thread's smem reads/writes are ordered wrt + // itself, so no barrier needed :) + sharedK[0] = threadK; + sharedV[0] = threadV; + } + + // In case other threads wish to read this value + __syncthreads(); + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; + + // Where we reduce in smem + K* sharedK; + V* sharedV; +}; + +// +// per-warp WarpSelect +// + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. 
+template +struct WarpSelect { + static constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize; + + __device__ inline WarpSelect(K initKVal, V initVVal, int k) : + initK(initKVal), + initV(initVVal), + numVals(0), + warpKTop(initKVal), + kLane((k - 1) % kWarpSize) { + static_assert(utils::isPowerOf2(ThreadsPerBlock), + "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), + "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // Fill the warp queue with the default value +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i] = initK; + warpV[i] = initV; + } + } + + __device__ inline void addThreadQ(K k, V v) { + if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() { + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpK, warpV, threadK, threadV); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, V v) { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) { + int laneId = getLaneId(); + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + int idx = i * kWarpSize + laneId; + + if (idx < k) { + outK[idx] = warpK[i]; + outV[idx] = warpV[i]; + } + } + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // warpK[0] is highest (Dir) or lowest (!Dir) + K warpK[kNumWarpQRegisters]; + V warpV[kNumWarpQRegisters]; + + // This is what lane we should load an approximation (>=k) to the + // kth element from the last register in the warp queue (i.e., + // warpK[kNumWarpQRegisters - 1]). 
+ int kLane; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct WarpSelect { + static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __device__ inline WarpSelect(K initK, V initV, int k) : + threadK(initK), + threadV(initV) { + } + + __device__ inline void addThreadQ(K k, V v) { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + __device__ inline void checkThreadQ() { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { + addThreadQ(k, v); + } + + __device__ inline void reduce() { + // Reduce within the warp + Pair pair(threadK, threadV); + + if (Dir) { + pair = + warpReduceAll, Max>>(pair, Max>()); + } else { + pair = + warpReduceAll, Min>>(pair, Min>()); + } + + threadK = pair.k; + threadV = pair.v; + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) { + if (getLaneId() == 0) { + *outK = threadK; + *outV = threadV; + } + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.cpp b/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.cpp new file mode 100644 index 0000000000..18b8e04cff --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.cpp @@ -0,0 +1,239 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +StackDeviceMemory::Stack::Stack(int d, size_t sz) + : device_(d), + isOwner_(true), + start_(nullptr), + end_(nullptr), + size_(sz), + head_(nullptr), + mallocCurrent_(0), + highWaterMemoryUsed_(0), + highWaterMalloc_(0), + cudaMallocWarning_(true) { + DeviceScope s(device_); + + allocMemorySpace(MemorySpace::Device, &start_, size_); + + head_ = start_; + end_ = start_ + size_; +} + +StackDeviceMemory::Stack::Stack(int d, void* p, size_t sz, bool isOwner) + : device_(d), + isOwner_(isOwner), + start_((char*) p), + end_(((char*) p) + sz), + size_(sz), + head_((char*) p), + mallocCurrent_(0), + highWaterMemoryUsed_(0), + highWaterMalloc_(0), + cudaMallocWarning_(true) { +} + +StackDeviceMemory::Stack::~Stack() { + if (isOwner_) { + DeviceScope s(device_); + + freeMemorySpace(MemorySpace::Device, start_); + } +} + +size_t +StackDeviceMemory::Stack::getSizeAvailable() const { + return (end_ - head_); +} + +char* +StackDeviceMemory::Stack::getAlloc(size_t size, + cudaStream_t stream) { + if (size > (end_ - head_)) { + // Too large for our stack + DeviceScope s(device_); + + if (cudaMallocWarning_) { + // Print our requested size before we attempt the allocation + fprintf(stderr, "WARN: increase temp memory to avoid cudaMalloc, " + "or decrease query/add size (alloc %zu B, highwater %zu B)\n", + size, highWaterMalloc_); + } + + char* p = nullptr; + allocMemorySpace(MemorySpace::Device, &p, size); + + mallocCurrent_ += size; + highWaterMalloc_ = std::max(highWaterMalloc_, mallocCurrent_); + + return p; + } else { + // We can make the allocation out of our stack + // Find all the ranges that we overlap that may have been + // previously allocated; our allocation will be [head, endAlloc) + char* startAlloc = head_; + char* 
endAlloc = head_ + size; + + while (lastUsers_.size() > 0) { + auto& prevUser = lastUsers_.back(); + + // Because there is a previous user, we must overlap it + FAISS_ASSERT(prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc); + + if (stream != prevUser.stream_) { + // Synchronization required + // FIXME + FAISS_ASSERT(false); + } + + if (endAlloc < prevUser.end_) { + // Update the previous user info + prevUser.start_ = endAlloc; + + break; + } + + // If we're the exact size of the previous request, then we + // don't need to continue + bool done = (prevUser.end_ == endAlloc); + + lastUsers_.pop_back(); + + if (done) { + break; + } + } + + head_ = endAlloc; + FAISS_ASSERT(head_ <= end_); + + highWaterMemoryUsed_ = std::max(highWaterMemoryUsed_, + (size_t) (head_ - start_)); + return startAlloc; + } +} + +void +StackDeviceMemory::Stack::returnAlloc(char* p, + size_t size, + cudaStream_t stream) { + if (p < start_ || p >= end_) { + // This is not on our stack; it was a one-off allocation + DeviceScope s(device_); + + freeMemorySpace(MemorySpace::Device, p); + + FAISS_ASSERT(mallocCurrent_ >= size); + mallocCurrent_ -= size; + } else { + // This is on our stack + // Allocations should be freed in the reverse order they are made + FAISS_ASSERT(p + size == head_); + + head_ = p; + lastUsers_.push_back(Range(p, p + size, stream)); + } +} + +std::string +StackDeviceMemory::Stack::toString() const { + std::stringstream s; + + s << "SDM device " << device_ << ": Total memory " << size_ << " [" + << (void*) start_ << ", " << (void*) end_ << ")\n"; + s << " Available memory " << (size_t) (end_ - head_) + << " [" << (void*) head_ << ", " << (void*) end_ << ")\n"; + s << " High water temp alloc " << highWaterMemoryUsed_ << "\n"; + s << " High water cudaMalloc " << highWaterMalloc_ << "\n"; + + int i = lastUsers_.size(); + for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) { + s << i-- << ": size " << (size_t) (it->end_ - it->start_) + << " stream " << it->stream_ + << " [" << (void*) it->start_ << ", " << (void*) it->end_ << ")\n"; + } + + return s.str(); +} + +size_t +StackDeviceMemory::Stack::getHighWaterCudaMalloc() const { + return highWaterMalloc_; +} + +StackDeviceMemory::StackDeviceMemory(int device, size_t allocPerDevice) + : device_(device), + stack_(device, allocPerDevice) { +} + +StackDeviceMemory::StackDeviceMemory(int device, + void* p, size_t size, bool isOwner) + : device_(device), + stack_(device, p, size, isOwner) { +} + +StackDeviceMemory::~StackDeviceMemory() { +} + +void +StackDeviceMemory::setCudaMallocWarning(bool b) { + stack_.cudaMallocWarning_ = b; +} + +int +StackDeviceMemory::getDevice() const { + return device_; +} + +DeviceMemoryReservation +StackDeviceMemory::getMemory(cudaStream_t stream, size_t size) { + // We guarantee 16 byte alignment for allocations, so bump up `size` + // to the next highest multiple of 16 + size = utils::roundUp(size, (size_t) 16); + + return DeviceMemoryReservation(this, + device_, + stack_.getAlloc(size, stream), + size, + stream); +} + +size_t +StackDeviceMemory::getSizeAvailable() const { + return stack_.getSizeAvailable(); +} + +std::string +StackDeviceMemory::toString() const { + return stack_.toString(); +} + +size_t +StackDeviceMemory::getHighWaterCudaMalloc() const { + return stack_.getHighWaterCudaMalloc(); +} + +void +StackDeviceMemory::returnAllocation(DeviceMemoryReservation& m) { + FAISS_ASSERT(m.get()); + FAISS_ASSERT(device_ == m.device()); + + stack_.returnAlloc((char*) m.get(), m.size(), m.stream()); +} + 
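The `getAlloc`/`returnAlloc` pair above behaves as a LIFO bump allocator over a pre-reserved device region: requests that fit advance `head_`, requests that do not fit fall back to `cudaMalloc` with a warning, and on-stack allocations must be returned in reverse order. A minimal host-side analogue of that head-pointer bookkeeping, for illustration only (the names below are hypothetical, not faiss API):

// Illustrative analogue of Stack::getAlloc / Stack::returnAlloc.
// Assumes allocations are released in reverse (LIFO) order.
#include <cassert>
#include <cstddef>

struct BumpStack {
  char* start;  // base of the reserved region
  char* end;    // one past the end of the region
  char* head;   // next free byte

  BumpStack(char* base, std::size_t size)
      : start(base), end(base + size), head(base) {}

  char* get(std::size_t size) {
    if (size > std::size_t(end - head)) {
      return nullptr;           // caller would fall back to cudaMalloc here
    }
    char* p = head;
    head += size;               // bump the head past the new allocation
    return p;
  }

  void put(char* p, std::size_t size) {
    assert(p + size == head);   // must be the most recently handed-out block
    head = p;                   // pop it off the stack
  }
};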
+} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.h b/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.h new file mode 100644 index 0000000000..f7c3ea14e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/StackDeviceMemory.h @@ -0,0 +1,129 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Device memory manager that provides temporary memory allocations +/// out of a region of memory +class StackDeviceMemory : public DeviceMemory { + public: + /// Allocate a new region of memory that we manage + explicit StackDeviceMemory(int device, size_t allocPerDevice); + + /// Manage a region of memory for a particular device, with or + /// without ownership + StackDeviceMemory(int device, void* p, size_t size, bool isOwner); + + ~StackDeviceMemory() override; + + /// Enable or disable the warning about not having enough temporary memory + /// when cudaMalloc gets called + void setCudaMallocWarning(bool b); + + int getDevice() const override; + + DeviceMemoryReservation getMemory(cudaStream_t stream, + size_t size) override; + + size_t getSizeAvailable() const override; + std::string toString() const override; + size_t getHighWaterCudaMalloc() const override; + + protected: + void returnAllocation(DeviceMemoryReservation& m) override; + + protected: + /// Previous allocation ranges and the streams for which + /// synchronization is required + struct Range { + inline Range(char* s, char* e, cudaStream_t str) : + start_(s), end_(e), stream_(str) { + } + + // References a memory range [start, end) + char* start_; + char* end_; + cudaStream_t stream_; + }; + + struct Stack { + /// Constructor that allocates memory via cudaMalloc + Stack(int device, size_t size); + + /// Constructor that references a pre-allocated region of memory + Stack(int device, void* p, size_t size, bool isOwner); + ~Stack(); + + /// Returns how much size is available for an allocation without + /// calling cudaMalloc + size_t getSizeAvailable() const; + + /// Obtains an allocation; all allocations are guaranteed to be 16 + /// byte aligned + char* getAlloc(size_t size, cudaStream_t stream); + + /// Returns an allocation + void returnAlloc(char* p, size_t size, cudaStream_t stream); + + /// Returns the stack state + std::string toString() const; + + /// Returns the high-water mark of cudaMalloc activity + size_t getHighWaterCudaMalloc() const; + + /// Device this allocation is on + int device_; + + /// Do we own our region of memory? + bool isOwner_; + + /// Where our allocation begins and ends + /// [start_, end_) is valid + char* start_; + char* end_; + + /// Total size end_ - start_ + size_t size_; + + /// Stack head within [start, end) + char* head_; + + /// List of previous last users of allocations on our stack, for + /// possible synchronization purposes + std::list lastUsers_; + + /// How much cudaMalloc memory is currently outstanding? + size_t mallocCurrent_; + + /// What's the high water mark in terms of memory used from the + /// temporary buffer? + size_t highWaterMemoryUsed_; + + /// What's the high water mark in terms of memory allocated via + /// cudaMalloc? 
+ size_t highWaterMalloc_; + + /// Whether or not a warning upon cudaMalloc is generated + bool cudaMallocWarning_; + }; + + /// Our device + int device_; + + /// Memory stack + Stack stack_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/StaticUtils.h b/core/src/index/thirdparty/faiss/gpu/utils/StaticUtils.h new file mode 100644 index 0000000000..f6e5505afb --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/StaticUtils.h @@ -0,0 +1,83 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +namespace faiss { namespace gpu { namespace utils { + +template +constexpr __host__ __device__ auto divDown(U a, V b) -> decltype(a + b) { + return (a / b); +} + +template +constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { + return (a + b - 1) / b; +} + +template +constexpr __host__ __device__ auto roundDown(U a, V b) -> decltype(a + b) { + return divDown(a, b) * b; +} + +template +constexpr __host__ __device__ auto roundUp(U a, V b) -> decltype(a + b) { + return divUp(a, b) * b; +} + +template +constexpr __host__ __device__ T pow(T n, T power) { + return (power > 0 ? n * pow(n, power - 1) : 1); +} + +template +constexpr __host__ __device__ T pow2(T n) { + return pow(2, (T) n); +} + +static_assert(pow2(8) == 256, "pow2"); + +template +constexpr __host__ __device__ int log2(T n, int p = 0) { + return (n <= 1) ? p : log2(n / 2, p + 1); +} + +static_assert(log2(2) == 1, "log2"); +static_assert(log2(3) == 1, "log2"); +static_assert(log2(4) == 2, "log2"); + +template +constexpr __host__ __device__ bool isPowerOf2(T v) { + return (v && !(v & (v - 1))); +} + +static_assert(isPowerOf2(2048), "isPowerOf2"); +static_assert(!isPowerOf2(3333), "isPowerOf2"); + +template +constexpr __host__ __device__ T nextHighestPowerOf2(T v) { + return (isPowerOf2(v) ? (T) 2 * v : ((T) 1 << (log2(v) + 1))); +} + +static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, + "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2((size_t) 2147483648ULL) == + (size_t) 4294967296ULL, "nextHighestPowerOf2"); + +} } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh new file mode 100644 index 0000000000..0f5aef1315 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh @@ -0,0 +1,717 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include + +namespace faiss { namespace gpu { + +template class PtrTraits> +__host__ __device__ +Tensor::Tensor() + : data_(nullptr) { + static_assert(Dim > 0, "must have > 0 dimensions"); + + for (int i = 0; i < Dim; ++i) { + size_[i] = 0; + stride_[i] = (IndexT) 1; + } +} + +template class PtrTraits> +__host__ __device__ +Tensor::Tensor( + Tensor& t) { + this->operator=(t); +} + +template class PtrTraits> +__host__ __device__ +Tensor::Tensor( + Tensor&& t) { + this->operator=(std::move(t)); +} + +template class PtrTraits> +__host__ __device__ +Tensor& +Tensor::operator=( + Tensor& t) { + data_ = t.data_; + for (int i = 0; i < Dim; ++i) { + size_[i] = t.size_[i]; + stride_[i] = t.stride_[i]; + } + + return *this; +} + +template class PtrTraits> +__host__ __device__ +Tensor& +Tensor::operator=( + Tensor&& t) { + data_ = t.data_; t.data_ = nullptr; + for (int i = 0; i < Dim; ++i) { + stride_[i] = t.stride_[i]; t.stride_[i] = 0; + size_[i] = t.size_[i]; t.size_[i] = 0; + } + + return *this; +} + +template class PtrTraits> +__host__ __device__ +Tensor:: +Tensor(DataPtrType data, const IndexT sizes[Dim]) + : data_(data) { + static_assert(Dim > 0, "must have > 0 dimensions"); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int i = Dim - 2; i >= 0; --i) { + stride_[i] = stride_[i + 1] * sizes[i + 1]; + } +} + +template class PtrTraits> +__host__ __device__ +Tensor:: +Tensor(DataPtrType data, std::initializer_list sizes) + : data_(data) { + GPU_FAISS_ASSERT(sizes.size() == Dim); + static_assert(Dim > 0, "must have > 0 dimensions"); + + int i = 0; + for (auto s : sizes) { + size_[i++] = s; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int j = Dim - 2; j >= 0; --j) { + stride_[j] = stride_[j + 1] * size_[j + 1]; + } +} + + +template class PtrTraits> +__host__ __device__ +Tensor::Tensor( + DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim]) + : data_(data) { + static_assert(Dim > 0, "must have > 0 dimensions"); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + stride_[i] = strides[i]; + } +} + +template class PtrTraits> +__host__ void +Tensor::copyFrom( + Tensor& t, + cudaStream_t stream) { + // The tensor must be fully contiguous + GPU_FAISS_ASSERT(this->isContiguous()); + + // Size must be the same (since dimensions are checked and + // continuity is assumed, we need only check total number of + // elements + GPU_FAISS_ASSERT(this->numElements() == t.numElements()); + + if (t.numElements() > 0) { + GPU_FAISS_ASSERT(this->data_); + GPU_FAISS_ASSERT(t.data()); + + int ourDev = getDeviceForAddress(this->data_); + int tDev = getDeviceForAddress(t.data()); + + if (tDev == -1) { + CUDA_VERIFY(cudaMemcpyAsync(this->data_, + t.data(), + this->getSizeInBytes(), + ourDev == -1 ? cudaMemcpyHostToHost : + cudaMemcpyHostToDevice, + stream)); + } else { + CUDA_VERIFY(cudaMemcpyAsync(this->data_, + t.data(), + this->getSizeInBytes(), + ourDev == -1 ? 
cudaMemcpyDeviceToHost : + cudaMemcpyDeviceToDevice, + stream)); + } + } +} + +template class PtrTraits> +__host__ void +Tensor::copyTo( + Tensor& t, + cudaStream_t stream) { + // The tensor must be fully contiguous + GPU_FAISS_ASSERT(this->isContiguous()); + + // Size must be the same (since dimensions are checked and + // continuity is assumed, we need only check total number of + // elements + GPU_FAISS_ASSERT(this->numElements() == t.numElements()); + + if (t.numElements() > 0) { + GPU_FAISS_ASSERT(this->data_); + GPU_FAISS_ASSERT(t.data()); + + int ourDev = getDeviceForAddress(this->data_); + int tDev = getDeviceForAddress(t.data()); + + if (tDev == -1) { + CUDA_VERIFY(cudaMemcpyAsync(t.data(), + this->data_, + this->getSizeInBytes(), + ourDev == -1 ? cudaMemcpyHostToHost : + cudaMemcpyDeviceToHost, + stream)); + } else { + CUDA_VERIFY(cudaMemcpyAsync(t.data(), + this->data_, + this->getSizeInBytes(), + ourDev == -1 ? cudaMemcpyHostToDevice : + cudaMemcpyDeviceToDevice, + stream)); + } + } +} + +template class PtrTraits> +template +__host__ __device__ bool +Tensor::isSame( + const Tensor& rhs) const { + if (Dim != OtherDim) { + return false; + } + + for (int i = 0; i < Dim; ++i) { + if (this->getSize(i) != rhs.getSize(i)) { + return false; + } + + if (this->getStride(i) != rhs.getStride(i)) { + return false; + } + } + + return true; +} + +template class PtrTraits> +template +__host__ __device__ bool +Tensor::isSameSize( + const Tensor& rhs) const { + if (Dim != OtherDim) { + return false; + } + + for (int i = 0; i < Dim; ++i) { + if (this->getSize(i) != rhs.getSize(i)) { + return false; + } + } + + return true; +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::cast() { + static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); + + return Tensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +template +__host__ __device__ const Tensor +Tensor::cast() const { + static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); + + return Tensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::castResize() { + static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); + constexpr int kMultiple = sizeof(U) / sizeof(T); + + GPU_FAISS_ASSERT(canCastResize()); + + IndexT newSize[Dim]; + IndexT newStride[Dim]; + + for (int i = 0; i < Dim - 1; ++i) { + newSize[i] = size_[i]; + newStride[i] = stride_[i] / kMultiple; + } + + newStride[Dim - 1] = 1; // this is the same as the old stride + newSize[Dim - 1] = size_[Dim - 1] / kMultiple; + + return Tensor( + reinterpret_cast(data_), newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ const Tensor +Tensor::castResize() const { + return const_cast*>(this)-> + castResize(); +} + +template class PtrTraits> +template +__host__ __device__ bool +Tensor::canCastResize() const { + static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); + constexpr int kMultiple = sizeof(U) / sizeof(T); + + // Ensure that the base pointer is sizeof(U) aligned + if (((uintptr_t) data_) % sizeof(U) != 0) { + return false; + } + + // Check all outer strides + for (int i = 0; i < Dim - 1; ++i) { + if (stride_[i] % kMultiple != 0) { + return false; + } + } + + // Check inner size + if (size_[Dim - 1] % kMultiple != 0) { + return false; + } + + if (stride_[Dim - 1] != 1) { + return false; + } + + return true; +} + +template class PtrTraits> +template +__host__ Tensor 
+Tensor::castIndexType() const { + if (sizeof(NewIndexT) < sizeof(IndexT)) { + GPU_FAISS_ASSERT(this->canUseIndexType()); + } + + NewIndexT newSize[Dim]; + NewIndexT newStride[Dim]; + for (int i = 0; i < Dim; ++i) { + newSize[i] = (NewIndexT) size_[i]; + newStride[i] = (NewIndexT) stride_[i]; + } + + return Tensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ bool +Tensor::canUseIndexType() const { + static_assert(sizeof(size_t) >= sizeof(IndexT), + "index size too large"); + static_assert(sizeof(size_t) >= sizeof(NewIndexT), + "new index size too large"); + + // Find maximum offset that can be calculated + // FIXME: maybe also consider offset in bytes? multiply by sizeof(T)? + size_t maxOffset = 0; + + for (int i = 0; i < Dim; ++i) { + size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i]; + if (curMaxOffset > maxOffset) { + maxOffset = curMaxOffset; + } + } + + if (maxOffset > (size_t) std::numeric_limits::max()) { + return false; + } + + return true; +} + +template class PtrTraits> +__host__ __device__ size_t +Tensor::numElements() const { + size_t size = (size_t) getSize(0); + + for (int i = 1; i < Dim; ++i) { + size *= (size_t) getSize(i); + } + + return size; +} + +template class PtrTraits> +__host__ __device__ bool +Tensor::isContiguous() const { + long prevSize = 1; + + for (int i = Dim - 1; i >= 0; --i) { + if (getSize(i) != (IndexT) 1) { + if (getStride(i) == prevSize) { + prevSize *= getSize(i); + } else { + return false; + } + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ bool +Tensor::isConsistentlySized(int i) const { + if (i == 0 && getStride(i) > 0 && getSize(i) > 0) { + return true; + } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) && + ((getStride(i - 1) / getStride(i)) >= getSize(i))) { + return true; + } + + return false; +} + +template class PtrTraits> +__host__ __device__ bool +Tensor::isConsistentlySized() const { + for (int i = 0; i < Dim; ++i) { + if (!isConsistentlySized(i)) { + return false; + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ bool +Tensor::isContiguousDim(int i) const { + return (i == Dim - 1) || // just in case + ((i < Dim - 1) && + ((getStride(i) / getStride(i + 1)) == getSize(i + 1))); +} + +template class PtrTraits> +__host__ __device__ Tensor +Tensor::transpose(int dim1, + int dim2) const { + GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim); + GPU_FAISS_ASSERT(dim1 >= 0 && dim2 < Dim); + + // If a tensor is innermost contiguous, one cannot transpose the innermost + // dimension + if (InnerContig) { + GPU_FAISS_ASSERT(dim1 != Dim - 1 && dim2 != Dim - 1); + } + + IndexT newSize[Dim]; + IndexT newStride[Dim]; + + for (int i = 0; i < Dim; ++i) { + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } + + IndexT tmp = newSize[dim1]; + newSize[dim1] = newSize[dim2]; + newSize[dim2] = tmp; + + tmp = newStride[dim1]; + newStride[dim1] = newStride[dim2]; + newStride[dim2] = tmp; + + return Tensor(data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::upcastOuter() { + // Can only create tensors of greater dimension + static_assert(NewDim > Dim, "Can only upcast to greater dim"); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int shift = NewDim - Dim; + + for (int i = 0; i < NewDim; ++i) { + if (i < shift) { + // These are the extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = size_[0] * stride_[0]; + } else { + // Shift the remaining dimensions + newSize[i] = size_[i - 
shift]; + newStride[i] = stride_[i - shift]; + } + } + + return Tensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::upcastInner() { + // Can only create tensors of greater dimension + static_assert(NewDim > Dim, "Can only upcast to greater dim"); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + for (int i = 0; i < NewDim; ++i) { + if (i < Dim) { + // Existing dimensions get copied over + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } else { + // Extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = (IndexT) 1; + } + } + + return Tensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::downcastOuter() { + // Can only create tensors of lesser dimension + static_assert(NewDim < Dim, "Can only downcast to lesser dim"); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). + for (int i = 0; i < Dim - NewDim; ++i) { + bool cont = isContiguousDim(i); + GPU_FAISS_ASSERT(cont); + } + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int ignoredDims = Dim - NewDim; + IndexT collapsedSize = 1; + + for (int i = 0; i < Dim; ++i) { + if (i < ignoredDims) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == ignoredDims) { + // This is the first non-collapsed dimension + newSize[i - ignoredDims] = collapsedSize * getSize(i); + } else { + // Subsequent non-collapsed dimensions + newSize[i - ignoredDims] = getSize(i); + } + + newStride[i - ignoredDims] = getStride(i); + } + } + + return Tensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::downcastInner() { + // Can only create tensors of lesser dimension + static_assert(NewDim < Dim, "Can only downcast to lesser dim"); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). 
+ for (int i = NewDim; i < Dim; ++i) { + GPU_FAISS_ASSERT(isContiguousDim(i)); + } + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + IndexT collapsedSize = 1; + + for (int i = Dim - 1; i >= 0; --i) { + if (i >= NewDim) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == NewDim - 1) { + // This is the first non-collapsed dimension + newSize[i] = collapsedSize * getSize(i); + newStride[i] = getStride(Dim - 1); + } else { + // Subsequent non-collapsed dimensions + newSize[i] = getSize(i); + newStride[i] = getStride(i); + } + } + } + + return Tensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::view(DataPtrType at) { + static_assert(SubDim >= 1 && SubDim < Dim, + "can only create view of lesser dim"); + + IndexT viewSizes[SubDim]; + IndexT viewStrides[SubDim]; + + for (int i = 0; i < SubDim; ++i) { + viewSizes[i] = size_[Dim - SubDim + i]; + viewStrides[i] = stride_[Dim - SubDim + i]; + } + + return Tensor( + at, viewSizes, viewStrides); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::view() { + return view(data_); +} + +template class PtrTraits> +__host__ __device__ Tensor +Tensor::narrowOutermost(IndexT start, + IndexT size) { + return this->narrow(0, start, size); +} + +template class PtrTraits> +__host__ __device__ Tensor +Tensor::narrow(int dim, + IndexT start, + IndexT size) { + DataPtrType newData = data_; + + GPU_FAISS_ASSERT(start >= 0 && + start < size_[dim] && + (start + size) <= size_[dim]); + + if (start > 0) { + newData += (size_t) start * stride_[dim]; + } + + IndexT newSize[Dim]; + for (int i = 0; i < Dim; ++i) { + if (i == dim) { + GPU_FAISS_ASSERT(start + size <= size_[dim]); + newSize[i] = size; + } else { + newSize[i] = size_[i]; + } + } + + // If we were innermost contiguous before, we are still innermost contiguous + return Tensor(newData, newSize, stride_); +} + +template class PtrTraits> +template +__host__ __device__ Tensor +Tensor::view( + std::initializer_list sizes) { + GPU_FAISS_ASSERT(this->isContiguous()); + + GPU_FAISS_ASSERT(sizes.size() == NewDim); + + // The total size of the new view must be the same as the total size + // of the old view + size_t curSize = numElements(); + size_t newSize = 1; + + for (auto s : sizes) { + newSize *= s; + } + + GPU_FAISS_ASSERT(curSize == newSize); + return Tensor(data(), sizes); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Tensor.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Tensor.cuh new file mode 100644 index 0000000000..7f737a87ed --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Tensor.cuh @@ -0,0 +1,651 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +/// Multi-dimensional array class for CUDA device and host usage. +/// Originally from Facebook's fbcunn, since added to the Torch GPU +/// library cutorch as well. 
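The header comment above describes `Tensor` as a strided, multi-dimensional view over raw memory with `tensor[x][y]`-style access. A hedged usage sketch follows, assuming the template is spelled `Tensor<T, Dim, ...>` with defaults for the remaining parameters and that the header resolves as `<faiss/gpu/utils/Tensor.cuh>` in this tree (both are assumptions, since the template arguments are not visible here):

// Illustrative sketch only; template arguments and include path are assumed.
#include <faiss/gpu/utils/Tensor.cuh>

// Scale every element of a 2-D device tensor in place.
__global__ void scaleRows(faiss::gpu::Tensor<float, 2> t, float alpha) {
  int row = blockIdx.x;
  for (int col = threadIdx.x; col < t.getSize(1); col += blockDim.x) {
    t[row][col] = t[row][col] * alpha;  // strided element access via operator[]
  }
}

// Host side: wrap an existing device buffer of numRows x numCols floats and
// launch one block per row (devPtr, numRows, numCols, stream are assumed).
//   faiss::gpu::Tensor<float, 2> t(devPtr, {numRows, numCols});
//   scaleRows<<<numRows, 128, 0, stream>>>(t, 2.0f);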
+ +namespace faiss { namespace gpu { + +/// Our tensor type +template class PtrTraits> +class Tensor; + +/// Type of a subspace of a tensor +namespace detail { +template class PtrTraits> +class SubTensor; +} + +namespace traits { + +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; + +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +} + +/** + Templated multi-dimensional array that supports strided access of + elements. Main access is through `operator[]`; e.g., + `tensor[x][y][z]`. + + - `T` is the contained type (e.g., `float`) + - `Dim` is the tensor rank + - If `InnerContig` is true, then the tensor is assumed to be innermost + - contiguous, and only operations that make sense on contiguous + - arrays are allowed (e.g., no transpose). Strides are still + - calculated, but innermost stride is assumed to be 1. + - `IndexT` is the integer type used for size/stride arrays, and for + - all indexing math. Default is `int`, but for large tensors, `long` + - can be used instead. + - `PtrTraits` are traits applied to our data pointer (T*). By default, + - this is just T*, but RestrictPtrTraits can be used to apply T* + - __restrict__ for alias-free analysis. +*/ +template class PtrTraits = traits::DefaultPtrTraits> +class Tensor { + public: + enum { NumDim = Dim }; + typedef T DataType; + typedef IndexT IndexType; + enum { IsInnerContig = InnerContig }; + typedef typename PtrTraits::PtrType DataPtrType; + typedef Tensor TensorType; + + /// Default constructor + __host__ __device__ Tensor(); + + /// Copy constructor + __host__ __device__ Tensor(Tensor& t); + + /// Move constructor + __host__ __device__ Tensor(Tensor&& t); + + /// Assignment + __host__ __device__ Tensor& + operator=(Tensor& t); + + /// Move assignment + __host__ __device__ Tensor& + operator=(Tensor&& t); + + /// Constructor that calculates strides with no padding + __host__ __device__ Tensor(DataPtrType data, + const IndexT sizes[Dim]); + __host__ __device__ Tensor(DataPtrType data, + std::initializer_list sizes); + + /// Constructor that takes arbitrary size/stride arrays. + /// Errors if you attempt to pass non-contiguous strides to a + /// contiguous tensor. + __host__ __device__ Tensor(DataPtrType data, + const IndexT sizes[Dim], + const IndexT strides[Dim]); + + /// Copies a tensor into ourselves; sizes must match + __host__ void copyFrom(Tensor& t, + cudaStream_t stream); + + /// Copies ourselves into a tensor; sizes must match + __host__ void copyTo(Tensor& t, + cudaStream_t stream); + + /// Returns true if the two tensors are of the same dimensionality, + /// size and stride. + template + __host__ __device__ bool + isSame(const Tensor& rhs) const; + + /// Returns true if the two tensors are of the same dimensionality and size + template + __host__ __device__ bool + isSameSize(const Tensor& rhs) const; + + /// Cast to a tensor of a different type of the same size and + /// stride. U and our type T must be of the same size + template + __host__ __device__ Tensor cast(); + + /// Const version of `cast` + template + __host__ __device__ + const Tensor cast() const; + + /// Cast to a tensor of a different type which is potentially a + /// different size than our type T. Tensor must be aligned and the + /// innermost dimension must be a size that is a multiple of + /// sizeof(U) / sizeof(T), and the stride of the innermost dimension + /// must be contiguous. The stride of all outer dimensions must be a + /// multiple of sizeof(U) / sizeof(T) as well. 
+ template + __host__ __device__ Tensor castResize(); + + /// Const version of `castResize` + template + __host__ __device__ const Tensor + castResize() const; + + /// Returns true if we can castResize() this tensor to the new type + template + __host__ __device__ bool canCastResize() const; + + /// Attempts to cast this tensor to a tensor of a different IndexT. + /// Fails if size or stride entries are not representable in the new + /// IndexT. + template + __host__ Tensor + castIndexType() const; + + /// Returns true if we can use this indexing type to access all elements + /// index type + template + __host__ bool canUseIndexType() const; + + /// Returns a raw pointer to the start of our data. + __host__ __device__ inline DataPtrType data() { + return data_; + } + + /// Returns a raw pointer to the end of our data, assuming + /// continuity + __host__ __device__ inline DataPtrType end() { + return data() + numElements(); + } + + /// Returns a raw pointer to the start of our data (const). + __host__ __device__ inline + const DataPtrType data() const { + return data_; + } + + /// Returns a raw pointer to the end of our data, assuming + /// continuity (const) + __host__ __device__ inline DataPtrType end() const { + return data() + numElements(); + } + + /// Cast to a different datatype + template + __host__ __device__ inline + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype + template + __host__ __device__ inline + const typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Returns a read/write view of a portion of our tensor. + __host__ __device__ inline + detail::SubTensor + operator[](IndexT); + + /// Returns a read/write view of a portion of our tensor (const). + __host__ __device__ inline + const detail::SubTensor + operator[](IndexT) const; + + /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. + __host__ __device__ inline IndexT getSize(int i) const { + return size_[i]; + } + + /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. + __host__ __device__ inline IndexT getStride(int i) const { + return stride_[i]; + } + + /// Returns the total number of elements contained within our data + /// (product of `getSize(i)`) + __host__ __device__ size_t numElements() const; + + /// If we are contiguous, returns the total size in bytes of our + /// data + __host__ __device__ size_t getSizeInBytes() const { + return numElements() * sizeof(T); + } + + /// Returns the size array. + __host__ __device__ inline const IndexT* sizes() const { + return size_; + } + + /// Returns the stride array. + __host__ __device__ inline const IndexT* strides() const { + return stride_; + } + + /// Returns true if there is no padding within the tensor and no + /// re-ordering of the dimensions. + /// ~~~ + /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0 + /// ~~~ + __host__ __device__ bool isContiguous() const; + + /// Returns whether a given dimension has only increasing stride + /// from the previous dimension. A tensor that was permuted by + /// exchanging size and stride only will fail this check. + /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`. + __host__ __device__ bool isConsistentlySized(int i) const; + + // Returns whether at each dimension `stride <= size`. 
+ // If this is not the case then iterating once over the size space will + // touch the same memory locations multiple times. + __host__ __device__ bool isConsistentlySized() const; + + /// Returns true if the given dimension index has no padding + __host__ __device__ bool isContiguousDim(int i) const; + + /// Returns a tensor of the same dimension after transposing the two + /// dimensions given. Does not actually move elements; transposition + /// is made by permuting the size/stride arrays. + /// If the dimensions are not valid, asserts. + __host__ __device__ Tensor + transpose(int dim1, int dim2) const; + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the leading dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` + template + __host__ __device__ Tensor + upcastOuter(); + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the lowest/most varying dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]` + template + __host__ __device__ Tensor + upcastInner(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + Tensor downcastOuter(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + Tensor downcastInner(); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting at `at`. + template + __host__ __device__ Tensor + view(DataPtrType at); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting where our data begins + template + __host__ __device__ Tensor + view(); + + /// Returns a tensor of the same dimension that is a view of the + /// original tensor with the specified dimension restricted to the + /// elements in the range [start, start + size) + __host__ __device__ Tensor + narrowOutermost(IndexT start, IndexT size); + + /// Returns a tensor of the same dimension that is a view of the + /// original tensor with the specified dimension restricted to the + /// elements in the range [start, start + size). + /// Can occur in an arbitrary dimension + __host__ __device__ Tensor + narrow(int dim, IndexT start, IndexT size); + + /// Returns a view of the given tensor expressed as a tensor of a + /// different number of dimensions. + /// Only works if we are contiguous. + template + __host__ __device__ Tensor + view(std::initializer_list sizes); + + protected: + /// Raw pointer to where the tensor data begins + DataPtrType data_; + + /// Array of strides (in sizeof(T) terms) per each dimension + IndexT stride_[Dim]; + + /// Size per each dimension + IndexT size_[Dim]; +}; + +// Utilities for checking a collection of tensors +namespace detail { + +template +bool canUseIndexType() { + return true; +} + +template +bool canUseIndexType(const T& arg, const U&... args) { + return arg.template canUseIndexType() && + canUseIndexType(args...); +} + +} // namespace detail + +template +bool canUseIndexType(const T&... 
args) { + return detail::canUseIndexType(args...); +} + +namespace detail { + +/// Specialization for a view of a single value (0-dimensional) +template class PtrTraits> +class SubTensor { + public: + __host__ __device__ SubTensor + operator=(typename TensorType::DataType val) { + *data_ = val; + return *this; + } + + // operator T& + __host__ __device__ operator typename TensorType::DataType&() { + return *data_; + } + + // const operator T& returning const T& + __host__ __device__ operator const typename TensorType::DataType&() const { + return *data_; + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ inline typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ inline + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ inline + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ inline + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ inline typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ inline T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + protected: + /// One dimension greater can create us + friend class SubTensor; + + /// Our parent tensor can create us + friend class Tensor; + + __host__ __device__ inline SubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// Where our value is located + typename TensorType::DataPtrType const data_; +}; + +/// A `SubDim`-rank slice of a parent Tensor +template class PtrTraits> +class SubTensor { + public: + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor). + __host__ __device__ inline + SubTensor + operator[](typename TensorType::IndexType index) { + if (TensorType::IsInnerContig && SubDim == 1) { + // Innermost dimension is stride 1 for contiguous arrays + return SubTensor( + tensor_, data_ + index); + } else { + return SubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + } + + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor) (const). 
+ __host__ __device__ inline + const SubTensor + operator[](typename TensorType::IndexType index) const { + if (TensorType::IsInnerContig && SubDim == 1) { + // Innermost dimension is stride 1 for contiguous arrays + return SubTensor( + tensor_, data_ + index); + } else { + return SubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ inline typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ inline + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ inline + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ inline + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ inline typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ inline T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + /// Returns a tensor that is a view of the SubDim-dimensional slice + /// of this tensor, starting where our data begins + Tensor view() { + return tensor_.template view(data_); + } + + protected: + /// One dimension greater can create us + friend class SubTensor; + + /// Our parent tensor can create us + friend class + Tensor; + + __host__ __device__ inline SubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// The start of our sub-region + typename TensorType::DataPtrType const data_; +}; + +} // namespace detail + +template class PtrTraits> +__host__ __device__ inline +detail::SubTensor, + Dim - 1, PtrTraits> + Tensor::operator[](IndexT index) { + return detail::SubTensor( + detail::SubTensor( + *this, data_)[index]); +} + +template class PtrTraits> +__host__ __device__ inline +const detail::SubTensor, + Dim - 1, PtrTraits> + Tensor::operator[](IndexT index) const { + return detail::SubTensor( + detail::SubTensor( + const_cast(*this), data_)[index]); +} + +} } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/utils/ThrustAllocator.cuh b/core/src/index/thirdparty/faiss/gpu/utils/ThrustAllocator.cuh new file mode 100644 index 0000000000..4ca0415bfa --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/ThrustAllocator.cuh @@ -0,0 +1,69 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace faiss { namespace gpu { + +/// Allocator for Thrust that comes out of a specified memory space +class GpuResourcesThrustAllocator { + public: + typedef char value_type; + + GpuResourcesThrustAllocator(void* mem, size_t size) + : start_((char*) mem), + cur_((char*) mem), + end_((char*) mem + size) { + } + + ~GpuResourcesThrustAllocator() { + // In the case of an exception being thrown, we may not have called + // deallocate on all of our sub-allocations. Free them here + for (auto p : mallocAllocs_) { + freeMemorySpace(MemorySpace::Device, p); + } + } + + char* allocate(std::ptrdiff_t size) { + if (size <= (end_ - cur_)) { + char* p = cur_; + cur_ += size; + FAISS_ASSERT(cur_ <= end_); + + return p; + } else { + char* p = nullptr; + allocMemorySpace(MemorySpace::Device, &p, size); + mallocAllocs_.insert(p); + return p; + } + } + + void deallocate(char* p, size_t size) { + // Allocations could be returned out-of-order; ignore those we + // didn't cudaMalloc + auto it = mallocAllocs_.find(p); + if (it != mallocAllocs_.end()) { + freeMemorySpace(MemorySpace::Device, p); + mallocAllocs_.erase(it); + } + } + + private: + char* start_; + char* cur_; + char* end_; + std::unordered_set mallocAllocs_; +}; + + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Timer.cpp b/core/src/index/thirdparty/faiss/gpu/utils/Timer.cpp new file mode 100644 index 0000000000..1764fec10a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Timer.cpp @@ -0,0 +1,60 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include + +namespace faiss { namespace gpu { + +KernelTimer::KernelTimer(cudaStream_t stream) + : startEvent_(0), + stopEvent_(0), + stream_(stream), + valid_(true) { + CUDA_VERIFY(cudaEventCreate(&startEvent_)); + CUDA_VERIFY(cudaEventCreate(&stopEvent_)); + + CUDA_VERIFY(cudaEventRecord(startEvent_, stream_)); +} + +KernelTimer::~KernelTimer() { + CUDA_VERIFY(cudaEventDestroy(startEvent_)); + CUDA_VERIFY(cudaEventDestroy(stopEvent_)); +} + +float +KernelTimer::elapsedMilliseconds() { + FAISS_ASSERT(valid_); + + CUDA_VERIFY(cudaEventRecord(stopEvent_, stream_)); + CUDA_VERIFY(cudaEventSynchronize(stopEvent_)); + + auto time = 0.0f; + CUDA_VERIFY(cudaEventElapsedTime(&time, startEvent_, stopEvent_)); + valid_ = false; + + return time; +} + +CpuTimer::CpuTimer() { + clock_gettime(CLOCK_REALTIME, &start_); +} + +float +CpuTimer::elapsedMilliseconds() { + struct timespec end; + clock_gettime(CLOCK_REALTIME, &end); + + auto diffS = end.tv_sec - start_.tv_sec; + auto diffNs = end.tv_nsec - start_.tv_nsec; + + return 1000.0f * (float) diffS + ((float) diffNs) / 1000000.0f; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Timer.h b/core/src/index/thirdparty/faiss/gpu/utils/Timer.h new file mode 100644 index 0000000000..ef2a161a32 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Timer.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
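A minimal sketch of how an allocator like this is typically handed to Thrust (the scratch buffer, its size, the key array, and the stream below are placeholders, not part of the diff): small temporary allocations are carved out of the pre-reserved region, and only overflow falls back to the memory-space cudaMalloc path that the destructor later frees.

#include <thrust/sort.h>
#include <thrust/execution_policy.h>

void sortKeysOnStream(float* keys, size_t n,
                      void* scratch, size_t scratchSize,
                      cudaStream_t stream) {
  faiss::gpu::GpuResourcesThrustAllocator alloc(scratch, scratchSize);
  // Thrust draws its temporary storage from `alloc` instead of calling cudaMalloc
  thrust::sort(thrust::cuda::par(alloc).on(stream), keys, keys + n);
}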
+ */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +/// Utility class for timing execution of a kernel +class KernelTimer { + public: + /// Constructor starts the timer and adds an event into the current + /// device stream + KernelTimer(cudaStream_t stream = 0); + + /// Destructor releases event resources + ~KernelTimer(); + + /// Adds a stop event then synchronizes on the stop event to get the + /// actual GPU-side kernel timings for any kernels launched in the + /// current stream. Returns the number of milliseconds elapsed. + /// Can only be called once. + float elapsedMilliseconds(); + + private: + cudaEvent_t startEvent_; + cudaEvent_t stopEvent_; + cudaStream_t stream_; + bool valid_; +}; + +/// CPU wallclock elapsed timer +class CpuTimer { + public: + /// Creates and starts a new timer + CpuTimer(); + + /// Returns elapsed time in milliseconds + float elapsedMilliseconds(); + + private: + struct timespec start_; +}; + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Transpose.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Transpose.cuh new file mode 100644 index 0000000000..c6137d9f0d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/Transpose.cuh @@ -0,0 +1,154 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +template +struct TensorInfo { + static constexpr int kMaxDims = 8; + + T* data; + IndexT sizes[kMaxDims]; + IndexT strides[kMaxDims]; + int dims; +}; + +template +struct TensorInfoOffset { + __device__ inline static unsigned int get(const TensorInfo& info, + IndexT linearId) { + IndexT offset = 0; + +#pragma unroll + for (int i = Dim - 1; i >= 0; --i) { + IndexT curDimIndex = linearId % info.sizes[i]; + IndexT curDimOffset = curDimIndex * info.strides[i]; + + offset += curDimOffset; + + if (i > 0) { + linearId /= info.sizes[i]; + } + } + + return offset; + } +}; + +template +struct TensorInfoOffset { + __device__ inline static unsigned int get(const TensorInfo& info, + IndexT linearId) { + return linearId; + } +}; + +template +TensorInfo getTensorInfo(const Tensor& t) { + TensorInfo info; + + for (int i = 0; i < Dim; ++i) { + info.sizes[i] = (IndexT) t.getSize(i); + info.strides[i] = (IndexT) t.getStride(i); + } + + info.data = t.data(); + info.dims = Dim; + + return info; +} + +template +__global__ void transposeAny(TensorInfo input, + TensorInfo output, + IndexT totalSize) { + for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; + i < totalSize; + i += gridDim.x + blockDim.x) { + auto inputOffset = TensorInfoOffset::get(input, i); + auto outputOffset = TensorInfoOffset::get(output, i); + +#if __CUDA_ARCH__ >= 350 + output.data[outputOffset] = __ldg(&input.data[inputOffset]); +#else + output.data[outputOffset] = input.data[inputOffset]; +#endif + } +} + +/// Performs an out-of-place transposition between any two dimensions. +/// Best performance is if the transposed dimensions are not +/// innermost, since the reads and writes will be coalesced. +/// Could include a shared memory transposition if the dimensions +/// being transposed are innermost, but would require support for +/// arbitrary rectangular matrices. 
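A short usage sketch for the two timers declared above (the include path and the async memset used as a stand-in for a kernel launch are assumptions): KernelTimer measures GPU-side time on the given stream between construction and the single allowed elapsedMilliseconds() call, while CpuTimer measures host wallclock time.

#include <faiss/gpu/utils/Timer.h>

float timeAsyncClearMs(void* devPtr, size_t bytes, cudaStream_t stream) {
  faiss::gpu::KernelTimer timer(stream);      // records the start event
  cudaMemsetAsync(devPtr, 0, bytes, stream);  // stand-in for a kernel launch
  return timer.elapsedMilliseconds();         // stop event + sync; one-shot
}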
+/// This linearized implementation seems to perform well enough, +/// especially for cases that we care about (outer dimension +/// transpositions). +template +void runTransposeAny(Tensor& in, + int dim1, int dim2, + Tensor& out, + cudaStream_t stream) { + static_assert(Dim <= TensorInfo::kMaxDims, + "too many dimensions"); + + FAISS_ASSERT(dim1 != dim2); + FAISS_ASSERT(dim1 < Dim && dim2 < Dim); + + int outSize[Dim]; + + for (int i = 0; i < Dim; ++i) { + outSize[i] = in.getSize(i); + } + + std::swap(outSize[dim1], outSize[dim2]); + + for (int i = 0; i < Dim; ++i) { + FAISS_ASSERT(out.getSize(i) == outSize[i]); + } + + size_t totalSize = in.numElements(); + size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize); + + if (totalSize <= (size_t) std::numeric_limits::max()) { + // div/mod seems faster with unsigned types + auto inInfo = getTensorInfo(in); + auto outInfo = getTensorInfo(out); + + std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); + std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); + + auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096); + + transposeAny + <<>>(inInfo, outInfo, totalSize); + } else { + auto inInfo = getTensorInfo(in); + auto outInfo = getTensorInfo(out); + + std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); + std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); + + auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096); + + transposeAny + <<>>(inInfo, outInfo, totalSize); + } + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectFloat.cu b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectFloat.cu new file mode 100644 index 0000000000..4a03ab1311 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectFloat.cu @@ -0,0 +1,94 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
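A minimal host-side sketch of calling runTransposeAny (the data pointers, sizes, and stream are assumed): the output tensor must already be allocated with the two chosen dimensions swapped, and the kernel above then copies elements through the linearized index mapping.

// transpose dims 0 and 1 of an [a][b][c] tensor into a [b][a][c] tensor
faiss::gpu::Tensor<float, 3, true> in(inData, {a, b, c});
faiss::gpu::Tensor<float, 3, true> out(outData, {b, a, c});
faiss::gpu::runTransposeAny(in, 0, 1, out, stream);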
+ */ + +#include +#include + +namespace faiss { namespace gpu { + +// warp Q to thread Q: +// 1, 1 +// 32, 2 +// 64, 3 +// 128, 3 +// 256, 4 +// 512, 8 +// 1024, 8 +// 2048, 8 + +WARP_SELECT_DECL(float, true, 1); +WARP_SELECT_DECL(float, true, 32); +WARP_SELECT_DECL(float, true, 64); +WARP_SELECT_DECL(float, true, 128); +WARP_SELECT_DECL(float, true, 256); +WARP_SELECT_DECL(float, true, 512); +WARP_SELECT_DECL(float, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_DECL(float, true, 2048); +#endif + +WARP_SELECT_DECL(float, false, 1); +WARP_SELECT_DECL(float, false, 32); +WARP_SELECT_DECL(float, false, 64); +WARP_SELECT_DECL(float, false, 128); +WARP_SELECT_DECL(float, false, 256); +WARP_SELECT_DECL(float, false, 512); +WARP_SELECT_DECL(float, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_DECL(float, false, 2048); +#endif + +void runWarpSelect(Tensor& in, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= 2048); + + if (dir) { + if (k == 1) { + WARP_SELECT_CALL(float, true, 1); + } else if (k <= 32) { + WARP_SELECT_CALL(float, true, 32); + } else if (k <= 64) { + WARP_SELECT_CALL(float, true, 64); + } else if (k <= 128) { + WARP_SELECT_CALL(float, true, 128); + } else if (k <= 256) { + WARP_SELECT_CALL(float, true, 256); + } else if (k <= 512) { + WARP_SELECT_CALL(float, true, 512); + } else if (k <= 1024) { + WARP_SELECT_CALL(float, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + WARP_SELECT_CALL(float, true, 2048); +#endif + } + } else { + if (k == 1) { + WARP_SELECT_CALL(float, false, 1); + } else if (k <= 32) { + WARP_SELECT_CALL(float, false, 32); + } else if (k <= 64) { + WARP_SELECT_CALL(float, false, 64); + } else if (k <= 128) { + WARP_SELECT_CALL(float, false, 128); + } else if (k <= 256) { + WARP_SELECT_CALL(float, false, 256); + } else if (k <= 512) { + WARP_SELECT_CALL(float, false, 512); + } else if (k <= 1024) { + WARP_SELECT_CALL(float, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + WARP_SELECT_CALL(float, false, 2048); +#endif + } + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu new file mode 100644 index 0000000000..54e10be1e5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu @@ -0,0 +1,94 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace faiss { namespace gpu { + +// warp Q to thread Q: +// 1, 1 +// 32, 2 +// 64, 3 +// 128, 3 +// 256, 4 +// 512, 8 +// 1024, 8 +// 2048, 8 + +WARP_SELECT_DECL(half, true, 1); +WARP_SELECT_DECL(half, true, 32); +WARP_SELECT_DECL(half, true, 64); +WARP_SELECT_DECL(half, true, 128); +WARP_SELECT_DECL(half, true, 256); +WARP_SELECT_DECL(half, true, 512); +WARP_SELECT_DECL(half, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_DECL(half, true, 2048); +#endif + +WARP_SELECT_DECL(half, false, 1); +WARP_SELECT_DECL(half, false, 32); +WARP_SELECT_DECL(half, false, 64); +WARP_SELECT_DECL(half, false, 128); +WARP_SELECT_DECL(half, false, 256); +WARP_SELECT_DECL(half, false, 512); +WARP_SELECT_DECL(half, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_DECL(half, false, 2048); +#endif + +void runWarpSelect(Tensor& in, + Tensor& outK, + Tensor& outV, + bool dir, int k, cudaStream_t stream) { + FAISS_ASSERT(k <= 1024); + + if (dir) { + if (k == 1) { + WARP_SELECT_CALL(half, true, 1); + } else if (k <= 32) { + WARP_SELECT_CALL(half, true, 32); + } else if (k <= 64) { + WARP_SELECT_CALL(half, true, 64); + } else if (k <= 128) { + WARP_SELECT_CALL(half, true, 128); + } else if (k <= 256) { + WARP_SELECT_CALL(half, true, 256); + } else if (k <= 512) { + WARP_SELECT_CALL(half, true, 512); + } else if (k <= 1024) { + WARP_SELECT_CALL(half, true, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + WARP_SELECT_CALL(half, true, 2048); +#endif + } + } else { + if (k == 1) { + WARP_SELECT_CALL(half, false, 1); + } else if (k <= 32) { + WARP_SELECT_CALL(half, false, 32); + } else if (k <= 64) { + WARP_SELECT_CALL(half, false, 64); + } else if (k <= 128) { + WARP_SELECT_CALL(half, false, 128); + } else if (k <= 256) { + WARP_SELECT_CALL(half, false, 256); + } else if (k <= 512) { + WARP_SELECT_CALL(half, false, 512); + } else if (k <= 1024) { + WARP_SELECT_CALL(half, false, 1024); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + WARP_SELECT_CALL(half, false, 2048); +#endif + } + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh new file mode 100644 index 0000000000..3c122e8861 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh @@ -0,0 +1,70 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
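A usage sketch for the dispatchers above (the tensor pointers and sizes are placeholders): k is matched against the next instantiated WARP_Q bucket, so for example k = 100 runs the 128-wide specialization, and dir selects whether the largest (true) or smallest (false) k values per row are kept.

// keep the 100 nearest (smallest) distances for each query row
faiss::gpu::Tensor<float, 2, true> distances(distPtr, {numQueries, numCandidates});
faiss::gpu::Tensor<float, 2, true> outDistances(outDistPtr, {numQueries, 100});
faiss::gpu::Tensor<int, 2, true> outIndices(outIdxPtr, {numQueries, 100});
faiss::gpu::runWarpSelect(distances, outDistances, outIndices,
                          /*dir=*/false, /*k=*/100, stream);

In the warpSelect kernel that follows, each warp owns one row: whole warps walk the row in lock-step up to the largest multiple of the warp size, and any remaining tail elements are pushed through addThreadQ before the final reduce/writeOut.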
+ */ + +#pragma once + +#include + +namespace faiss { namespace gpu { + +template +__global__ void warpSelect(Tensor in, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + WarpSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, k); + + int warpId = threadIdx.x / kWarpSize; + int row = blockIdx.x * kNumWarps + warpId; + + if (row >= in.getSize(0)) { + return; + } + + int i = getLaneId(); + K* inStart = in[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(in.getSize(1), kWarpSize); + + for (; i < limit; i += kWarpSize) { + heap.add(*inStart, (IndexType) i); + inStart += kWarpSize; + } + + // Handle non-warp multiple remainder + if (i < in.getSize(1)) { + heap.addThreadQ(*inStart, (IndexType) i); + } + + heap.reduce(); + heap.writeOut(outK[row].data(), + outV[row].data(), k); +} + +void runWarpSelect(Tensor& in, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +void runWarpSelect(Tensor& in, + Tensor& outKeys, + Tensor& outIndices, + bool dir, int k, cudaStream_t stream); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh b/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh new file mode 100644 index 0000000000..504c73f79a --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh @@ -0,0 +1,117 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include + +namespace faiss { namespace gpu { + +template +inline __device__ T shfl(const T val, + int srcLane, int width = kWarpSize) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(0xffffffff, val, srcLane, width); +#else + return __shfl(val, srcLane, width); +#endif +} + +// CUDA SDK does not provide specializations for T* +template +inline __device__ T* shfl(T* const val, + int srcLane, int width = kWarpSize) { + static_assert(sizeof(T*) == sizeof(long long), "pointer size"); + long long v = (long long) val; + + return (T*) shfl(v, srcLane, width); +} + +template +inline __device__ T shfl_up(const T val, + unsigned int delta, int width = kWarpSize) { +#if CUDA_VERSION >= 9000 + return __shfl_up_sync(0xffffffff, val, delta, width); +#else + return __shfl_up(val, delta, width); +#endif +} + +// CUDA SDK does not provide specializations for T* +template +inline __device__ T* shfl_up(T* const val, + unsigned int delta, int width = kWarpSize) { + static_assert(sizeof(T*) == sizeof(long long), "pointer size"); + long long v = (long long) val; + + return (T*) shfl_up(v, delta, width); +} + +template +inline __device__ T shfl_down(const T val, + unsigned int delta, int width = kWarpSize) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(0xffffffff, val, delta, width); +#else + return __shfl_down(val, delta, width); +#endif +} + +// CUDA SDK does not provide specializations for T* +template +inline __device__ T* shfl_down(T* const val, + unsigned int delta, int width = kWarpSize) { + static_assert(sizeof(T*) == sizeof(long long), "pointer size"); + long long v = (long long) val; + return (T*) shfl_down(v, delta, width); +} + +template +inline __device__ T shfl_xor(const T val, + int laneMask, int width = kWarpSize) { +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(0xffffffff, val, laneMask, width); +#else + return 
__shfl_xor(val, laneMask, width); +#endif +} + +// CUDA SDK does not provide specializations for T* +template +inline __device__ T* shfl_xor(T* const val, + int laneMask, int width = kWarpSize) { + static_assert(sizeof(T*) == sizeof(long long), "pointer size"); + long long v = (long long) val; + return (T*) shfl_xor(v, laneMask, width); +} + +// CUDA 9.0+ has half shuffle +#if CUDA_VERSION < 9000 +inline __device__ half shfl(half v, + int srcLane, int width = kWarpSize) { + unsigned int vu = v.x; + vu = __shfl(vu, srcLane, width); + + half h; + h.x = (unsigned short) vu; + return h; +} + +inline __device__ half shfl_xor(half v, + int laneMask, int width = kWarpSize) { + unsigned int vu = v.x; + vu = __shfl_xor(vu, laneMask, width); + + half h; + h.x = (unsigned short) vu; + return h; +} +#endif // CUDA_VERSION + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat1.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat1.cu new file mode 100644 index 0000000000..d53f4dc2aa --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat1.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 1, 1); +BLOCK_SELECT_IMPL(float, false, 1, 1); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat128.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat128.cu new file mode 100644 index 0000000000..2010034a18 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat128.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 128, 3); +BLOCK_SELECT_IMPL(float, false, 128, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat256.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat256.cu new file mode 100644 index 0000000000..bcd93f3038 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat256.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 256, 4); +BLOCK_SELECT_IMPL(float, false, 256, 4); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat32.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat32.cu new file mode 100644 index 0000000000..35073dcfcd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat32.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
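A small sketch of the kind of warp-level primitive these wrappers enable (the reduction itself is illustrative, not part of the diff); the wrapper picks the *_sync intrinsic on CUDA 9+ and the legacy form otherwise, so callers stay version-agnostic.

__device__ float warpAllReduceSum(float val) {
  // butterfly reduction over the 32 lanes of a warp
  for (int mask = 16; mask > 0; mask >>= 1) {
    val += faiss::gpu::shfl_xor(val, mask);
  }
  return val;  // every lane now holds the warp-wide sum
}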
+ */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 32, 2); +BLOCK_SELECT_IMPL(float, false, 32, 2); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat64.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat64.cu new file mode 100644 index 0000000000..c2671068ee --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloat64.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 64, 3); +BLOCK_SELECT_IMPL(float, false, 64, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF1024.cu new file mode 100644 index 0000000000..4c9c5188cb --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, false, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF2048.cu new file mode 100644 index 0000000000..7828c2045d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_IMPL(float, false, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF512.cu new file mode 100644 index 0000000000..f24ee0bfa6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatF512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, false, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT1024.cu new file mode 100644 index 0000000000..1f84b371e3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT2048.cu new file mode 100644 index 0000000000..48037838a9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_IMPL(float, true, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT512.cu new file mode 100644 index 0000000000..3c93edfc09 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectFloatT512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(float, true, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu new file mode 100644 index 0000000000..88f1d21b57 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 1, 1); +BLOCK_SELECT_IMPL(half, false, 1, 1); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu new file mode 100644 index 0000000000..b38c00b83e --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 128, 3); +BLOCK_SELECT_IMPL(half, false, 128, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu new file mode 100644 index 0000000000..2cea11ace2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 256, 4); +BLOCK_SELECT_IMPL(half, false, 256, 4); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu new file mode 100644 index 0000000000..6045a52fea --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 32, 2); +BLOCK_SELECT_IMPL(half, false, 32, 2); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu new file mode 100644 index 0000000000..ea4b0bf64b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 64, 3); +BLOCK_SELECT_IMPL(half, false, 64, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu new file mode 100644 index 0000000000..710e8c8460 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, false, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu new file mode 100644 index 0000000000..5f7f4d4f6b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_IMPL(half, false, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu new file mode 100644 index 0000000000..07ea1f9f6b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, false, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu new file mode 100644 index 0000000000..6dc37accf7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu new file mode 100644 index 0000000000..dd38b8d6a5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +BLOCK_SELECT_IMPL(half, true, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu new file mode 100644 index 0000000000..ff2a9903fa --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +BLOCK_SELECT_IMPL(half, true, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh new file mode 100644 index 0000000000..fe50488e5f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh @@ -0,0 +1,94 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ + extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream); \ + \ + extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& inK, \ + Tensor& inV, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) + +#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ + void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) { \ + FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \ + FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \ + FAISS_ASSERT(outK.getSize(1) == k); \ + FAISS_ASSERT(outV.getSize(1) == k); \ + \ + auto grid = dim3(in.getSize(0)); \ + \ + constexpr int kBlockSelectNumThreads = (WARP_Q <= 1024) ? 128 : 64; \ + auto block = dim3(kBlockSelectNumThreads); \ + \ + FAISS_ASSERT(k <= WARP_Q); \ + FAISS_ASSERT(dir == DIR); \ + \ + auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ + auto vInit = -1; \ + \ + blockSelect \ + <<>>(in, outK, outV, kInit, vInit, k); \ + CUDA_TEST_ERROR(); \ + } \ + \ + void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& inK, \ + Tensor& inV, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) { \ + FAISS_ASSERT(inK.isSameSize(inV)); \ + FAISS_ASSERT(outK.isSameSize(outV)); \ + \ + auto grid = dim3(inK.getSize(0)); \ + \ + constexpr int kBlockSelectNumThreads = (WARP_Q <= 1024) ? 128 : 64; \ + auto block = dim3(kBlockSelectNumThreads); \ + \ + FAISS_ASSERT(k <= WARP_Q); \ + FAISS_ASSERT(dir == DIR); \ + \ + auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ + auto vInit = -1; \ + \ + blockSelectPair \ + <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + CUDA_TEST_ERROR(); \ + } + + +#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ + runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + in, outK, outV, dir, k, stream) + +#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \ + runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + inK, inV, outK, outV, dir, k, stream) diff --git a/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cu b/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cu new file mode 100644 index 0000000000..97364cb512 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cu @@ -0,0 +1,165 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// from Nvidia cuDNN library samples; modified to compile within faiss + +#include + +namespace faiss { namespace gpu { + +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
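For reference, a sketch of what one declaration expands to once the token pasting is applied (the parameter types are reconstructed from the upstream header, so treat them as an assumption): BLOCK_SELECT_DECL(float, true, 32) yields

extern void runBlockSelect_float_true_32_(
    Tensor<float, 2, true>& in,
    Tensor<float, 2, true>& outK,
    Tensor<int, 2, true>& outV,
    bool dir, int k, cudaStream_t stream);

and the matching BLOCK_SELECT_IMPL(float, true, 32, 2) in BlockSelectFloat32.cu provides its definition, so each (type, direction, WARP_Q) bucket compiles in its own translation unit.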
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +// Host functions for converting between FP32 and FP16 formats +// Paulius Micikevicius (pauliusm@nvidia.com) + +half1 cpu_float2half_rn(float f) +{ + half1 ret; + + union { + float f; + unsigned u; + } un; + + un.f = f; + + unsigned x = un.u; + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + ret.x = 0x7fffU; + return ret; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + ret.x = sign | 0x7c00U; + return ret; + } + if (u < 0x33000001) { + ret.x = (sign | 0x0000); + return ret; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + ret.x = (sign | (exponent << 10) | mantissa); + + return ret; +} + + +float cpu_half2float(half1 h) +{ + unsigned sign = ((h.x >> 15) & 1); + unsigned exponent = ((h.x >> 10) & 0x1f); + unsigned mantissa = ((h.x & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + union { + int i; + float f; + } un; + + un.i = ((sign << 31) | (exponent << 23) | mantissa); + + return un.f; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cuh b/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cuh new file mode 100644 index 0000000000..e59ed8565d --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/nvidia/fp16_emu.cuh @@ -0,0 +1,118 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// from Nvidia cuDNN library samples; modified to compile within faiss + +#pragma once + +namespace faiss { namespace gpu { + +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
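A tiny sketch of the host-side conversion helpers defined above: values representable in fp16 round-trip exactly, while magnitudes too small to round to the nearest fp16 subnormal are flushed to signed zero by the u < 0x33000001 branch.

half1 h    = cpu_float2half_rn(1.5f);   // 1.5f is exactly representable
float back = cpu_half2float(h);         // back == 1.5f
half1 tiny = cpu_float2half_rn(1e-8f);  // too small for fp16: becomes +0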
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +// Conversion from/to 16-bit floating point (half-precision). + +#define HLF_EPSILON 4.887581E-04 + +typedef struct __align__(2) { + unsigned short x; +} half1; + +half1 cpu_float2half_rn(float f); + +float cpu_half2float(half1 h); + +static __inline__ __device__ __host__ half1 habs(half1 h) +{ + h.x &= 0x7fffU; + return h; +} + +static __inline__ __device__ __host__ half1 hneg(half1 h) +{ + h.x ^= 0x8000U; + return h; +} + +static __inline__ __device__ __host__ int ishnan(half1 h) +{ + // When input is NaN, exponent is all ones and mantissa is non-zero. + return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) != 0; +} + +static __inline__ __device__ __host__ int ishinf(half1 h) +{ + // When input is +/- inf, exponent is all ones and mantissa is zero. + return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) == 0; +} + +static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) +{ + return ishnan(x) == 0 && ishnan(y) == 0 && x.x == y.x; +} + +static __inline__ __device__ __host__ half1 hzero() +{ + half1 ret; + ret.x = 0x0000U; + return ret; +} + +static __inline__ __device__ __host__ half1 hone() +{ + half1 ret; + ret.x = 0x3c00U; + return ret; +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat1.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat1.cu new file mode 100644 index 0000000000..c641e50fdd --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat1.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 1, 1); +WARP_SELECT_IMPL(float, false, 1, 1); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat128.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat128.cu new file mode 100644 index 0000000000..76d98d1f20 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat128.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 128, 3); +WARP_SELECT_IMPL(float, false, 128, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat256.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat256.cu new file mode 100644 index 0000000000..a0dd47feb1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat256.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
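A quick sketch showing that the helpers above work directly on the fp16 bit pattern, with no float conversion involved:

half1 one      = hone();        // 0x3c00 == +1.0
half1 minusOne = hneg(one);     // flips the sign bit: 0xbc00 == -1.0
int   nan      = ishnan(one);   // 0: exponent is not all ones
int   equal    = ishequ(one, hneg(minusOne));  // 1: negating -1.0 gives +1.0 again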
+ */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 256, 4); +WARP_SELECT_IMPL(float, false, 256, 4); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat32.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat32.cu new file mode 100644 index 0000000000..2461c94857 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat32.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 32, 2); +WARP_SELECT_IMPL(float, false, 32, 2); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat64.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat64.cu new file mode 100644 index 0000000000..a16c3830ca --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloat64.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 64, 3); +WARP_SELECT_IMPL(float, false, 64, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF1024.cu new file mode 100644 index 0000000000..9effd9ee75 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, false, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF2048.cu new file mode 100644 index 0000000000..3abc7e61f8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_IMPL(float, false, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF512.cu new file mode 100644 index 0000000000..0d92dc0361 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatF512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, false, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT1024.cu new file mode 100644 index 0000000000..caae455f26 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT2048.cu new file mode 100644 index 0000000000..b7cb048461 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_IMPL(float, true, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT512.cu new file mode 100644 index 0000000000..c8de86a237 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectFloatT512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(float, true, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu new file mode 100644 index 0000000000..79876207f7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 1, 1); +WARP_SELECT_IMPL(half, false, 1, 1); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu new file mode 100644 index 0000000000..150c9507da --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 128, 3); +WARP_SELECT_IMPL(half, false, 128, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu new file mode 100644 index 0000000000..cd8b49b18f --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 256, 4); +WARP_SELECT_IMPL(half, false, 256, 4); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu new file mode 100644 index 0000000000..ce1b7e4c74 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 32, 2); +WARP_SELECT_IMPL(half, false, 32, 2); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu new file mode 100644 index 0000000000..9d4311ec01 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu @@ -0,0 +1,15 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 64, 3); +WARP_SELECT_IMPL(half, false, 64, 3); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu new file mode 100644 index 0000000000..0241300141 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, false, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu new file mode 100644 index 0000000000..1a16ee45c9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_IMPL(half, false, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu new file mode 100644 index 0000000000..4cb138837b --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, false, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu new file mode 100644 index 0000000000..6a95007ff8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 1024, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu new file mode 100644 index 0000000000..94586d0100 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu @@ -0,0 +1,17 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace faiss { namespace gpu { + +#if GPU_MAX_SELECTION_K >= 2048 +WARP_SELECT_IMPL(half, true, 2048, 8); +#endif + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu new file mode 100644 index 0000000000..6ca08a16ab --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace faiss { namespace gpu { + +WARP_SELECT_IMPL(half, true, 512, 8); + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh new file mode 100644 index 0000000000..eee8ef0d5c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh @@ -0,0 +1,47 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
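 *
 * Note on layout: each WarpSelectFloat*.cu / WarpSelectHalf*.cu translation
 * unit in this directory instantiates a single (type, direction, WARP_Q)
 * specialization through the WARP_SELECT_IMPL macro defined below, while
 * WARP_SELECT_DECL and WARP_SELECT_CALL give the matching extern declaration
 * and call site. Keeping one specialization per file presumably keeps each
 * CUDA compilation unit small and lets the build compile them in parallel.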
+ */ + +#include +#include + +#define WARP_SELECT_DECL(TYPE, DIR, WARP_Q) \ + extern void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) + +#define WARP_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ + void runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) { \ + \ + constexpr int kWarpSelectNumThreads = 128; \ + auto grid = dim3(utils::divUp(in.getSize(0), \ + (kWarpSelectNumThreads / kWarpSize))); \ + auto block = dim3(kWarpSelectNumThreads); \ + \ + FAISS_ASSERT(k <= WARP_Q); \ + FAISS_ASSERT(dir == DIR); \ + \ + auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ + auto vInit = -1; \ + \ + warpSelect \ + <<>>(in, outK, outV, kInit, vInit, k); \ + CUDA_TEST_ERROR(); \ + } + +#define WARP_SELECT_CALL(TYPE, DIR, WARP_Q) \ + runWarpSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + in, outK, outV, dir, k, stream) diff --git a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp new file mode 100644 index 0000000000..2d7a9269d6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp @@ -0,0 +1,305 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include + +#include + + +namespace faiss { + + +/*********************************************************************** + * RangeSearchResult + ***********************************************************************/ + +RangeSearchResult::RangeSearchResult (idx_t nq, bool alloc_lims): nq (nq) { + if (alloc_lims) { + lims = new size_t [nq + 1]; + memset (lims, 0, sizeof(*lims) * (nq + 1)); + } else { + lims = nullptr; + } + labels = nullptr; + distances = nullptr; + buffer_size = 1024 * 256; +} + +/// called when lims contains the nb of elements result entries +/// for each query +void RangeSearchResult::do_allocation () { + size_t ofs = 0; + for (int i = 0; i < nq; i++) { + size_t n = lims[i]; + lims [i] = ofs; + ofs += n; + } + lims [nq] = ofs; + labels = new idx_t [ofs]; + distances = new float [ofs]; +} + +RangeSearchResult::~RangeSearchResult () { + delete [] labels; + delete [] distances; + delete [] lims; +} + + + + + +/*********************************************************************** + * BufferList + ***********************************************************************/ + + +BufferList::BufferList (size_t buffer_size): + buffer_size (buffer_size) +{ + wp = buffer_size; +} + +BufferList::~BufferList () +{ + for (int i = 0; i < buffers.size(); i++) { + delete [] buffers[i].ids; + delete [] buffers[i].dis; + } +} + +void BufferList::add (idx_t id, float dis) { + if (wp == buffer_size) { // need new buffer + append_buffer(); + } + Buffer & buf = buffers.back(); + buf.ids [wp] = id; + buf.dis [wp] = dis; + wp++; +} + + +void BufferList::append_buffer () +{ + Buffer buf = {new idx_t [buffer_size], new float [buffer_size]}; + buffers.push_back (buf); + wp = 0; +} + +/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to +/// tables dest_ids, dest_dis +void BufferList::copy_range (size_t ofs, size_t n, + idx_t * dest_ids, float *dest_dis) +{ + size_t bno = ofs / buffer_size; + ofs -= bno * buffer_size; + while (n > 0) { + size_t ncopy = ofs 
+ n < buffer_size ? n : buffer_size - ofs; + Buffer buf = buffers [bno]; + memcpy (dest_ids, buf.ids + ofs, ncopy * sizeof(*dest_ids)); + memcpy (dest_dis, buf.dis + ofs, ncopy * sizeof(*dest_dis)); + dest_ids += ncopy; + dest_dis += ncopy; + ofs = 0; + bno ++; + n -= ncopy; + } +} + + +/*********************************************************************** + * RangeSearchPartialResult + ***********************************************************************/ + +void RangeQueryResult::add (float dis, idx_t id) { + nres++; + pres->add (id, dis); +} + + + +RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in): + BufferList(res_in->buffer_size), + res(res_in) +{} + + +/// begin a new result +RangeQueryResult & + RangeSearchPartialResult::new_result (idx_t qno) +{ + RangeQueryResult qres = {qno, 0, this}; + queries.push_back (qres); + return queries.back(); +} + + +void RangeSearchPartialResult::finalize () +{ + set_lims (); +#pragma omp barrier + +#pragma omp single + res->do_allocation (); + +#pragma omp barrier + copy_result (); +} + + +/// called by range_search before do_allocation +void RangeSearchPartialResult::set_lims () +{ + for (int i = 0; i < queries.size(); i++) { + RangeQueryResult & qres = queries[i]; + res->lims[qres.qno] = qres.nres; + } +} + +/// called by range_search after do_allocation +void RangeSearchPartialResult::copy_result (bool incremental) +{ + size_t ofs = 0; + for (int i = 0; i < queries.size(); i++) { + RangeQueryResult & qres = queries[i]; + + copy_range (ofs, qres.nres, + res->labels + res->lims[qres.qno], + res->distances + res->lims[qres.qno]); + if (incremental) { + res->lims[qres.qno] += qres.nres; + } + ofs += qres.nres; + } +} + +void RangeSearchPartialResult::merge (std::vector & + partial_results, bool do_delete) +{ + + int npres = partial_results.size(); + if (npres == 0) return; + RangeSearchResult *result = partial_results[0]->res; + size_t nx = result->nq; + + // count + for (const RangeSearchPartialResult * pres : partial_results) { + if (!pres) continue; + for (const RangeQueryResult &qres : pres->queries) { + result->lims[qres.qno] += qres.nres; + } + } + result->do_allocation (); + for (int j = 0; j < npres; j++) { + if (!partial_results[j]) continue; + partial_results[j]->copy_result (true); + if (do_delete) { + delete partial_results[j]; + partial_results[j] = nullptr; + } + } + + // reset the limits + for (size_t i = nx; i > 0; i--) { + result->lims [i] = result->lims [i - 1]; + } + result->lims [0] = 0; +} + +/*********************************************************************** + * IDSelectorRange + ***********************************************************************/ + +IDSelectorRange::IDSelectorRange (idx_t imin, idx_t imax): + imin (imin), imax (imax) +{ +} + +bool IDSelectorRange::is_member (idx_t id) const +{ + return id >= imin && id < imax; +} + + +/*********************************************************************** + * IDSelectorBatch + ***********************************************************************/ + +IDSelectorBatch::IDSelectorBatch (size_t n, const idx_t *indices) +{ + nbits = 0; + while (n > (1L << nbits)) nbits++; + nbits += 5; + // for n = 1M, nbits = 25 is optimal, see P56659518 + + mask = (1L << nbits) - 1; + bloom.resize (1UL << (nbits - 3), 0); + for (long i = 0; i < n; i++) { + Index::idx_t id = indices[i]; + set.insert(id); + id &= mask; + bloom[id >> 3] |= 1 << (id & 7); + } +} + +bool IDSelectorBatch::is_member (idx_t i) const +{ + long im = i & mask; + if(!(bloom[im>>3] & 
(1 << (im & 7)))) { + return 0; + } + return set.count(i); +} + + +/*********************************************************** + * Interrupt callback + ***********************************************************/ + + +std::unique_ptr InterruptCallback::instance; + +std::mutex InterruptCallback::lock; + +void InterruptCallback::clear_instance () { + delete instance.release (); +} + +void InterruptCallback::check () { + if (!instance.get()) { + return; + } + if (instance->want_interrupt ()) { + FAISS_THROW_MSG ("computation interrupted"); + } +} + +bool InterruptCallback::is_interrupted () { + if (!instance.get()) { + return false; + } + std::lock_guard guard(lock); + return instance->want_interrupt(); +} + + +size_t InterruptCallback::get_period_hint (size_t flops) { + if (!instance.get()) { + return 1L << 30; // never check + } + // for 10M flops, it is reasonable to check once every 10 iterations + return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1); +} + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h new file mode 100644 index 0000000000..fee0026a78 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h @@ -0,0 +1,246 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +// Auxiliary index structures, that are used in indexes but that can +// be forward-declared + +#ifndef FAISS_AUX_INDEX_STRUCTURES_H +#define FAISS_AUX_INDEX_STRUCTURES_H + +#include + +#include +#include +#include +#include + +#include + +namespace faiss { + +/** The objective is to have a simple result structure while + * minimizing the number of mem copies in the result. The method + * do_allocation can be overloaded to allocate the result tables in + * the matrix type of a scripting language like Lua or Python. */ +struct RangeSearchResult { + size_t nq; ///< nb of queries + size_t *lims; ///< size (nq + 1) + + typedef Index::idx_t idx_t; + + idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]] + float *distances; ///< corresponding distances (not sorted) + + size_t buffer_size; ///< size of the result buffers used + + /// lims must be allocated on input to range_search. + explicit RangeSearchResult (idx_t nq, bool alloc_lims=true); + + /// called when lims contains the nb of elements result entries + /// for each query + + virtual void do_allocation (); + + virtual ~RangeSearchResult (); +}; + + +/** + + Encapsulates a set of ids to remove. */ +struct IDSelector { + typedef Index::idx_t idx_t; + virtual bool is_member (idx_t id) const = 0; + virtual ~IDSelector() {} +}; + + + +/** remove ids between [imni, imax) */ +struct IDSelectorRange: IDSelector { + idx_t imin, imax; + + IDSelectorRange (idx_t imin, idx_t imax); + bool is_member(idx_t id) const override; + ~IDSelectorRange() override {} +}; + + +/** Remove ids from a set. Repetitions of ids in the indices set + * passed to the constructor does not hurt performance. The hash + * function used for the bloom filter and GCC's implementation of + * unordered_set are just the least significant bits of the id. 
This + * works fine for random ids or ids in sequences but will produce many + * hash collisions if lsb's are always the same */ +struct IDSelectorBatch: IDSelector { + + std::unordered_set set; + + typedef unsigned char uint8_t; + std::vector bloom; // assumes low bits of id are a good hash value + int nbits; + idx_t mask; + + IDSelectorBatch (size_t n, const idx_t *indices); + bool is_member(idx_t id) const override; + ~IDSelectorBatch() override {} +}; + +/**************************************************************** + * Result structures for range search. + * + * The main constraint here is that we want to support parallel + * queries from different threads in various ways: 1 thread per query, + * several threads per query. We store the actual results in blocks of + * fixed size rather than exponentially increasing memory. At the end, + * we copy the block content to a linear result array. + *****************************************************************/ + +/** List of temporary buffers used to store results before they are + * copied to the RangeSearchResult object. */ +struct BufferList { + typedef Index::idx_t idx_t; + + // buffer sizes in # entries + size_t buffer_size; + + struct Buffer { + idx_t *ids; + float *dis; + }; + + std::vector buffers; + size_t wp; ///< write pointer in the last buffer. + + explicit BufferList (size_t buffer_size); + + ~BufferList (); + + /// create a new buffer + void append_buffer (); + + /// add one result, possibly appending a new buffer if needed + void add (idx_t id, float dis); + + /// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to + /// tables dest_ids, dest_dis + void copy_range (size_t ofs, size_t n, + idx_t * dest_ids, float *dest_dis); + +}; + +struct RangeSearchPartialResult; + +/// result structure for a single query +struct RangeQueryResult { + using idx_t = Index::idx_t; + idx_t qno; //< id of the query + size_t nres; //< nb of results for this query + RangeSearchPartialResult * pres; + + /// called by search function to report a new result + void add (float dis, idx_t id); +}; + +/// the entries in the buffers are split per query +struct RangeSearchPartialResult: BufferList { + RangeSearchResult * res; + + /// eventually the result will be stored in res_in + explicit RangeSearchPartialResult (RangeSearchResult * res_in); + + /// query ids + nb of results per query. + std::vector queries; + + /// begin a new result + RangeQueryResult & new_result (idx_t qno); + + /***************************************** + * functions used at the end of the search to merge the result + * lists */ + void finalize (); + + /// called by range_search before do_allocation + void set_lims (); + + /// called by range_search after do_allocation + void copy_result (bool incremental = false); + + /// merge a set of PartialResult's into one RangeSearchResult + /// on ouptut the partialresults are empty! + static void merge (std::vector & + partial_results, bool do_delete=true); + +}; + + +/*********************************************************** + * The distance computer maintains a current query and computes + * distances to elements in an index that supports random access. + * + * The DistanceComputer is not intended to be thread-safe (eg. because + * it maintains counters) so the distance functions are not const, + * instanciate one from each thread if needed. 
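 *
 * Illustrative usage sketch (the names `dc`, `query` and `ntotal` below are
 * hypothetical placeholders, not defined in this header):
 *
 *     dc->set_query(query);               // register the current query vector
 *     for (idx_t i = 0; i < ntotal; i++) {
 *         float d = (*dc)(i);             // distance from the query to stored vector i
 *     }
 *     float d01 = dc->symmetric_dis(0, 1); // distance between two stored vectors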
+ ***********************************************************/ +struct DistanceComputer { + using idx_t = Index::idx_t; + + /// called before computing distances + virtual void set_query(const float *x) = 0; + + /// compute distance of vector i to current query + virtual float operator () (idx_t i) = 0; + + /// compute distance between two stored vectors + virtual float symmetric_dis (idx_t i, idx_t j) = 0; + + virtual ~DistanceComputer() {} +}; + +/*********************************************************** + * Interrupt callback + ***********************************************************/ + +struct InterruptCallback { + virtual bool want_interrupt () = 0; + virtual ~InterruptCallback() {} + + // lock that protects concurrent calls to is_interrupted + static std::mutex lock; + + static std::unique_ptr instance; + + static void clear_instance (); + + /** check if: + * - an interrupt callback is set + * - the callback retuns true + * if this is the case, then throw an exception. Should not be called + * from multiple threds. + */ + static void check (); + + /// same as check() but return true if is interrupted instead of + /// throwing. Can be called from multiple threads. + static bool is_interrupted (); + + /** assuming each iteration takes a certain number of flops, what + * is a reasonable interval to check for interrupts? + */ + static size_t get_period_hint (size_t flops); + +}; + + + +}; // namespace faiss + + + +#endif diff --git a/core/src/index/thirdparty/faiss/impl/FaissAssert.h b/core/src/index/thirdparty/faiss/impl/FaissAssert.h new file mode 100644 index 0000000000..f906589d46 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/FaissAssert.h @@ -0,0 +1,95 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_ASSERT_INCLUDED +#define FAISS_ASSERT_INCLUDED + +#include +#include +#include +#include + +/// +/// Assertions +/// + +#define FAISS_ASSERT(X) \ + do { \ + if (! (X)) { \ + fprintf(stderr, "Faiss assertion '%s' failed in %s " \ + "at %s:%d\n", \ + #X, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ + abort(); \ + } \ + } while (false) + +#define FAISS_ASSERT_MSG(X, MSG) \ + do { \ + if (! (X)) { \ + fprintf(stderr, "Faiss assertion '%s' failed in %s " \ + "at %s:%d; details: " MSG "\n", \ + #X, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ + abort(); \ + } \ + } while (false) + +#define FAISS_ASSERT_FMT(X, FMT, ...) \ + do { \ + if (! (X)) { \ + fprintf(stderr, "Faiss assertion '%s' failed in %s " \ + "at %s:%d; details: " FMT "\n", \ + #X, __PRETTY_FUNCTION__, __FILE__, __LINE__, __VA_ARGS__); \ + abort(); \ + } \ + } while (false) + +/// +/// Exceptions for returning user errors +/// + +#define FAISS_THROW_MSG(MSG) \ + do { \ + throw faiss::FaissException(MSG, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ + } while (false) + +#define FAISS_THROW_FMT(FMT, ...) 
\ + do { \ + std::string __s; \ + int __size = snprintf(nullptr, 0, FMT, __VA_ARGS__); \ + __s.resize(__size + 1); \ + snprintf(&__s[0], __s.size(), FMT, __VA_ARGS__); \ + throw faiss::FaissException(__s, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ + } while (false) + +/// +/// Exceptions thrown upon a conditional failure +/// + +#define FAISS_THROW_IF_NOT(X) \ + do { \ + if (!(X)) { \ + FAISS_THROW_FMT("Error: '%s' failed", #X); \ + } \ + } while (false) + +#define FAISS_THROW_IF_NOT_MSG(X, MSG) \ + do { \ + if (!(X)) { \ + FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \ + } \ + } while (false) + +#define FAISS_THROW_IF_NOT_FMT(X, FMT, ...) \ + do { \ + if (!(X)) { \ + FAISS_THROW_FMT("Error: '%s' failed: " FMT, #X, __VA_ARGS__); \ + } \ + } while (false) + +#endif diff --git a/core/src/index/thirdparty/faiss/impl/FaissException.cpp b/core/src/index/thirdparty/faiss/impl/FaissException.cpp new file mode 100644 index 0000000000..c79930e55e --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/FaissException.cpp @@ -0,0 +1,66 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include +#include + +namespace faiss { + +FaissException::FaissException(const std::string& m) + : msg(m) { +} + +FaissException::FaissException(const std::string& m, + const char* funcName, + const char* file, + int line) { + int size = snprintf(nullptr, 0, "Error in %s at %s:%d: %s", + funcName, file, line, m.c_str()); + msg.resize(size + 1); + snprintf(&msg[0], msg.size(), "Error in %s at %s:%d: %s", + funcName, file, line, m.c_str()); +} + +const char* +FaissException::what() const noexcept { + return msg.c_str(); +} + +void handleExceptions( + std::vector>& exceptions) { + if (exceptions.size() == 1) { + // throw the single received exception directly + std::rethrow_exception(exceptions.front().second); + + } else if (exceptions.size() > 1) { + // multiple exceptions; aggregate them and return a single exception + std::stringstream ss; + + for (auto& p : exceptions) { + try { + std::rethrow_exception(p.second); + } catch (std::exception& ex) { + if (ex.what()) { + // exception message available + ss << "Exception thrown from index " << p.first << ": " + << ex.what() << "\n"; + } else { + // No message available + ss << "Unknown exception thrown from index " << p.first << "\n"; + } + } catch (...) { + ss << "Unknown exception thrown from index " << p.first << "\n"; + } + } + + throw FaissException(ss.str()); + } +} + +} diff --git a/core/src/index/thirdparty/faiss/impl/FaissException.h b/core/src/index/thirdparty/faiss/impl/FaissException.h new file mode 100644 index 0000000000..9d54edbad5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/FaissException.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#ifndef FAISS_EXCEPTION_INCLUDED +#define FAISS_EXCEPTION_INCLUDED + +#include +#include +#include +#include + +namespace faiss { + +/// Base class for Faiss exceptions +class FaissException : public std::exception { + public: + explicit FaissException(const std::string& msg); + + FaissException(const std::string& msg, + const char* funcName, + const char* file, + int line); + + /// from std::exception + const char* what() const noexcept override; + + std::string msg; +}; + +/// Handle multiple exceptions from worker threads, throwing an appropriate +/// exception that aggregates the information +/// The pair int is the thread that generated the exception +void +handleExceptions(std::vector>& exceptions); + +/** bare-bones unique_ptr + * this one deletes with delete [] */ +template +struct ScopeDeleter { + const T * ptr; + explicit ScopeDeleter (const T* ptr = nullptr): ptr (ptr) {} + void release () {ptr = nullptr; } + void set (const T * ptr_in) { ptr = ptr_in; } + void swap (ScopeDeleter &other) {std::swap (ptr, other.ptr); } + ~ScopeDeleter () { + delete [] ptr; + } +}; + +/** same but deletes with the simple delete (least common case) */ +template +struct ScopeDeleter1 { + const T * ptr; + explicit ScopeDeleter1 (const T* ptr = nullptr): ptr (ptr) {} + void release () {ptr = nullptr; } + void set (const T * ptr_in) { ptr = ptr_in; } + void swap (ScopeDeleter1 &other) {std::swap (ptr, other.ptr); } + ~ScopeDeleter1 () { + delete ptr; + } +}; + +} + +#endif diff --git a/core/src/index/thirdparty/faiss/impl/HNSW.cpp b/core/src/index/thirdparty/faiss/impl/HNSW.cpp new file mode 100644 index 0000000000..58d113e3f4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/HNSW.cpp @@ -0,0 +1,818 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include + +#include + +namespace faiss { + +using idx_t = Index::idx_t; + +/************************************************************** + * HNSW structure implementation + **************************************************************/ + +int HNSW::nb_neighbors(int layer_no) const +{ + return cum_nneighbor_per_level[layer_no + 1] - + cum_nneighbor_per_level[layer_no]; +} + +void HNSW::set_nb_neighbors(int level_no, int n) +{ + FAISS_THROW_IF_NOT(levels.size() == 0); + int cur_n = nb_neighbors(level_no); + for (int i = level_no + 1; i < cum_nneighbor_per_level.size(); i++) { + cum_nneighbor_per_level[i] += n - cur_n; + } +} + +int HNSW::cum_nb_neighbors(int layer_no) const +{ + return cum_nneighbor_per_level[layer_no]; +} + +void HNSW::neighbor_range(idx_t no, int layer_no, + size_t * begin, size_t * end) const +{ + size_t o = offsets[no]; + *begin = o + cum_nb_neighbors(layer_no); + *end = o + cum_nb_neighbors(layer_no + 1); +} + + + +HNSW::HNSW(int M) : rng(12345) { + set_default_probas(M, 1.0 / log(M)); + max_level = -1; + entry_point = -1; + efSearch = 16; + efConstruction = 40; + upper_beam = 1; + offsets.push_back(0); +} + + +int HNSW::random_level() +{ + double f = rng.rand_float(); + // could be a bit faster with bissection + for (int level = 0; level < assign_probas.size(); level++) { + if (f < assign_probas[level]) { + return level; + } + f -= assign_probas[level]; + } + // happens with exponentially low probability + return assign_probas.size() - 1; +} + +void HNSW::set_default_probas(int M, float levelMult) +{ + int nn = 0; + cum_nneighbor_per_level.push_back (0); + for (int level = 0; ;level++) { + float proba = exp(-level / levelMult) * (1 - exp(-1 / levelMult)); + if (proba < 1e-9) break; + assign_probas.push_back(proba); + nn += level == 0 ? 
M * 2 : M; + cum_nneighbor_per_level.push_back (nn); + } +} + +void HNSW::clear_neighbor_tables(int level) +{ + for (int i = 0; i < levels.size(); i++) { + size_t begin, end; + neighbor_range(i, level, &begin, &end); + for (size_t j = begin; j < end; j++) { + neighbors[j] = -1; + } + } +} + + +void HNSW::reset() { + max_level = -1; + entry_point = -1; + offsets.clear(); + offsets.push_back(0); + levels.clear(); + neighbors.clear(); +} + + + +void HNSW::print_neighbor_stats(int level) const +{ + FAISS_THROW_IF_NOT (level < cum_nneighbor_per_level.size()); + printf("stats on level %d, max %d neighbors per vertex:\n", + level, nb_neighbors(level)); + size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0; +#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \ + reduction(+: tot_reciprocal) reduction(+: n_node) + for (int i = 0; i < levels.size(); i++) { + if (levels[i] > level) { + n_node++; + size_t begin, end; + neighbor_range(i, level, &begin, &end); + std::unordered_set neighset; + for (size_t j = begin; j < end; j++) { + if (neighbors [j] < 0) break; + neighset.insert(neighbors[j]); + } + int n_neigh = neighset.size(); + int n_common = 0; + int n_reciprocal = 0; + for (size_t j = begin; j < end; j++) { + storage_idx_t i2 = neighbors[j]; + if (i2 < 0) break; + FAISS_ASSERT(i2 != i); + size_t begin2, end2; + neighbor_range(i2, level, &begin2, &end2); + for (size_t j2 = begin2; j2 < end2; j2++) { + storage_idx_t i3 = neighbors[j2]; + if (i3 < 0) break; + if (i3 == i) { + n_reciprocal++; + continue; + } + if (neighset.count(i3)) { + neighset.erase(i3); + n_common++; + } + } + } + tot_neigh += n_neigh; + tot_common += n_common; + tot_reciprocal += n_reciprocal; + } + } + float normalizer = n_node; + printf(" nb of nodes at that level %ld\n", n_node); + printf(" neighbors per node: %.2f (%ld)\n", + tot_neigh / normalizer, tot_neigh); + printf(" nb of reciprocal neighbors: %.2f\n", tot_reciprocal / normalizer); + printf(" nb of neighbors that are also neighbor-of-neighbors: %.2f (%ld)\n", + tot_common / normalizer, tot_common); + + + +} + + +void HNSW::fill_with_random_links(size_t n) +{ + int max_level = prepare_level_tab(n); + RandomGenerator rng2(456); + + for (int level = max_level - 1; level >= 0; --level) { + std::vector elts; + for (int i = 0; i < n; i++) { + if (levels[i] > level) { + elts.push_back(i); + } + } + printf ("linking %ld elements in level %d\n", + elts.size(), level); + + if (elts.size() == 1) continue; + + for (int ii = 0; ii < elts.size(); ii++) { + int i = elts[ii]; + size_t begin, end; + neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + int other = 0; + do { + other = elts[rng2.rand_int(elts.size())]; + } while(other == i); + + neighbors[j] = other; + } + } + } +} + + +int HNSW::prepare_level_tab(size_t n, bool preset_levels) +{ + size_t n0 = offsets.size() - 1; + + if (preset_levels) { + FAISS_ASSERT (n0 + n == levels.size()); + } else { + FAISS_ASSERT (n0 == levels.size()); + for (int i = 0; i < n; i++) { + int pt_level = random_level(); + levels.push_back(pt_level + 1); + } + } + + int max_level = 0; + for (int i = 0; i < n; i++) { + int pt_level = levels[i + n0] - 1; + if (pt_level > max_level) max_level = pt_level; + offsets.push_back(offsets.back() + + cum_nb_neighbors(pt_level + 1)); + neighbors.resize(offsets.back(), -1); + } + + return max_level; +} + + +/** Enumerate vertices from farthest to nearest from query, keep a + * neighbor only if there is no previous neighbor that is closer to + * that 
vertex than the query. + */ +void HNSW::shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& input, + std::vector& output, + int max_size) +{ + while (input.size() > 0) { + NodeDistFarther v1 = input.top(); + input.pop(); + float dist_v1_q = v1.d; + + bool good = true; + for (NodeDistFarther v2 : output) { + float dist_v1_v2 = qdis.symmetric_dis(v2.id, v1.id); + + if (dist_v1_v2 < dist_v1_q) { + good = false; + break; + } + } + + if (good) { + output.push_back(v1); + if (output.size() >= max_size) { + return; + } + } + } +} + + +namespace { + + +using storage_idx_t = HNSW::storage_idx_t; +using NodeDistCloser = HNSW::NodeDistCloser; +using NodeDistFarther = HNSW::NodeDistFarther; + + +/************************************************************** + * Addition subroutines + **************************************************************/ + + +/// remove neighbors from the list to make it smaller than max_size +void shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& resultSet1, + int max_size) +{ + if (resultSet1.size() < max_size) { + return; + } + std::priority_queue resultSet; + std::vector returnlist; + + while (resultSet1.size() > 0) { + resultSet.emplace(resultSet1.top().d, resultSet1.top().id); + resultSet1.pop(); + } + + HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size); + + for (NodeDistFarther curen2 : returnlist) { + resultSet1.emplace(curen2.d, curen2.id); + } + +} + + +/// add a link between two elements, possibly shrinking the list +/// of links to make room for it. +void add_link(HNSW& hnsw, + DistanceComputer& qdis, + storage_idx_t src, storage_idx_t dest, + int level) +{ + size_t begin, end; + hnsw.neighbor_range(src, level, &begin, &end); + if (hnsw.neighbors[end - 1] == -1) { + // there is enough room, find a slot to add it + size_t i = end; + while(i > begin) { + if (hnsw.neighbors[i - 1] != -1) break; + i--; + } + hnsw.neighbors[i] = dest; + return; + } + + // otherwise we let them fight out which to keep + + // copy to resultSet... 
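  // The list of `src` is already full, so its current neighbors are gathered
  // together with the new candidate `dest` into a heap keyed by distance to
  // `src`; shrink_neighbor_list() below keeps at most (end - begin) of them
  // using the same diversification heuristic as construction, and the
  // survivors are written back with the unused tail padded with -1.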
+ std::priority_queue resultSet; + resultSet.emplace(qdis.symmetric_dis(src, dest), dest); + for (size_t i = begin; i < end; i++) { // HERE WAS THE BUG + storage_idx_t neigh = hnsw.neighbors[i]; + resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh); + } + + shrink_neighbor_list(qdis, resultSet, end - begin); + + // ...and back + size_t i = begin; + while (resultSet.size()) { + hnsw.neighbors[i++] = resultSet.top().id; + resultSet.pop(); + } + // they may have shrunk more than just by 1 element + while(i < end) { + hnsw.neighbors[i++] = -1; + } +} + +/// search neighbors on a single level, starting from an entry point +void search_neighbors_to_add( + HNSW& hnsw, + DistanceComputer& qdis, + std::priority_queue& results, + int entry_point, + float d_entry_point, + int level, + VisitedTable &vt) +{ + // top is nearest candidate + std::priority_queue candidates; + + NodeDistFarther ev(d_entry_point, entry_point); + candidates.push(ev); + results.emplace(d_entry_point, entry_point); + vt.set(entry_point); + + while (!candidates.empty()) { + // get nearest + const NodeDistFarther &currEv = candidates.top(); + + if (currEv.d > results.top().d) { + break; + } + int currNode = currEv.id; + candidates.pop(); + + // loop over neighbors + size_t begin, end; + hnsw.neighbor_range(currNode, level, &begin, &end); + for(size_t i = begin; i < end; i++) { + storage_idx_t nodeId = hnsw.neighbors[i]; + if (nodeId < 0) break; + if (vt.get(nodeId)) continue; + vt.set(nodeId); + + float dis = qdis(nodeId); + NodeDistFarther evE1(dis, nodeId); + + if (results.size() < hnsw.efConstruction || + results.top().d > dis) { + + results.emplace(dis, nodeId); + candidates.emplace(dis, nodeId); + if (results.size() > hnsw.efConstruction) { + results.pop(); + } + } + } + } + vt.advance(); +} + + +/************************************************************** + * Searching subroutines + **************************************************************/ + +/// greedily update a nearest vector at a given level +void greedy_update_nearest(const HNSW& hnsw, + DistanceComputer& qdis, + int level, + storage_idx_t& nearest, + float& d_nearest) +{ + for(;;) { + storage_idx_t prev_nearest = nearest; + + size_t begin, end; + hnsw.neighbor_range(nearest, level, &begin, &end); + for(size_t i = begin; i < end; i++) { + storage_idx_t v = hnsw.neighbors[i]; + if (v < 0) break; + float dis = qdis(v); + if (dis < d_nearest) { + nearest = v; + d_nearest = dis; + } + } + if (nearest == prev_nearest) { + return; + } + } +} + + +} // namespace + + +/// Finds neighbors and builds links with them, starting from an entry +/// point. The own neighbor list is assumed to be locked. 
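/// In outline: search_neighbors_to_add() gathers up to efConstruction
/// candidates on this level, shrink_neighbor_list() trims them to
/// M = nb_neighbors(level), and each surviving target is linked in both
/// directions, taking the target's per-vertex lock while its own list is
/// updated.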
+void HNSW::add_links_starting_from(DistanceComputer& ptdis, + storage_idx_t pt_id, + storage_idx_t nearest, + float d_nearest, + int level, + omp_lock_t *locks, + VisitedTable &vt) +{ + std::priority_queue link_targets; + + search_neighbors_to_add(*this, ptdis, link_targets, nearest, d_nearest, + level, vt); + + // but we can afford only this many neighbors + int M = nb_neighbors(level); + + ::faiss::shrink_neighbor_list(ptdis, link_targets, M); + + while (!link_targets.empty()) { + int other_id = link_targets.top().id; + + omp_set_lock(&locks[other_id]); + add_link(*this, ptdis, other_id, pt_id, level); + omp_unset_lock(&locks[other_id]); + + add_link(*this, ptdis, pt_id, other_id, level); + + link_targets.pop(); + } +} + + +/************************************************************** + * Building, parallel + **************************************************************/ + +void HNSW::add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id, + std::vector& locks, + VisitedTable& vt) +{ + // greedy search on upper levels + + storage_idx_t nearest; +#pragma omp critical + { + nearest = entry_point; + + if (nearest == -1) { + max_level = pt_level; + entry_point = pt_id; + } + } + + if (nearest < 0) { + return; + } + + omp_set_lock(&locks[pt_id]); + + int level = max_level; // level at which we start adding neighbors + float d_nearest = ptdis(nearest); + + for(; level > pt_level; level--) { + greedy_update_nearest(*this, ptdis, level, nearest, d_nearest); + } + + for(; level >= 0; level--) { + add_links_starting_from(ptdis, pt_id, nearest, d_nearest, + level, locks.data(), vt); + } + + omp_unset_lock(&locks[pt_id]); + + if (pt_level > max_level) { + max_level = pt_level; + entry_point = pt_id; + } +} + + +/** Do a BFS on the candidates list */ + +int HNSW::search_from_candidates( + DistanceComputer& qdis, int k, + idx_t *I, float *D, + MinimaxHeap& candidates, + VisitedTable& vt, + int level, int nres_in) const +{ + int nres = nres_in; + int ndis = 0; + for (int i = 0; i < candidates.size(); i++) { + idx_t v1 = candidates.ids[i]; + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (nres < k) { + faiss::maxheap_push(++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_pop(nres--, D, I); + faiss::maxheap_push(++nres, D, I, d, v1); + } + vt.set(v1); + } + + bool do_dis_check = check_relative_distance; + int nstep = 0; + + while (candidates.size() > 0) { + float d0 = 0; + int v0 = candidates.pop_min(&d0); + + if (do_dis_check) { + // tricky stopping condition: there are more that ef + // distances that are processed already that are smaller + // than d0 + + int n_dis_below = candidates.count_below(d0); + if(n_dis_below >= efSearch) { + break; + } + } + + size_t begin, end; + neighbor_range(v0, level, &begin, &end); + + for (size_t j = begin; j < end; j++) { + int v1 = neighbors[j]; + if (v1 < 0) break; + if (vt.get(v1)) { + continue; + } + vt.set(v1); + ndis++; + float d = qdis(v1); + if (nres < k) { + faiss::maxheap_push(++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_pop(nres--, D, I); + faiss::maxheap_push(++nres, D, I, d, v1); + } + candidates.push(v1, d); + } + + nstep++; + if (!do_dis_check && nstep > efSearch) { + break; + } + } + + if (level == 0) { +#pragma omp critical + { + hnsw_stats.n1 ++; + if (candidates.size() == 0) { + hnsw_stats.n2 ++; + } + hnsw_stats.n3 += ndis; + } + } + + return nres; +} + + +/************************************************************** + * Searching + 
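 *
 * search() (for the default upper_beam == 1) first descends greedily from
 * entry_point through levels max_level..1 via greedy_update_nearest(), then
 * explores level 0 with ef = max(efSearch, k) candidates, using either the
 * bounded MinimaxHeap path (search_from_candidates) or the unbounded
 * priority-queue path (search_from_candidate_unbounded), depending on
 * search_bounded_queue.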
**************************************************************/ + +std::priority_queue HNSW::search_from_candidate_unbounded( + const Node& node, + DistanceComputer& qdis, + int ef, + VisitedTable *vt) const +{ + int ndis = 0; + std::priority_queue top_candidates; + std::priority_queue, std::greater> candidates; + + top_candidates.push(node); + candidates.push(node); + + vt->set(node.second); + + while (!candidates.empty()) { + float d0; + storage_idx_t v0; + std::tie(d0, v0) = candidates.top(); + + if (d0 > top_candidates.top().first) { + break; + } + + candidates.pop(); + + size_t begin, end; + neighbor_range(v0, 0, &begin, &end); + + for (size_t j = begin; j < end; ++j) { + int v1 = neighbors[j]; + + if (v1 < 0) { + break; + } + if (vt->get(v1)) { + continue; + } + + vt->set(v1); + + float d1 = qdis(v1); + ++ndis; + + if (top_candidates.top().first > d1 || top_candidates.size() < ef) { + candidates.emplace(d1, v1); + top_candidates.emplace(d1, v1); + + if (top_candidates.size() > ef) { + top_candidates.pop(); + } + } + } + } + +#pragma omp critical + { + ++hnsw_stats.n1; + if (candidates.size() == 0) { + ++hnsw_stats.n2; + } + hnsw_stats.n3 += ndis; + } + + return top_candidates; +} + +void HNSW::search(DistanceComputer& qdis, int k, + idx_t *I, float *D, + VisitedTable& vt) const +{ + if (upper_beam == 1) { + + // greedy search on upper levels + storage_idx_t nearest = entry_point; + float d_nearest = qdis(nearest); + + for(int level = max_level; level >= 1; level--) { + greedy_update_nearest(*this, qdis, level, nearest, d_nearest); + } + + int ef = std::max(efSearch, k); + if (search_bounded_queue) { + MinimaxHeap candidates(ef); + + candidates.push(nearest, d_nearest); + + search_from_candidates(qdis, k, I, D, candidates, vt, 0); + } else { + std::priority_queue top_candidates = + search_from_candidate_unbounded(Node(d_nearest, nearest), + qdis, ef, &vt); + + while (top_candidates.size() > k) { + top_candidates.pop(); + } + + int nres = 0; + while (!top_candidates.empty()) { + float d; + storage_idx_t label; + std::tie(d, label) = top_candidates.top(); + faiss::maxheap_push(++nres, D, I, d, label); + top_candidates.pop(); + } + } + + vt.advance(); + + } else { + int candidates_size = upper_beam; + MinimaxHeap candidates(candidates_size); + + std::vector I_to_next(candidates_size); + std::vector D_to_next(candidates_size); + + int nres = 1; + I_to_next[0] = entry_point; + D_to_next[0] = qdis(entry_point); + + for(int level = max_level; level >= 0; level--) { + + // copy I, D -> candidates + + candidates.clear(); + + for (int i = 0; i < nres; i++) { + candidates.push(I_to_next[i], D_to_next[i]); + } + + if (level == 0) { + nres = search_from_candidates(qdis, k, I, D, candidates, vt, 0); + } else { + nres = search_from_candidates( + qdis, candidates_size, + I_to_next.data(), D_to_next.data(), + candidates, vt, level + ); + } + vt.advance(); + } + } +} + + +void HNSW::MinimaxHeap::push(storage_idx_t i, float v) { + if (k == n) { + if (v >= dis[0]) return; + faiss::heap_pop (k--, dis.data(), ids.data()); + --nvalid; + } + faiss::heap_push (++k, dis.data(), ids.data(), v, i); + ++nvalid; +} + +float HNSW::MinimaxHeap::max() const { + return dis[0]; +} + +int HNSW::MinimaxHeap::size() const { + return nvalid; +} + +void HNSW::MinimaxHeap::clear() { + nvalid = k = 0; +} + +int HNSW::MinimaxHeap::pop_min(float *vmin_out) { + assert(k > 0); + // returns min. 
This is an O(n) operation + int i = k - 1; + while (i >= 0) { + if (ids[i] != -1) break; + i--; + } + if (i == -1) return -1; + int imin = i; + float vmin = dis[i]; + i--; + while(i >= 0) { + if (ids[i] != -1 && dis[i] < vmin) { + vmin = dis[i]; + imin = i; + } + i--; + } + if (vmin_out) *vmin_out = vmin; + int ret = ids[imin]; + ids[imin] = -1; + --nvalid; + + return ret; +} + +int HNSW::MinimaxHeap::count_below(float thresh) { + int n_below = 0; + for(int i = 0; i < k; i++) { + if (dis[i] < thresh) { + n_below++; + } + } + + return n_below; +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/HNSW.h b/core/src/index/thirdparty/faiss/impl/HNSW.h new file mode 100644 index 0000000000..cde99c1c29 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/HNSW.h @@ -0,0 +1,275 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + + +namespace faiss { + + +/** Implementation of the Hierarchical Navigable Small World + * datastructure. + * + * Efficient and robust approximate nearest neighbor search using + * Hierarchical Navigable Small World graphs + * + * Yu. A. Malkov, D. A. Yashunin, arXiv 2017 + * + * This implmentation is heavily influenced by the NMSlib + * implementation by Yury Malkov and Leonid Boystov + * (https://github.com/searchivarius/nmslib) + * + * The HNSW object stores only the neighbor link structure, see + * IndexHNSW.h for the full index object. + */ + + +struct VisitedTable; +struct DistanceComputer; // from AuxIndexStructures + +struct HNSW { + /// internal storage of vectors (32 bits: this is expensive) + typedef int storage_idx_t; + + /// Faiss results are 64-bit + typedef Index::idx_t idx_t; + + typedef std::pair Node; + + /** Heap structure that allows fast + */ + struct MinimaxHeap { + int n; + int k; + int nvalid; + + std::vector ids; + std::vector dis; + typedef faiss::CMax HC; + + explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {} + + void push(storage_idx_t i, float v); + + float max() const; + + int size() const; + + void clear(); + + int pop_min(float *vmin_out = nullptr); + + int count_below(float thresh); + }; + + + /// to sort pairs of (id, distance) from nearest to fathest or the reverse + struct NodeDistCloser { + float d; + int id; + NodeDistCloser(float d, int id): d(d), id(id) {} + bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; } + }; + + struct NodeDistFarther { + float d; + int id; + NodeDistFarther(float d, int id): d(d), id(id) {} + bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; } + }; + + + /// assignment probability to each layer (sum=1) + std::vector assign_probas; + + /// number of neighbors stored per layer (cumulative), should not + /// be changed after first add + std::vector cum_nneighbor_per_level; + + /// level of each vector (base level = 1), size = ntotal + std::vector levels; + + /// offsets[i] is the offset in the neighbors array where vector i is stored + /// size ntotal + 1 + std::vector offsets; + + /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i + /// for all levels. this is where all storage goes. 
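    /// For example, with the default M = 32 a vertex whose top level is L
    /// owns 2*M slots for level 0 plus M slots for each of levels 1..L,
    /// stored contiguously; neighbor_range(i, l) then yields the half-open
    /// range [offsets[i] + cum_nb_neighbors(l),
    ///        offsets[i] + cum_nb_neighbors(l + 1)).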
+ std::vector neighbors; + + /// entry point in the search structure (one of the points with maximum level + storage_idx_t entry_point; + + faiss::RandomGenerator rng; + + /// maximum level + int max_level; + + /// expansion factor at construction time + int efConstruction; + + /// expansion factor at search time + int efSearch; + + /// during search: do we check whether the next best distance is good enough? + bool check_relative_distance = true; + + /// number of entry points in levels > 0. + int upper_beam; + + /// use bounded queue during exploration + bool search_bounded_queue = true; + + // methods that initialize the tree sizes + + /// initialize the assign_probas and cum_nneighbor_per_level to + /// have 2*M links on level 0 and M links on levels > 0 + void set_default_probas(int M, float levelMult); + + /// set nb of neighbors for this level (before adding anything) + void set_nb_neighbors(int level_no, int n); + + // methods that access the tree sizes + + /// nb of neighbors for this level + int nb_neighbors(int layer_no) const; + + /// cumumlative nb up to (and excluding) this level + int cum_nb_neighbors(int layer_no) const; + + /// range of entries in the neighbors table of vertex no at layer_no + void neighbor_range(idx_t no, int layer_no, + size_t * begin, size_t * end) const; + + /// only mandatory parameter: nb of neighbors + explicit HNSW(int M = 32); + + /// pick a random level for a new point + int random_level(); + + /// add n random levels to table (for debugging...) + void fill_with_random_links(size_t n); + + void add_links_starting_from(DistanceComputer& ptdis, + storage_idx_t pt_id, + storage_idx_t nearest, + float d_nearest, + int level, + omp_lock_t *locks, + VisitedTable &vt); + + + /** add point pt_id on all levels <= pt_level and build the link + * structure for them. */ + void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id, + std::vector& locks, + VisitedTable& vt); + + int search_from_candidates(DistanceComputer& qdis, int k, + idx_t *I, float *D, + MinimaxHeap& candidates, + VisitedTable &vt, + int level, int nres_in = 0) const; + + std::priority_queue search_from_candidate_unbounded( + const Node& node, + DistanceComputer& qdis, + int ef, + VisitedTable *vt + ) const; + + /// search interface + void search(DistanceComputer& qdis, int k, + idx_t *I, float *D, + VisitedTable& vt) const; + + void reset(); + + void clear_neighbor_tables(int level); + void print_neighbor_stats(int level) const; + + int prepare_level_tab(size_t n, bool preset_levels = false); + + static void shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& input, + std::vector& output, + int max_size); + +}; + + +/************************************************************** + * Auxiliary structures + **************************************************************/ + +/// set implementation optimized for fast access. 
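/// Instead of clearing the whole table after every query, advance() bumps
/// the generation counter visno, so marks left by earlier queries simply
/// stop comparing equal; the byte array is only memset once visno wraps
/// around (at 250).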
+struct VisitedTable { + std::vector visited; + int visno; + + explicit VisitedTable(int size) + : visited(size), visno(1) {} + + /// set flog #no to true + void set(int no) { + visited[no] = visno; + } + + /// get flag #no + bool get(int no) const { + return visited[no] == visno; + } + + /// reset all flags to false + void advance() { + visno++; + if (visno == 250) { + // 250 rather than 255 because sometimes we use visno and visno+1 + memset(visited.data(), 0, sizeof(visited[0]) * visited.size()); + visno = 1; + } + } +}; + + +struct HNSWStats { + size_t n1, n2, n3; + size_t ndis; + size_t nreorder; + bool view; + + HNSWStats() { + reset(); + } + + void reset() { + n1 = n2 = n3 = 0; + ndis = 0; + nreorder = 0; + view = false; + } +}; + +// global var that collects them all +extern HNSWStats hnsw_stats; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/PolysemousTraining.cpp b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.cpp new file mode 100644 index 0000000000..a2177aa249 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.cpp @@ -0,0 +1,953 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +/***************************************** + * Mixed PQ / Hamming + ******************************************/ + +namespace faiss { + + +/**************************************************** + * Optimization code + ****************************************************/ + +SimulatedAnnealingParameters::SimulatedAnnealingParameters () +{ + // set some reasonable defaults for the optimization + init_temperature = 0.7; + temperature_decay = pow (0.9, 1/500.); + // reduce by a factor 0.9 every 500 it + n_iter = 500000; + n_redo = 2; + seed = 123; + verbose = 0; + only_bit_flips = false; + init_random = false; +} + +// what would the cost update be if iw and jw were swapped? 
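// This generic fallback costs two full objective evaluations per proposed
// swap; subclasses such as ReproduceDistancesObjective below override it
// with an O(n) incremental update. The simulated-annealing loop calls it
// for every candidate swap and accepts the swap when the delta is negative
// (or, with probability decaying with the temperature, even when it is not).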
+// default implementation just computes both and computes the difference +double PermutationObjective::cost_update ( + const int *perm, int iw, int jw) const +{ + double orig_cost = compute_cost (perm); + + std::vector perm2 (n); + for (int i = 0; i < n; i++) + perm2[i] = perm[i]; + perm2[iw] = perm[jw]; + perm2[jw] = perm[iw]; + + double new_cost = compute_cost (perm2.data()); + return new_cost - orig_cost; +} + + + + +SimulatedAnnealingOptimizer::SimulatedAnnealingOptimizer ( + PermutationObjective *obj, + const SimulatedAnnealingParameters &p): + SimulatedAnnealingParameters (p), + obj (obj), + n(obj->n), + logfile (nullptr) +{ + rnd = new RandomGenerator (p.seed); + FAISS_THROW_IF_NOT (n < 100000 && n >=0 ); +} + +SimulatedAnnealingOptimizer::~SimulatedAnnealingOptimizer () +{ + delete rnd; +} + +// run the optimization and return the best result in best_perm +double SimulatedAnnealingOptimizer::run_optimization (int * best_perm) +{ + double min_cost = 1e30; + + // just do a few runs of the annealing and keep the lowest output cost + for (int it = 0; it < n_redo; it++) { + std::vector perm(n); + for (int i = 0; i < n; i++) + perm[i] = i; + if (init_random) { + for (int i = 0; i < n; i++) { + int j = i + rnd->rand_int (n - i); + std::swap (perm[i], perm[j]); + } + } + float cost = optimize (perm.data()); + if (logfile) fprintf (logfile, "\n"); + if(verbose > 1) { + printf (" optimization run %d: cost=%g %s\n", + it, cost, cost < min_cost ? "keep" : ""); + } + if (cost < min_cost) { + memcpy (best_perm, perm.data(), sizeof(perm[0]) * n); + min_cost = cost; + } + } + return min_cost; +} + +// perform the optimization loop, starting from and modifying +// permutation in-place +double SimulatedAnnealingOptimizer::optimize (int *perm) +{ + double cost = init_cost = obj->compute_cost (perm); + int log2n = 0; + while (!(n <= (1 << log2n))) log2n++; + double temperature = init_temperature; + int n_swap = 0, n_hot = 0; + for (int it = 0; it < n_iter; it++) { + temperature = temperature * temperature_decay; + int iw, jw; + if (only_bit_flips) { + iw = rnd->rand_int (n); + jw = iw ^ (1 << rnd->rand_int (log2n)); + } else { + iw = rnd->rand_int (n); + jw = rnd->rand_int (n - 1); + if (jw == iw) jw++; + } + double delta_cost = obj->cost_update (perm, iw, jw); + if (delta_cost < 0 || rnd->rand_float () < temperature) { + std::swap (perm[iw], perm[jw]); + cost += delta_cost; + n_swap++; + if (delta_cost >= 0) n_hot++; + } + if (verbose > 2 || (verbose > 1 && it % 10000 == 0)) { + printf (" iteration %d cost %g temp %g n_swap %d " + "(%d hot) \r", + it, cost, temperature, n_swap, n_hot); + fflush(stdout); + } + if (logfile) { + fprintf (logfile, "%d %g %g %d %d\n", + it, cost, temperature, n_swap, n_hot); + } + } + if (verbose > 1) printf("\n"); + return cost; +} + + + + + +/**************************************************** + * Cost functions: ReproduceDistanceTable + ****************************************************/ + + + + + + +static inline int hamming_dis (uint64_t a, uint64_t b) +{ + return __builtin_popcountl (a ^ b); +} + +namespace { + +/// optimize permutation to reproduce a distance table with Hamming distances +struct ReproduceWithHammingObjective : PermutationObjective { + int nbits; + double dis_weight_factor; + + static double sqr (double x) { return x * x; } + + + // weihgting of distances: it is more important to reproduce small + // distances well + double dis_weight (double x) const + { + return exp (-dis_weight_factor * x); + } + + std::vector target_dis; // wanted distances 
(size n^2) + std::vector weights; // weights for each distance (size n^2) + + // cost = quadratic difference between actual distance and Hamming distance + double compute_cost(const int* perm) const override { + double cost = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + double wanted = target_dis[i * n + j]; + double w = weights[i * n + j]; + double actual = hamming_dis(perm[i], perm[j]); + cost += w * sqr(wanted - actual); + } + } + return cost; + } + + + // what would the cost update be if iw and jw were swapped? + // computed in O(n) instead of O(n^2) for the full re-computation + double cost_update(const int* perm, int iw, int jw) const override { + double delta_cost = 0; + + for (int i = 0; i < n; i++) { + if (i == iw) { + for (int j = 0; j < n; j++) { + double wanted = target_dis[i * n + j], w = weights[i * n + j]; + double actual = hamming_dis(perm[i], perm[j]); + delta_cost -= w * sqr(wanted - actual); + double new_actual = + hamming_dis(perm[jw], perm[j == iw ? jw : j == jw ? iw : j]); + delta_cost += w * sqr(wanted - new_actual); + } + } else if (i == jw) { + for (int j = 0; j < n; j++) { + double wanted = target_dis[i * n + j], w = weights[i * n + j]; + double actual = hamming_dis(perm[i], perm[j]); + delta_cost -= w * sqr(wanted - actual); + double new_actual = + hamming_dis(perm[iw], perm[j == iw ? jw : j == jw ? iw : j]); + delta_cost += w * sqr(wanted - new_actual); + } + } else { + int j = iw; + { + double wanted = target_dis[i * n + j], w = weights[i * n + j]; + double actual = hamming_dis(perm[i], perm[j]); + delta_cost -= w * sqr(wanted - actual); + double new_actual = hamming_dis(perm[i], perm[jw]); + delta_cost += w * sqr(wanted - new_actual); + } + j = jw; + { + double wanted = target_dis[i * n + j], w = weights[i * n + j]; + double actual = hamming_dis(perm[i], perm[j]); + delta_cost -= w * sqr(wanted - actual); + double new_actual = hamming_dis(perm[i], perm[iw]); + delta_cost += w * sqr(wanted - new_actual); + } + } + } + + return delta_cost; + } + + + + ReproduceWithHammingObjective ( + int nbits, + const std::vector & dis_table, + double dis_weight_factor): + nbits (nbits), dis_weight_factor (dis_weight_factor) + { + n = 1 << nbits; + FAISS_THROW_IF_NOT (dis_table.size() == n * n); + set_affine_target_dis (dis_table); + } + + void set_affine_target_dis (const std::vector & dis_table) + { + double sum = 0, sum2 = 0; + int n2 = n * n; + for (int i = 0; i < n2; i++) { + sum += dis_table [i]; + sum2 += dis_table [i] * dis_table [i]; + } + double mean = sum / n2; + double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2)); + + target_dis.resize (n2); + + for (int i = 0; i < n2; i++) { + // the mapping function + double td = (dis_table [i] - mean) / stddev * sqrt(nbits / 4) + + nbits / 2; + target_dis[i] = td; + // compute a weight + weights.push_back (dis_weight (td)); + } + + } + + ~ReproduceWithHammingObjective() override {} +}; + +} // anonymous namespace + +// weihgting of distances: it is more important to reproduce small +// distances well +double ReproduceDistancesObjective::dis_weight (double x) const +{ + return exp (-dis_weight_factor * x); +} + + +double ReproduceDistancesObjective::get_source_dis (int i, int j) const +{ + return source_dis [i * n + j]; +} + +// cost = quadratic difference between actual distance and Hamming distance +double ReproduceDistancesObjective::compute_cost (const int *perm) const +{ + double cost = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + double wanted = target_dis [i * n + j]; + 
double w = weights [i * n + j]; + double actual = get_source_dis (perm[i], perm[j]); + cost += w * sqr (wanted - actual); + } + } + return cost; +} + +// what would the cost update be if iw and jw were swapped? +// computed in O(n) instead of O(n^2) for the full re-computation +double ReproduceDistancesObjective::cost_update( + const int *perm, int iw, int jw) const +{ + double delta_cost = 0; + for (int i = 0; i < n; i++) { + if (i == iw) { + for (int j = 0; j < n; j++) { + double wanted = target_dis [i * n + j], + w = weights [i * n + j]; + double actual = get_source_dis (perm[i], perm[j]); + delta_cost -= w * sqr (wanted - actual); + double new_actual = get_source_dis ( + perm[jw], + perm[j == iw ? jw : j == jw ? iw : j]); + delta_cost += w * sqr (wanted - new_actual); + } + } else if (i == jw) { + for (int j = 0; j < n; j++) { + double wanted = target_dis [i * n + j], + w = weights [i * n + j]; + double actual = get_source_dis (perm[i], perm[j]); + delta_cost -= w * sqr (wanted - actual); + double new_actual = get_source_dis ( + perm[iw], + perm[j == iw ? jw : j == jw ? iw : j]); + delta_cost += w * sqr (wanted - new_actual); + } + } else { + int j = iw; + { + double wanted = target_dis [i * n + j], + w = weights [i * n + j]; + double actual = get_source_dis (perm[i], perm[j]); + delta_cost -= w * sqr (wanted - actual); + double new_actual = get_source_dis (perm[i], perm[jw]); + delta_cost += w * sqr (wanted - new_actual); + } + j = jw; + { + double wanted = target_dis [i * n + j], + w = weights [i * n + j]; + double actual = get_source_dis (perm[i], perm[j]); + delta_cost -= w * sqr (wanted - actual); + double new_actual = get_source_dis (perm[i], perm[iw]); + delta_cost += w * sqr (wanted - new_actual); + } + } + } + return delta_cost; +} + + + +ReproduceDistancesObjective::ReproduceDistancesObjective ( + int n, + const double *source_dis_in, + const double *target_dis_in, + double dis_weight_factor): + dis_weight_factor (dis_weight_factor), + target_dis (target_dis_in) +{ + this->n = n; + set_affine_target_dis (source_dis_in); +} + +void ReproduceDistancesObjective::compute_mean_stdev ( + const double *tab, size_t n2, + double *mean_out, double *stddev_out) +{ + double sum = 0, sum2 = 0; + for (int i = 0; i < n2; i++) { + sum += tab [i]; + sum2 += tab [i] * tab [i]; + } + double mean = sum / n2; + double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2)); + *mean_out = mean; + *stddev_out = stddev; +} + +void ReproduceDistancesObjective::set_affine_target_dis ( + const double *source_dis_in) +{ + int n2 = n * n; + + double mean_src, stddev_src; + compute_mean_stdev (source_dis_in, n2, &mean_src, &stddev_src); + + double mean_target, stddev_target; + compute_mean_stdev (target_dis, n2, &mean_target, &stddev_target); + + printf ("map mean %g std %g -> mean %g std %g\n", + mean_src, stddev_src, mean_target, stddev_target); + + source_dis.resize (n2); + weights.resize (n2); + + for (int i = 0; i < n2; i++) { + // the mapping function + source_dis[i] = (source_dis_in[i] - mean_src) / stddev_src + * stddev_target + mean_target; + + // compute a weight + weights [i] = dis_weight (target_dis[i]); + } + +} + +/**************************************************** + * Cost functions: RankingScore + ****************************************************/ + +/// Maintains a 3D table of elementary costs. 
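Both objectives above rescale the raw distance table with the same affine standardization before comparing it to Hamming distances: ReproduceWithHammingObjective maps the table to the statistics of Hamming distances between independent random nbits-bit codes, which follow a Binomial(nbits, 1/2) law with mean nbits/2 and standard deviation sqrt(nbits/4), while ReproduceDistancesObjective (set_affine_target_dis just above) maps the source table to the mean and standard deviation of the supplied target table. A minimal standalone sketch of that mapping; the helper name affine_map is ours and not part of the diff:

    #include <cstddef>
    #include <vector>

    // standardize `src` so that its mean/stddev become mean_tgt/std_tgt,
    // mirroring the loop in set_affine_target_dis above
    std::vector<double> affine_map(const std::vector<double>& src,
                                   double mean_src, double std_src,
                                   double mean_tgt, double std_tgt) {
        std::vector<double> out(src.size());
        for (size_t i = 0; i < src.size(); i++)
            out[i] = (src[i] - mean_src) / std_src * std_tgt + mean_tgt;
        return out;
    }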
+/// Accumulates elements based on Hamming distance comparisons +template +struct Score3Computer: PermutationObjective { + + int nc; + + // cost matrix of size nc * nc *nc + // n_gt (i,j,k) = count of d_gt(x, y-) < d_gt(x, y+) + // where x has PQ code i, y- PQ code j and y+ PQ code k + std::vector n_gt; + + + /// the cost is a triple loop on the nc * nc * nc matrix of entries. + /// + Taccu compute (const int * perm) const + { + Taccu accu = 0; + const Ttab *p = n_gt.data(); + for (int i = 0; i < nc; i++) { + int ip = perm [i]; + for (int j = 0; j < nc; j++) { + int jp = perm [j]; + for (int k = 0; k < nc; k++) { + int kp = perm [k]; + if (hamming_dis (ip, jp) < + hamming_dis (ip, kp)) { + accu += *p; // n_gt [ ( i * nc + j) * nc + k]; + } + p++; + } + } + } + return accu; + } + + + /** cost update if entries iw and jw of the permutation would be + * swapped. + * + * The computation is optimized by avoiding elements in the + * nc*nc*nc cube that are known not to change. For nc=256, this + * reduces the nb of cells to visit to about 6/256 th of the + * cells. Practical speedup is about 8x, and the code is quite + * complex :-/ + */ + Taccu compute_update (const int *perm, int iw, int jw) const + { + assert (iw != jw); + if (iw > jw) std::swap (iw, jw); + + Taccu accu = 0; + const Ttab * n_gt_i = n_gt.data(); + for (int i = 0; i < nc; i++) { + int ip0 = perm [i]; + int ip = perm [i == iw ? jw : i == jw ? iw : i]; + + //accu += update_i (perm, iw, jw, ip0, ip, n_gt_i); + + accu += update_i_cross (perm, iw, jw, + ip0, ip, n_gt_i); + + if (ip != ip0) + accu += update_i_plane (perm, iw, jw, + ip0, ip, n_gt_i); + + n_gt_i += nc * nc; + } + + return accu; + } + + + Taccu update_i (const int *perm, int iw, int jw, + int ip0, int ip, const Ttab * n_gt_i) const + { + Taccu accu = 0; + const Ttab *n_gt_ij = n_gt_i; + for (int j = 0; j < nc; j++) { + int jp0 = perm[j]; + int jp = perm [j == iw ? jw : j == jw ? iw : j]; + for (int k = 0; k < nc; k++) { + int kp0 = perm [k]; + int kp = perm [k == iw ? jw : k == jw ? iw : k]; + int ng = n_gt_ij [k]; + if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) { + accu += ng; + } + if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) { + accu -= ng; + } + } + n_gt_ij += nc; + } + return accu; + } + + // 2 inner loops for the case ip0 != ip + Taccu update_i_plane (const int *perm, int iw, int jw, + int ip0, int ip, const Ttab * n_gt_i) const + { + Taccu accu = 0; + const Ttab *n_gt_ij = n_gt_i; + + for (int j = 0; j < nc; j++) { + if (j != iw && j != jw) { + int jp = perm[j]; + for (int k = 0; k < nc; k++) { + if (k != iw && k != jw) { + int kp = perm [k]; + Ttab ng = n_gt_ij [k]; + if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) { + accu += ng; + } + if (hamming_dis (ip0, jp) < hamming_dis (ip0, kp)) { + accu -= ng; + } + } + } + } + n_gt_ij += nc; + } + return accu; + } + + /// used for the 8 cells were the 3 indices are swapped + inline Taccu update_k (const int *perm, int iw, int jw, + int ip0, int ip, int jp0, int jp, + int k, + const Ttab * n_gt_ij) const + { + Taccu accu = 0; + int kp0 = perm [k]; + int kp = perm [k == iw ? jw : k == jw ? 
iw : k]; + Ttab ng = n_gt_ij [k]; + if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) { + accu += ng; + } + if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) { + accu -= ng; + } + return accu; + } + + /// compute update on a line of k's, where i and j are swapped + Taccu update_j_line (const int *perm, int iw, int jw, + int ip0, int ip, int jp0, int jp, + const Ttab * n_gt_ij) const + { + Taccu accu = 0; + for (int k = 0; k < nc; k++) { + if (k == iw || k == jw) continue; + int kp = perm [k]; + Ttab ng = n_gt_ij [k]; + if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) { + accu += ng; + } + if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp)) { + accu -= ng; + } + } + return accu; + } + + + /// considers the 2 pairs of crossing lines j=iw or jw and k = iw or kw + Taccu update_i_cross (const int *perm, int iw, int jw, + int ip0, int ip, const Ttab * n_gt_i) const + { + Taccu accu = 0; + const Ttab *n_gt_ij = n_gt_i; + + for (int j = 0; j < nc; j++) { + int jp0 = perm[j]; + int jp = perm [j == iw ? jw : j == jw ? iw : j]; + + accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, iw, n_gt_ij); + accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, jw, n_gt_ij); + + if (jp != jp0) + accu += update_j_line (perm, iw, jw, ip0, ip, jp0, jp, n_gt_ij); + + n_gt_ij += nc; + } + return accu; + } + + + /// PermutationObjective implementeation (just negates the scores + /// for minimization) + + double compute_cost(const int* perm) const override { + return -compute(perm); + } + + double cost_update(const int* perm, int iw, int jw) const override { + double ret = -compute_update(perm, iw, jw); + return ret; + } + + ~Score3Computer() override {} +}; + + + + + +struct IndirectSort { + const float *tab; + bool operator () (int a, int b) {return tab[a] < tab[b]; } +}; + + + +struct RankingScore2: Score3Computer { + int nbits; + int nq, nb; + const uint32_t *qcodes, *bcodes; + const float *gt_distances; + + RankingScore2 (int nbits, int nq, int nb, + const uint32_t *qcodes, const uint32_t *bcodes, + const float *gt_distances): + nbits(nbits), nq(nq), nb(nb), qcodes(qcodes), + bcodes(bcodes), gt_distances(gt_distances) + { + n = nc = 1 << nbits; + n_gt.resize (nc * nc * nc); + init_n_gt (); + } + + + double rank_weight (int r) + { + return 1.0 / (r + 1); + } + + /// count nb of i, j in a x b st. i < j + /// a and b should be sorted on input + /// they are the ranks of j and k respectively. 
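As a worked example of the weighting used by the function below: with rank_weight(r) = 1/(r+1), a = {0, 2} and b = {1, 3}, only pairs with b[k] > a[i] contribute, each weighted by rank_weight(a[i]) * rank_weight(b[k] - a[i]); the accumulated value is 1*(1/2 + 1/4) + (1/3)*(1/2) ≈ 0.92, so both a high absolute rank and a large rank gap damp the contribution.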
+ /// specific version for diff-of-rank weighting, cannot optimized + /// with a cumulative table + double accum_gt_weight_diff (const std::vector & a, + const std::vector & b) + { + int nb = b.size(), na = a.size(); + + double accu = 0; + int j = 0; + for (int i = 0; i < na; i++) { + int ai = a[i]; + while (j < nb && ai >= b[j]) j++; + + double accu_i = 0; + for (int k = j; k < b.size(); k++) + accu_i += rank_weight (b[k] - ai); + + accu += rank_weight (ai) * accu_i; + + } + return accu; + } + + void init_n_gt () + { + for (int q = 0; q < nq; q++) { + const float *gtd = gt_distances + q * nb; + const uint32_t *cb = bcodes;// all same codes + float * n_gt_q = & n_gt [qcodes[q] * nc * nc]; + + printf("init gt for q=%d/%d \r", q, nq); fflush(stdout); + + std::vector rankv (nb); + int * ranks = rankv.data(); + + // elements in each code bin, ordered by rank within each bin + std::vector > tab (nc); + + { // build rank table + IndirectSort s = {gtd}; + for (int j = 0; j < nb; j++) ranks[j] = j; + std::sort (ranks, ranks + nb, s); + } + + for (int rank = 0; rank < nb; rank++) { + int i = ranks [rank]; + tab [cb[i]].push_back (rank); + } + + + // this is very expensive. Any suggestion for improvement + // welcome. + for (int i = 0; i < nc; i++) { + std::vector & di = tab[i]; + for (int j = 0; j < nc; j++) { + std::vector & dj = tab[j]; + n_gt_q [i * nc + j] += accum_gt_weight_diff (di, dj); + + } + } + + } + + } + +}; + + +/***************************************** + * PolysemousTraining + ******************************************/ + + + +PolysemousTraining::PolysemousTraining () +{ + optimization_type = OT_ReproduceDistances_affine; + ntrain_permutation = 0; + dis_weight_factor = log(2); +} + + + +void PolysemousTraining::optimize_reproduce_distances ( + ProductQuantizer &pq) const +{ + + int dsub = pq.dsub; + + int n = pq.ksub; + int nbits = pq.nbits; + +#pragma omp parallel for + for (int m = 0; m < pq.M; m++) { + std::vector dis_table; + + // printf ("Optimizing quantizer %d\n", m); + + float * centroids = pq.get_centroids (m, 0); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + dis_table.push_back (fvec_L2sqr (centroids + i * dsub, + centroids + j * dsub, + dsub)); + } + } + + std::vector perm (n); + ReproduceWithHammingObjective obj ( + nbits, dis_table, + dis_weight_factor); + + + SimulatedAnnealingOptimizer optim (&obj, *this); + + if (log_pattern.size()) { + char fname[256]; + snprintf (fname, 256, log_pattern.c_str(), m); + printf ("opening log file %s\n", fname); + optim.logfile = fopen (fname, "w"); + FAISS_THROW_IF_NOT_MSG (optim.logfile, "could not open logfile"); + } + double final_cost = optim.run_optimization (perm.data()); + + if (verbose > 0) { + printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n", + m, optim.init_cost, final_cost); + } + + if (log_pattern.size()) fclose (optim.logfile); + + std::vector centroids_copy; + for (int i = 0; i < dsub * n; i++) + centroids_copy.push_back (centroids[i]); + + for (int i = 0; i < n; i++) + memcpy (centroids + perm[i] * dsub, + centroids_copy.data() + i * dsub, + dsub * sizeof(centroids[0])); + + } + +} + + +void PolysemousTraining::optimize_ranking ( + ProductQuantizer &pq, size_t n, const float *x) const +{ + + int dsub = pq.dsub; + + int nbits = pq.nbits; + + std::vector all_codes (pq.code_size * n); + + pq.compute_codes (x, all_codes.data(), n); + + FAISS_THROW_IF_NOT (pq.nbits == 8); + + if (n == 0) + pq.compute_sdc_table (); + +#pragma omp parallel for + for (int m = 0; m < pq.M; m++) { + size_t nq, nb; 
+ std::vector codes; // query codes, then db codes + std::vector gt_distances; // nq * nb matrix of distances + + if (n > 0) { + std::vector xtrain (n * dsub); + for (int i = 0; i < n; i++) + memcpy (xtrain.data() + i * dsub, + x + i * pq.d + m * dsub, + sizeof(float) * dsub); + + codes.resize (n); + for (int i = 0; i < n; i++) + codes [i] = all_codes [i * pq.code_size + m]; + + nq = n / 4; nb = n - nq; + const float *xq = xtrain.data(); + const float *xb = xq + nq * dsub; + + gt_distances.resize (nq * nb); + + pairwise_L2sqr (dsub, + nq, xq, + nb, xb, + gt_distances.data()); + } else { + nq = nb = pq.ksub; + codes.resize (2 * nq); + for (int i = 0; i < nq; i++) + codes[i] = codes [i + nq] = i; + + gt_distances.resize (nq * nb); + + memcpy (gt_distances.data (), + pq.sdc_table.data () + m * nq * nb, + sizeof (float) * nq * nb); + } + + double t0 = getmillisecs (); + + PermutationObjective *obj = new RankingScore2 ( + nbits, nq, nb, + codes.data(), codes.data() + nq, + gt_distances.data ()); + ScopeDeleter1 del (obj); + + if (verbose > 0) { + printf(" m=%d, nq=%ld, nb=%ld, intialize RankingScore " + "in %.3f ms\n", + m, nq, nb, getmillisecs () - t0); + } + + SimulatedAnnealingOptimizer optim (obj, *this); + + if (log_pattern.size()) { + char fname[256]; + snprintf (fname, 256, log_pattern.c_str(), m); + printf ("opening log file %s\n", fname); + optim.logfile = fopen (fname, "w"); + FAISS_THROW_IF_NOT_FMT (optim.logfile, + "could not open logfile %s", fname); + } + + std::vector perm (pq.ksub); + + double final_cost = optim.run_optimization (perm.data()); + printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n", + m, optim.init_cost, final_cost); + + if (log_pattern.size()) fclose (optim.logfile); + + float * centroids = pq.get_centroids (m, 0); + + std::vector centroids_copy; + for (int i = 0; i < dsub * pq.ksub; i++) + centroids_copy.push_back (centroids[i]); + + for (int i = 0; i < pq.ksub; i++) + memcpy (centroids + perm[i] * dsub, + centroids_copy.data() + i * dsub, + dsub * sizeof(centroids[0])); + + } + +} + + + +void PolysemousTraining::optimize_pq_for_hamming (ProductQuantizer &pq, + size_t n, const float *x) const +{ + if (optimization_type == OT_None) { + + } else if (optimization_type == OT_ReproduceDistances_affine) { + optimize_reproduce_distances (pq); + } else { + optimize_ranking (pq, n, x); + } + + pq.compute_sdc_table (); + +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h new file mode 100644 index 0000000000..cf511a74c5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h @@ -0,0 +1,158 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
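A minimal usage sketch for the file above, assuming a trained set of ntrain vectors xtrain of dimension d split over M sub-quantizers (none of which are part of the diff); include paths assume a standard faiss layout:

    // #include <faiss/impl/ProductQuantizer.h>
    // #include <faiss/impl/PolysemousTraining.h>

    void train_polysemous_pq(int d, int M, int ntrain, const float* xtrain) {
        faiss::ProductQuantizer pq(d, M, 8);            // 8 bits per sub-quantizer
        pq.train(ntrain, xtrain);                       // k-means on each sub-space
        faiss::PolysemousTraining pt;                   // OT_ReproduceDistances_affine by default
        pt.optimize_pq_for_hamming(pq, ntrain, xtrain); // reorders centroids, rebuilds the SDC table
    }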
+ */ + +// -*- c++ -*- + +#ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED +#define FAISS_POLYSEMOUS_TRAINING_INCLUDED + + +#include + + +namespace faiss { + + +/// parameters used for the simulated annealing method +struct SimulatedAnnealingParameters { + + // optimization parameters + double init_temperature; // init probaility of accepting a bad swap + double temperature_decay; // at each iteration the temp is multiplied by this + int n_iter; // nb of iterations + int n_redo; // nb of runs of the simulation + int seed; // random seed + int verbose; + bool only_bit_flips; // restrict permutation changes to bit flips + bool init_random; // intialize with a random permutation (not identity) + + // set reasonable defaults + SimulatedAnnealingParameters (); + +}; + + +/// abstract class for the loss function +struct PermutationObjective { + + int n; + + virtual double compute_cost (const int *perm) const = 0; + + // what would the cost update be if iw and jw were swapped? + // default implementation just computes both and computes the difference + virtual double cost_update (const int *perm, int iw, int jw) const; + + virtual ~PermutationObjective () {} +}; + + +struct ReproduceDistancesObjective : PermutationObjective { + + double dis_weight_factor; + + static double sqr (double x) { return x * x; } + + // weihgting of distances: it is more important to reproduce small + // distances well + double dis_weight (double x) const; + + std::vector source_dis; ///< "real" corrected distances (size n^2) + const double * target_dis; ///< wanted distances (size n^2) + std::vector weights; ///< weights for each distance (size n^2) + + double get_source_dis (int i, int j) const; + + // cost = quadratic difference between actual distance and Hamming distance + double compute_cost(const int* perm) const override; + + // what would the cost update be if iw and jw were swapped? + // computed in O(n) instead of O(n^2) for the full re-computation + double cost_update(const int* perm, int iw, int jw) const override; + + ReproduceDistancesObjective ( + int n, + const double *source_dis_in, + const double *target_dis_in, + double dis_weight_factor); + + static void compute_mean_stdev (const double *tab, size_t n2, + double *mean_out, double *stddev_out); + + void set_affine_target_dis (const double *source_dis_in); + + ~ReproduceDistancesObjective() override {} +}; + +struct RandomGenerator; + +/// Simulated annealing optimization algorithm for permutations. + struct SimulatedAnnealingOptimizer: SimulatedAnnealingParameters { + + PermutationObjective *obj; + int n; ///< size of the permutation + FILE *logfile; /// logs values of the cost function + + SimulatedAnnealingOptimizer (PermutationObjective *obj, + const SimulatedAnnealingParameters &p); + RandomGenerator *rnd; + + /// remember intial cost of optimization + double init_cost; + + // main entry point. 
Perform the optimization loop, starting from + // and modifying permutation in-place + double optimize (int *perm); + + // run the optimization and return the best result in best_perm + double run_optimization (int * best_perm); + + virtual ~SimulatedAnnealingOptimizer (); +}; + + + + +/// optimizes the order of indices in a ProductQuantizer +struct PolysemousTraining: SimulatedAnnealingParameters { + + enum Optimization_type_t { + OT_None, + OT_ReproduceDistances_affine, ///< default + OT_Ranking_weighted_diff /// same as _2, but use rank of y+ - rank of y- + }; + Optimization_type_t optimization_type; + + // use 1/4 of the training points for the optimization, with + // max. ntrain_permutation. If ntrain_permutation == 0: train on + // centroids + int ntrain_permutation; + double dis_weight_factor; // decay of exp that weights distance loss + + // filename pattern for the logging of iterations + std::string log_pattern; + + // sets default values + PolysemousTraining (); + + /// reorder the centroids so that the Hamming distace becomes a + /// good approximation of the SDC distance (called by train) + void optimize_pq_for_hamming (ProductQuantizer & pq, + size_t n, const float *x) const; + + /// called by optimize_pq_for_hamming + void optimize_ranking (ProductQuantizer &pq, size_t n, const float *x) const; + /// called by optimize_pq_for_hamming + void optimize_reproduce_distances (ProductQuantizer &pq) const; + +}; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp new file mode 100644 index 0000000000..bbd143611e --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp @@ -0,0 +1,876 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
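The header above keeps SimulatedAnnealingOptimizer generic over PermutationObjective, so any permutation cost can be annealed. A hypothetical toy objective, minimized by the identity permutation, just to illustrate the interface (names are ours, not part of the diff):

    #include <vector>
    // #include <faiss/impl/PolysemousTraining.h>

    struct ToyObjective : faiss::PermutationObjective {
        explicit ToyObjective(int n_) { n = n_; }
        double compute_cost(const int* perm) const override {
            double c = 0;
            for (int i = 0; i < n; i++)
                c += double(perm[i] - i) * (perm[i] - i);
            return c;                              // 0 for the identity permutation
        }
        // cost_update() keeps the default: evaluate the cost before and after the swap
    };

    void anneal_toy() {
        ToyObjective obj(16);
        faiss::SimulatedAnnealingParameters params;    // default annealing schedule
        faiss::SimulatedAnnealingOptimizer optim(&obj, params);
        std::vector<int> perm(16);
        double best_cost = optim.run_optimization(perm.data());
        (void) best_cost;
    }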
+ */ + +// -*- c++ -*- + +#include + + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, FINTEGER * + ldb, float *beta, float *c, FINTEGER *ldc); + +} + + +namespace faiss { + + +/* compute an estimator using look-up tables for typical values of M */ +template +void pq_estimators_from_tables_Mmul4 (int M, const CT * codes, + size_t ncodes, + const float * __restrict dis_table, + size_t ksub, + size_t k, + float * heap_dis, + int64_t * heap_ids) +{ + + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float *dt = dis_table; + + for (size_t m = 0; m < M; m+=4) { + float dism = 0; + dism = dt[*codes++]; dt += ksub; + dism += dt[*codes++]; dt += ksub; + dism += dt[*codes++]; dt += ksub; + dism += dt[*codes++]; dt += ksub; + dis += dism; + } + + if (C::cmp (heap_dis[0], dis)) { + heap_pop (k, heap_dis, heap_ids); + heap_push (k, heap_dis, heap_ids, dis, j); + } + } +} + + +template +void pq_estimators_from_tables_M4 (const CT * codes, + size_t ncodes, + const float * __restrict dis_table, + size_t ksub, + size_t k, + float * heap_dis, + int64_t * heap_ids) +{ + + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float *dt = dis_table; + dis = dt[*codes++]; dt += ksub; + dis += dt[*codes++]; dt += ksub; + dis += dt[*codes++]; dt += ksub; + dis += dt[*codes++]; + + if (C::cmp (heap_dis[0], dis)) { + heap_pop (k, heap_dis, heap_ids); + heap_push (k, heap_dis, heap_ids, dis, j); + } + } +} + + +template +static inline void pq_estimators_from_tables (const ProductQuantizer& pq, + const CT * codes, + size_t ncodes, + const float * dis_table, + size_t k, + float * heap_dis, + int64_t * heap_ids) +{ + + if (pq.M == 4) { + + pq_estimators_from_tables_M4 (codes, ncodes, + dis_table, pq.ksub, k, + heap_dis, heap_ids); + return; + } + + if (pq.M % 4 == 0) { + pq_estimators_from_tables_Mmul4 (pq.M, codes, ncodes, + dis_table, pq.ksub, k, + heap_dis, heap_ids); + return; + } + + /* Default is relatively slow */ + const size_t M = pq.M; + const size_t ksub = pq.ksub; + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float * __restrict dt = dis_table; + for (int m = 0; m < M; m++) { + dis += dt[*codes++]; + dt += ksub; + } + if (C::cmp (heap_dis[0], dis)) { + heap_pop (k, heap_dis, heap_ids); + heap_push (k, heap_dis, heap_ids, dis, j); + } + } +} + +template +static inline void pq_estimators_from_tables_generic(const ProductQuantizer& pq, + size_t nbits, + const uint8_t *codes, + size_t ncodes, + const float *dis_table, + size_t k, + float *heap_dis, + int64_t *heap_ids) +{ + const size_t M = pq.M; + const size_t ksub = pq.ksub; + for (size_t j = 0; j < ncodes; ++j) { + faiss::ProductQuantizer::PQDecoderGeneric decoder( + codes + j * pq.code_size, nbits + ); + float dis = 0; + const float * __restrict dt = dis_table; + for (size_t m = 0; m < M; m++) { + uint64_t c = decoder.decode(); + dis += dt[c]; + dt += ksub; + } + + if (C::cmp(heap_dis[0], dis)) { + heap_pop(k, heap_dis, heap_ids); + heap_push(k, heap_dis, heap_ids, dis, j); + } + } +} + +/********************************************* + * PQ implementation + *********************************************/ + + + +ProductQuantizer::ProductQuantizer (size_t d, size_t M, size_t nbits): + d(d), M(M), nbits(nbits), 
assign_index(nullptr) +{ + set_derived_values (); +} + +ProductQuantizer::ProductQuantizer () + : ProductQuantizer(0, 1, 0) {} + +void ProductQuantizer::set_derived_values () { + // quite a few derived values + FAISS_THROW_IF_NOT (d % M == 0); + dsub = d / M; + code_size = (nbits * M + 7) / 8; + ksub = 1 << nbits; + centroids.resize (d * ksub); + verbose = false; + train_type = Train_default; +} + +void ProductQuantizer::set_params (const float * centroids_, int m) +{ + memcpy (get_centroids(m, 0), centroids_, + ksub * dsub * sizeof (centroids_[0])); +} + + +static void init_hypercube (int d, int nbits, + int n, const float * x, + float *centroids) +{ + + std::vector mean (d); + for (int i = 0; i < n; i++) + for (int j = 0; j < d; j++) + mean [j] += x[i * d + j]; + + float maxm = 0; + for (int j = 0; j < d; j++) { + mean [j] /= n; + if (fabs(mean[j]) > maxm) maxm = fabs(mean[j]); + } + + for (int i = 0; i < (1 << nbits); i++) { + float * cent = centroids + i * d; + for (int j = 0; j < nbits; j++) + cent[j] = mean [j] + (((i >> j) & 1) ? 1 : -1) * maxm; + for (int j = nbits; j < d; j++) + cent[j] = mean [j]; + } + + +} + +static void init_hypercube_pca (int d, int nbits, + int n, const float * x, + float *centroids) +{ + PCAMatrix pca (d, nbits); + pca.train (n, x); + + + for (int i = 0; i < (1 << nbits); i++) { + float * cent = centroids + i * d; + for (int j = 0; j < d; j++) { + cent[j] = pca.mean[j]; + float f = 1.0; + for (int k = 0; k < nbits; k++) + cent[j] += f * + sqrt (pca.eigenvalues [k]) * + (((i >> k) & 1) ? 1 : -1) * + pca.PCAMat [j + k * d]; + } + } + +} + +void ProductQuantizer::train (int n, const float * x) +{ + if (train_type != Train_shared) { + train_type_t final_train_type; + final_train_type = train_type; + if (train_type == Train_hypercube || + train_type == Train_hypercube_pca) { + if (dsub < nbits) { + final_train_type = Train_default; + printf ("cannot train hypercube: nbits=%ld > log2(d=%ld)\n", + nbits, dsub); + } + } + + float * xslice = new float[n * dsub]; + ScopeDeleter del (xslice); + for (int m = 0; m < M; m++) { + for (int j = 0; j < n; j++) + memcpy (xslice + j * dsub, + x + j * d + m * dsub, + dsub * sizeof(float)); + + Clustering clus (dsub, ksub, cp); + + // we have some initialization for the centroids + if (final_train_type != Train_default) { + clus.centroids.resize (dsub * ksub); + } + + switch (final_train_type) { + case Train_hypercube: + init_hypercube (dsub, nbits, n, xslice, + clus.centroids.data ()); + break; + case Train_hypercube_pca: + init_hypercube_pca (dsub, nbits, n, xslice, + clus.centroids.data ()); + break; + case Train_hot_start: + memcpy (clus.centroids.data(), + get_centroids (m, 0), + dsub * ksub * sizeof (float)); + break; + default: ; + } + + if(verbose) { + clus.verbose = true; + printf ("Training PQ slice %d/%zd\n", m, M); + } + IndexFlatL2 index (dsub); + clus.train (n, xslice, assign_index ? *assign_index : index); + set_params (clus.centroids.data(), m); + } + + + } else { + + Clustering clus (dsub, ksub, cp); + + if(verbose) { + clus.verbose = true; + printf ("Training all PQ slices at once\n"); + } + + IndexFlatL2 index (dsub); + + clus.train (n * M, x, assign_index ? 
*assign_index : index); + for (int m = 0; m < M; m++) { + set_params (clus.centroids.data(), m); + } + + } +} + +template +void compute_code(const ProductQuantizer& pq, const float *x, uint8_t *code) { + float distances [pq.ksub]; + PQEncoder encoder(code, pq.nbits); + for (size_t m = 0; m < pq.M; m++) { + float mindis = 1e20; + uint64_t idxm = 0; + const float * xsub = x + m * pq.dsub; + + fvec_L2sqr_ny(distances, xsub, pq.get_centroids(m, 0), pq.dsub, pq.ksub); + + /* Find best centroid */ + for (size_t i = 0; i < pq.ksub; i++) { + float dis = distances[i]; + if (dis < mindis) { + mindis = dis; + idxm = i; + } + } + + encoder.encode(idxm); + } +} + +void ProductQuantizer::compute_code(const float * x, uint8_t * code) const { + switch (nbits) { + case 8: + faiss::compute_code(*this, x, code); + break; + + case 16: + faiss::compute_code(*this, x, code); + break; + + default: + faiss::compute_code(*this, x, code); + break; + } +} + +template +void decode(const ProductQuantizer& pq, const uint8_t *code, float *x) +{ + PQDecoder decoder(code, pq.nbits); + for (size_t m = 0; m < pq.M; m++) { + uint64_t c = decoder.decode(); + memcpy(x + m * pq.dsub, pq.get_centroids(m, c), sizeof(float) * pq.dsub); + } +} + +void ProductQuantizer::decode (const uint8_t *code, float *x) const +{ + switch (nbits) { + case 8: + faiss::decode(*this, code, x); + break; + + case 16: + faiss::decode(*this, code, x); + break; + + default: + faiss::decode(*this, code, x); + break; + } +} + + +void ProductQuantizer::decode (const uint8_t *code, float *x, size_t n) const +{ + for (size_t i = 0; i < n; i++) { + this->decode (code + code_size * i, x + d * i); + } +} + + +void ProductQuantizer::compute_code_from_distance_table (const float *tab, + uint8_t *code) const +{ + PQEncoderGeneric encoder(code, nbits); + for (size_t m = 0; m < M; m++) { + float mindis = 1e20; + uint64_t idxm = 0; + + /* Find best centroid */ + for (size_t j = 0; j < ksub; j++) { + float dis = *tab++; + if (dis < mindis) { + mindis = dis; + idxm = j; + } + } + + encoder.encode(idxm); + } +} + +void ProductQuantizer::compute_codes_with_assign_index ( + const float * x, + uint8_t * codes, + size_t n) +{ + FAISS_THROW_IF_NOT (assign_index && assign_index->d == dsub); + + for (size_t m = 0; m < M; m++) { + assign_index->reset (); + assign_index->add (ksub, get_centroids (m, 0)); + size_t bs = 65536; + float * xslice = new float[bs * dsub]; + ScopeDeleter del (xslice); + idx_t *assign = new idx_t[bs]; + ScopeDeleter del2 (assign); + + for (size_t i0 = 0; i0 < n; i0 += bs) { + size_t i1 = std::min(i0 + bs, n); + + for (size_t i = i0; i < i1; i++) { + memcpy (xslice + (i - i0) * dsub, + x + i * d + m * dsub, + dsub * sizeof(float)); + } + + assign_index->assign (i1 - i0, xslice, assign); + + if (nbits == 8) { + uint8_t *c = codes + code_size * i0 + m; + for (size_t i = i0; i < i1; i++) { + *c = assign[i - i0]; + c += M; + } + } else if (nbits == 16) { + uint16_t *c = (uint16_t*)(codes + code_size * i0 + m * 2); + for (size_t i = i0; i < i1; i++) { + *c = assign[i - i0]; + c += M; + } + } else { + for (size_t i = i0; i < i1; ++i) { + uint8_t *c = codes + code_size * i + ((m * nbits) / 8); + uint8_t offset = (m * nbits) % 8; + uint64_t ass = assign[i - i0]; + + PQEncoderGeneric encoder(c, nbits, offset); + encoder.encode(ass); + } + } + + } + } + +} + +void ProductQuantizer::compute_codes (const float * x, + uint8_t * codes, + size_t n) const +{ + // process by blocks to avoid using too much RAM + size_t bs = 256 * 1024; + if (n > bs) { + for (size_t i0 = 0; 
i0 < n; i0 += bs) { + size_t i1 = std::min(i0 + bs, n); + compute_codes (x + d * i0, codes + code_size * i0, i1 - i0); + } + return; + } + + if (dsub < 16) { // simple direct computation + +#pragma omp parallel for + for (size_t i = 0; i < n; i++) + compute_code (x + i * d, codes + i * code_size); + + } else { // worthwile to use BLAS + float *dis_tables = new float [n * ksub * M]; + ScopeDeleter del (dis_tables); + compute_distance_tables (n, x, dis_tables); + +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + uint8_t * code = codes + i * code_size; + const float * tab = dis_tables + i * ksub * M; + compute_code_from_distance_table (tab, code); + } + } +} + + +void ProductQuantizer::compute_distance_table (const float * x, + float * dis_table) const +{ + size_t m; + + for (m = 0; m < M; m++) { + fvec_L2sqr_ny (dis_table + m * ksub, + x + m * dsub, + get_centroids(m, 0), + dsub, + ksub); + } +} + +void ProductQuantizer::compute_inner_prod_table (const float * x, + float * dis_table) const +{ + size_t m; + + for (m = 0; m < M; m++) { + fvec_inner_products_ny (dis_table + m * ksub, + x + m * dsub, + get_centroids(m, 0), + dsub, + ksub); + } +} + + +void ProductQuantizer::compute_distance_tables ( + size_t nx, + const float * x, + float * dis_tables) const +{ + + if (dsub < 16) { + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + compute_distance_table (x + i * d, dis_tables + i * ksub * M); + } + + } else { // use BLAS + + for (int m = 0; m < M; m++) { + pairwise_L2sqr (dsub, + nx, x + dsub * m, + ksub, centroids.data() + m * dsub * ksub, + dis_tables + ksub * m, + d, dsub, ksub * M); + } + } +} + +void ProductQuantizer::compute_inner_prod_tables ( + size_t nx, + const float * x, + float * dis_tables) const +{ + + if (dsub < 16) { + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + compute_inner_prod_table (x + i * d, dis_tables + i * ksub * M); + } + + } else { // use BLAS + + // compute distance tables + for (int m = 0; m < M; m++) { + FINTEGER ldc = ksub * M, nxi = nx, ksubi = ksub, + dsubi = dsub, di = d; + float one = 1.0, zero = 0; + + sgemm_ ("Transposed", "Not transposed", + &ksubi, &nxi, &dsubi, + &one, ¢roids [m * dsub * ksub], &dsubi, + x + dsub * m, &di, + &zero, dis_tables + ksub * m, &ldc); + } + + } +} + +template +static void pq_knn_search_with_tables ( + const ProductQuantizer& pq, + size_t nbits, + const float *dis_tables, + const uint8_t * codes, + const size_t ncodes, + HeapArray * res, + bool init_finalize_heap) +{ + size_t k = res->k, nx = res->nh; + size_t ksub = pq.ksub, M = pq.M; + + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + /* query preparation for asymmetric search: compute look-up tables */ + const float* dis_table = dis_tables + i * ksub * M; + + /* Compute distances and keep smallest values */ + int64_t * __restrict heap_ids = res->ids + i * k; + float * __restrict heap_dis = res->val + i * k; + + if (init_finalize_heap) { + heap_heapify (k, heap_dis, heap_ids); + } + + switch (nbits) { + case 8: + pq_estimators_from_tables (pq, + codes, ncodes, + dis_table, + k, heap_dis, heap_ids); + break; + + case 16: + pq_estimators_from_tables (pq, + (uint16_t*)codes, ncodes, + dis_table, + k, heap_dis, heap_ids); + break; + + default: + pq_estimators_from_tables_generic (pq, + nbits, + codes, ncodes, + dis_table, + k, heap_dis, heap_ids); + break; + } + + if (init_finalize_heap) { + heap_reorder (k, heap_dis, heap_ids); + } + } +} + +void ProductQuantizer::search (const float * __restrict x, + size_t nx, + 
const uint8_t * codes, + const size_t ncodes, + float_maxheap_array_t * res, + bool init_finalize_heap) const +{ + FAISS_THROW_IF_NOT (nx == res->nh); + std::unique_ptr dis_tables(new float [nx * ksub * M]); + compute_distance_tables (nx, x, dis_tables.get()); + + pq_knn_search_with_tables> ( + *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap); +} + +void ProductQuantizer::search_ip (const float * __restrict x, + size_t nx, + const uint8_t * codes, + const size_t ncodes, + float_minheap_array_t * res, + bool init_finalize_heap) const +{ + FAISS_THROW_IF_NOT (nx == res->nh); + std::unique_ptr dis_tables(new float [nx * ksub * M]); + compute_inner_prod_tables (nx, x, dis_tables.get()); + + pq_knn_search_with_tables > ( + *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap); +} + + + +static float sqr (float x) { + return x * x; +} + +void ProductQuantizer::compute_sdc_table () +{ + sdc_table.resize (M * ksub * ksub); + + for (int m = 0; m < M; m++) { + + const float *cents = centroids.data() + m * ksub * dsub; + float * dis_tab = sdc_table.data() + m * ksub * ksub; + + // TODO optimize with BLAS + for (int i = 0; i < ksub; i++) { + const float *centi = cents + i * dsub; + for (int j = 0; j < ksub; j++) { + float accu = 0; + const float *centj = cents + j * dsub; + for (int k = 0; k < dsub; k++) + accu += sqr (centi[k] - centj[k]); + dis_tab [i + j * ksub] = accu; + } + } + } +} + +void ProductQuantizer::search_sdc (const uint8_t * qcodes, + size_t nq, + const uint8_t * bcodes, + const size_t nb, + float_maxheap_array_t * res, + bool init_finalize_heap) const +{ + FAISS_THROW_IF_NOT (sdc_table.size() == M * ksub * ksub); + FAISS_THROW_IF_NOT (nbits == 8); + size_t k = res->k; + + +#pragma omp parallel for + for (size_t i = 0; i < nq; i++) { + + /* Compute distances and keep smallest values */ + idx_t * heap_ids = res->ids + i * k; + float * heap_dis = res->val + i * k; + const uint8_t * qcode = qcodes + i * code_size; + + if (init_finalize_heap) + maxheap_heapify (k, heap_dis, heap_ids); + + const uint8_t * bcode = bcodes; + for (size_t j = 0; j < nb; j++) { + float dis = 0; + const float * tab = sdc_table.data(); + for (int m = 0; m < M; m++) { + dis += tab[bcode[m] + qcode[m] * ksub]; + tab += ksub * ksub; + } + if (dis < heap_dis[0]) { + maxheap_pop (k, heap_dis, heap_ids); + maxheap_push (k, heap_dis, heap_ids, dis, j); + } + bcode += code_size; + } + + if (init_finalize_heap) + maxheap_reorder (k, heap_dis, heap_ids); + } + +} + + +ProductQuantizer::PQEncoderGeneric::PQEncoderGeneric(uint8_t *code, int nbits, + uint8_t offset) + : code(code), offset(offset), nbits(nbits), reg(0) { + assert(nbits <= 64); + if (offset > 0) { + reg = (*code & ((1 << offset) - 1)); + } +} + +void ProductQuantizer::PQEncoderGeneric::encode(uint64_t x) { + reg |= (uint8_t)(x << offset); + x >>= (8 - offset); + if (offset + nbits >= 8) { + *code++ = reg; + + for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { + *code++ = (uint8_t)x; + x >>= 8; + } + + offset += nbits; + offset &= 7; + reg = (uint8_t)x; + } else { + offset += nbits; + } +} + +ProductQuantizer::PQEncoderGeneric::~PQEncoderGeneric() { + if (offset > 0) { + *code = reg; + } +} + + +ProductQuantizer::PQEncoder8::PQEncoder8(uint8_t *code, int nbits) + : code(code) { + assert(8 == nbits); +} + +void ProductQuantizer::PQEncoder8::encode(uint64_t x) { + *code++ = (uint8_t)x; +} + + +ProductQuantizer::PQEncoder16::PQEncoder16(uint8_t *code, int nbits) + : code((uint16_t *)code) { + assert(16 == nbits); +} + 
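PQEncoderGeneric and PQDecoderGeneric above handle sub-quantizer indices that are not byte aligned. A standalone round-trip sketch for nbits = 6 (the include path is an assumption):

    #include <cassert>
    #include <cstdint>
    #include <vector>
    // #include <faiss/impl/ProductQuantizer.h>

    void roundtrip_6bit() {
        const int nbits = 6, M = 4;
        std::vector<uint8_t> code((nbits * M + 7) / 8, 0);  // 3 bytes, zero-initialized
        uint64_t vals[M] = {3, 17, 42, 63};                  // each value < 2^6

        {
            faiss::ProductQuantizer::PQEncoderGeneric enc(code.data(), nbits);
            for (int m = 0; m < M; m++) enc.encode(vals[m]);
        }   // the destructor flushes the last partially filled byte

        faiss::ProductQuantizer::PQDecoderGeneric dec(code.data(), nbits);
        for (int m = 0; m < M; m++) assert(dec.decode() == vals[m]);
    }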
+void ProductQuantizer::PQEncoder16::encode(uint64_t x) { + *code++ = (uint16_t)x; +} + + +ProductQuantizer::PQDecoderGeneric::PQDecoderGeneric(const uint8_t *code, + int nbits) + : code(code), + offset(0), + nbits(nbits), + mask((1ull << nbits) - 1), + reg(0) { + assert(nbits <= 64); +} + +uint64_t ProductQuantizer::PQDecoderGeneric::decode() { + if (offset == 0) { + reg = *code; + } + uint64_t c = (reg >> offset); + + if (offset + nbits >= 8) { + uint64_t e = 8 - offset; + ++code; + for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { + c |= ((uint64_t)(*code++) << e); + e += 8; + } + + offset += nbits; + offset &= 7; + if (offset > 0) { + reg = *code; + c |= ((uint64_t)reg << e); + } + } else { + offset += nbits; + } + + return c & mask; +} + + +ProductQuantizer::PQDecoder8::PQDecoder8(const uint8_t *code, int nbits) + : code(code) { + assert(8 == nbits); +} + +uint64_t ProductQuantizer::PQDecoder8::decode() { + return (uint64_t)(*code++); +} + + +ProductQuantizer::PQDecoder16::PQDecoder16(const uint8_t *code, int nbits) + : code((uint16_t *)code) { + assert(16 == nbits); +} + +uint64_t ProductQuantizer::PQDecoder16::decode() { + return (uint64_t)(*code++); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h new file mode 100644 index 0000000000..40066441bd --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h @@ -0,0 +1,242 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_PRODUCT_QUANTIZER_H +#define FAISS_PRODUCT_QUANTIZER_H + +#include + +#include + +#include +#include + +namespace faiss { + +/** Product Quantizer. Implemented only for METRIC_L2 */ +struct ProductQuantizer { + + using idx_t = Index::idx_t; + + size_t d; ///< size of the input vectors + size_t M; ///< number of subquantizers + size_t nbits; ///< number of bits per quantization index + + // values derived from the above + size_t dsub; ///< dimensionality of each subvector + size_t code_size; ///< bytes per indexed vector + size_t ksub; ///< number of centroids for each subquantizer + bool verbose; ///< verbose during training? + + /// initialization + enum train_type_t { + Train_default, + Train_hot_start, ///< the centroids are already initialized + Train_shared, ///< share dictionary accross PQ segments + Train_hypercube, ///< intialize centroids with nbits-D hypercube + Train_hypercube_pca, ///< intialize centroids with nbits-D hypercube + }; + train_type_t train_type; + + ClusteringParameters cp; ///< parameters used during clustering + + /// if non-NULL, use this index for assignment (should be of size + /// d / M) + Index *assign_index; + + /// Centroid table, size M * ksub * dsub + std::vector centroids; + + /// return the centroids associated with subvector m + float * get_centroids (size_t m, size_t i) { + return ¢roids [(m * ksub + i) * dsub]; + } + const float * get_centroids (size_t m, size_t i) const { + return ¢roids [(m * ksub + i) * dsub]; + } + + // Train the product quantizer on a set of points. 
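The derived sizes declared above follow directly from d, M and nbits; a worked example with illustrative values:

    faiss::ProductQuantizer pq(128, 8, 8);
    // set_derived_values() then gives:
    //   dsub      = d / M             = 16     floats per sub-vector
    //   ksub      = 1 << nbits        = 256    centroids per sub-quantizer
    //   code_size = (nbits * M + 7)/8 = 8      bytes per encoded vector
    //   centroids.size() = d * ksub   = 32768  floats in the codebook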
A clustering + // can be set on input to define non-default clustering parameters + void train (int n, const float *x); + + ProductQuantizer(size_t d, /* dimensionality of the input vectors */ + size_t M, /* number of subquantizers */ + size_t nbits); /* number of bit per subvector index */ + + ProductQuantizer (); + + /// compute derived values when d, M and nbits have been set + void set_derived_values (); + + /// Define the centroids for subquantizer m + void set_params (const float * centroids, int m); + + /// Quantize one vector with the product quantizer + void compute_code (const float * x, uint8_t * code) const ; + + /// same as compute_code for several vectors + void compute_codes (const float * x, + uint8_t * codes, + size_t n) const ; + + /// speed up code assignment using assign_index + /// (non-const because the index is changed) + void compute_codes_with_assign_index ( + const float * x, + uint8_t * codes, + size_t n); + + /// decode a vector from a given code (or n vectors if third argument) + void decode (const uint8_t *code, float *x) const; + void decode (const uint8_t *code, float *x, size_t n) const; + + /// If we happen to have the distance tables precomputed, this is + /// more efficient to compute the codes. + void compute_code_from_distance_table (const float *tab, + uint8_t *code) const; + + + /** Compute distance table for one vector. + * + * The distance table for x = [x_0 x_1 .. x_(M-1)] is a M * ksub + * matrix that contains + * + * dis_table (m, j) = || x_m - c_(m, j)||^2 + * for m = 0..M-1 and j = 0 .. ksub - 1 + * + * where c_(m, j) is the centroid no j of sub-quantizer m. + * + * @param x input vector size d + * @param dis_table output table, size M * ksub + */ + void compute_distance_table (const float * x, + float * dis_table) const; + + void compute_inner_prod_table (const float * x, + float * dis_table) const; + + + /** compute distance table for several vectors + * @param nx nb of input vectors + * @param x input vector size nx * d + * @param dis_table output table, size nx * M * ksub + */ + void compute_distance_tables (size_t nx, + const float * x, + float * dis_tables) const; + + void compute_inner_prod_tables (size_t nx, + const float * x, + float * dis_tables) const; + + + /** perform a search (L2 distance) + * @param x query vectors, size nx * d + * @param nx nb of queries + * @param codes database codes, size ncodes * code_size + * @param ncodes nb of nb vectors + * @param res heap array to store results (nh == nx) + * @param init_finalize_heap initialize heap (input) and sort (output)? 
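A sketch of how the search interface documented here is typically driven; xb, xq, nb, nq, k and the already-trained pq are assumptions rather than part of the diff, and the include paths assume a standard faiss layout:

    #include <cstdint>
    #include <vector>
    // #include <faiss/impl/ProductQuantizer.h>
    // #include <faiss/utils/Heap.h>

    // xb: nb database vectors, xq: nq query vectors, both of dimension pq.d
    void adc_search(const faiss::ProductQuantizer& pq,
                    const float* xb, size_t nb,
                    const float* xq, size_t nq, size_t k) {
        std::vector<uint8_t> codes(nb * pq.code_size);
        pq.compute_codes(xb, codes.data(), nb);          // encode the database once

        std::vector<faiss::Index::idx_t> ids(nq * k);
        std::vector<float> dis(nq * k);
        faiss::float_maxheap_array_t res = {nq, k, ids.data(), dis.data()};
        pq.search(xq, nq, codes.data(), nb, &res);       // L2 ADC search, top-k per query
    }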
+ */ + void search (const float * x, + size_t nx, + const uint8_t * codes, + const size_t ncodes, + float_maxheap_array_t *res, + bool init_finalize_heap = true) const; + + /** same as search, but with inner product similarity */ + void search_ip (const float * x, + size_t nx, + const uint8_t * codes, + const size_t ncodes, + float_minheap_array_t *res, + bool init_finalize_heap = true) const; + + + /// Symmetric Distance Table + std::vector sdc_table; + + // intitialize the SDC table from the centroids + void compute_sdc_table (); + + void search_sdc (const uint8_t * qcodes, + size_t nq, + const uint8_t * bcodes, + const size_t ncodes, + float_maxheap_array_t * res, + bool init_finalize_heap = true) const; + + struct PQEncoderGeneric { + uint8_t *code; ///< code for this vector + uint8_t offset; + const int nbits; ///< number of bits per subquantizer index + + uint8_t reg; + + PQEncoderGeneric(uint8_t *code, int nbits, uint8_t offset = 0); + + void encode(uint64_t x); + + ~PQEncoderGeneric(); + }; + + + struct PQEncoder8 { + uint8_t *code; + + PQEncoder8(uint8_t *code, int nbits); + + void encode(uint64_t x); + }; + + struct PQEncoder16 { + uint16_t *code; + + PQEncoder16(uint8_t *code, int nbits); + + void encode(uint64_t x); + }; + + + struct PQDecoderGeneric { + const uint8_t *code; + uint8_t offset; + const int nbits; + const uint64_t mask; + uint8_t reg; + + PQDecoderGeneric(const uint8_t *code, int nbits); + + uint64_t decode(); + }; + + struct PQDecoder8 { + const uint8_t *code; + + PQDecoder8(const uint8_t *code, int nbits); + + uint64_t decode(); + }; + + struct PQDecoder16 { + const uint16_t *code; + + PQDecoder16(const uint8_t *code, int nbits); + + uint64_t decode(); + }; + +}; + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp new file mode 100644 index 0000000000..06bfcb1a79 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp @@ -0,0 +1,1625 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +#include + +#ifdef __SSE__ +#include +#endif + +#include +#include +#include + +namespace faiss { + +/******************************************************************* + * ScalarQuantizer implementation + * + * The main source of complexity is to support combinations of 4 + * variants without incurring runtime tests or virtual function calls: + * + * - 4 / 8 bits per code component + * - uniform / non-uniform + * - IP / L2 distance search + * - scalar / AVX distance computation + * + * The appropriate Quantizer object is returned via select_quantizer + * that hides the template mess. + ********************************************************************/ + +#ifdef __AVX__ +#define USE_AVX +#endif + + + +namespace { + +typedef Index::idx_t idx_t; +typedef ScalarQuantizer::QuantizerType QuantizerType; +typedef ScalarQuantizer::RangeStat RangeStat; +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; + + +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). 
+ */ + +struct Codec8bit { + + static void encode_component (float x, uint8_t *code, int i) { + code[i] = (int)(255 * x); + } + + static float decode_component (const uint8_t *code, int i) { + return (code[i] + 0.5f) / 255.0f; + } + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + uint64_t c8 = *(uint64_t*)(code + i); + __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8)); + __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32)); + // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); + __m256i i8 = _mm256_castsi128_si256 (c4lo); + i8 = _mm256_insertf128_si256 (i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps (i8); + __m256 half = _mm256_set1_ps (0.5f); + f8 += half; + __m256 one_255 = _mm256_set1_ps (1.f / 255.f); + return f8 * one_255; + } +#endif +}; + + +struct Codec4bit { + + static void encode_component (float x, uint8_t *code, int i) { + code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); + } + + static float decode_component (const uint8_t *code, int i) { + return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; + } + + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + uint32_t c4 = *(uint32_t*)(code + (i >> 1)); + uint32_t mask = 0x0f0f0f0f; + uint32_t c4ev = c4 & mask; + uint32_t c4od = (c4 >> 4) & mask; + + // the 8 lower bytes of c8 contain the values + __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev), + _mm_set1_epi32(c4od)); + __m128i c4lo = _mm_cvtepu8_epi32 (c8); + __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4)); + __m256i i8 = _mm256_castsi128_si256 (c4lo); + i8 = _mm256_insertf128_si256 (i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps (i8); + __m256 half = _mm256_set1_ps (0.5f); + f8 += half; + __m256 one_255 = _mm256_set1_ps (1.f / 15.f); + return f8 * one_255; + } +#endif +}; + +struct Codec6bit { + + static void encode_component (float x, uint8_t *code, int i) { + int bits = (int)(x * 63.0); + code += (i >> 2) * 3; + switch(i & 3) { + case 0: + code[0] |= bits; + break; + case 1: + code[0] |= bits << 6; + code[1] |= bits >> 2; + break; + case 2: + code[1] |= bits << 4; + code[2] |= bits >> 4; + break; + case 3: + code[2] |= bits << 2; + break; + } + } + + static float decode_component (const uint8_t *code, int i) { + uint8_t bits; + code += (i >> 2) * 3; + switch(i & 3) { + case 0: + bits = code[0] & 0x3f; + break; + case 1: + bits = code[0] >> 6; + bits |= (code[1] & 0xf) << 2; + break; + case 2: + bits = code[1] >> 4; + bits |= (code[2] & 3) << 4; + break; + case 3: + bits = code[2] >> 2; + break; + } + return (bits + 0.5f) / 63.0f; + } + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + return _mm256_set_ps + (decode_component(code, i + 7), + decode_component(code, i + 6), + decode_component(code, i + 5), + decode_component(code, i + 4), + decode_component(code, i + 3), + decode_component(code, i + 2), + decode_component(code, i + 1), + decode_component(code, i + 0)); + } +#endif +}; + + + +#ifdef USE_AVX + + +uint16_t encode_fp16 (float x) { + __m128 xf = _mm_set1_ps (x); + __m128i xi = _mm_cvtps_ph ( + xf, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); + return _mm_cvtsi128_si32 (xi) & 0xffff; +} + + +float decode_fp16 (uint16_t x) { + __m128i xi = _mm_set1_epi16 (x); + __m128 xf = _mm_cvtph_ps (xi); + return _mm_cvtss_f32 (xf); +} + +#else + +// non-intrinsic FP16 <-> FP32 code adapted from +// https://github.com/ispc/ispc/blob/master/stdlib.ispc + +float floatbits (uint32_t x) { + void *xptr = &x; + return *(float*)xptr; +} + +uint32_t 
intbits (float f) { + void *fptr = &f; + return *(uint32_t*)fptr; +} + + +uint16_t encode_fp16 (float f) { + + // via Fabian "ryg" Giesen. + // https://gist.github.com/2156668 + uint32_t sign_mask = 0x80000000u; + int32_t o; + + uint32_t fint = intbits(f); + uint32_t sign = fint & sign_mask; + fint ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code (since + // there's no unsigned PCMPGTD). + + // Inf or NaN (all exponent bits set) + // NaN->qNaN and Inf->Inf + // unconditional assignment here, will override with right value for + // the regular case below. + uint32_t f32infty = 255u << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + + const uint32_t round_mask = ~0xfffu; + const uint32_t magic = 15u << 23; + + // Shift exponent down, denormalize if necessary. + // NOTE This represents half-float denormals using single + // precision denormals. The main reason to do this is that + // there's no shift with per-lane variable shifts in SSE*, which + // we'd otherwise need. It has some funky side effects though: + // - This conversion will actually respect the FTZ (Flush To Zero) + // flag in MXCSR - if it's set, no half-float denormals will be + // generated. I'm honestly not sure whether this is good or + // bad. It's definitely interesting. + // - If the underlying HW doesn't support denormals (not an issue + // with Intel CPUs, but might be a problem on GPUs or PS3 SPUs), + // you will always get flush-to-zero behavior. This is bad, + // unless you're on a CPU where you don't care. + // - Denormals tend to be slow. FP32 denormals are rare in + // practice outside of things like recursive filters in DSP - + // not a typical half-float application. Whether FP16 denormals + // are rare in practice, I don't know. Whatever slow path your + // HW may or may not have for denormals, this may well hit it. + float fscale = floatbits(fint & round_mask) * floatbits(magic); + fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u)); + int32_t fint2 = intbits(fscale) - round_mask; + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + +float decode_fp16 (uint16_t h) { + + // https://gist.github.com/2144712 + // Fabian "ryg" Giesen. + + const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fffu)) << 13; // exponent/mantissa bits + int32_t exp = shifted_exp & o; // just the exponent + o += (int32_t)(127 - 15) << 23; // exponent adjust + + int32_t infnan_val = o + ((int32_t)(128 - 16) << 23); + int32_t zerodenorm_val = intbits( + floatbits(o + (1u<<23)) - floatbits(113u << 23)); + int32_t reg_val = (exp == 0) ? zerodenorm_val : o; + + int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16; + return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); +} + +#endif + + + +/******************************************************************* + * Quantizer: normalizes scalar vector components, then passes them + * through a codec + *******************************************************************/ + + + + + +template +struct QuantizerTemplate {}; + + +template +struct QuantizerTemplate: ScalarQuantizer::Quantizer { + const size_t d; + const float vmin, vdiff; + + QuantizerTemplate(size_t d, const std::vector &trained): + d(d), vmin(trained[0]), vdiff(trained[1]) + { + } + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = (x[i] - vmin) / vdiff; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin + xi * vdiff; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + float xi = Codec::decode_component (code, i); + return vmin + xi * vdiff; + } + +}; + + + +#ifdef USE_AVX + +template +struct QuantizerTemplate: QuantizerTemplate { + + QuantizerTemplate (size_t d, const std::vector &trained): + QuantizerTemplate (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m256 xi = Codec::decode_8_components (code, i); + return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff); + } + +}; + +#endif + + + +template +struct QuantizerTemplate: ScalarQuantizer::Quantizer { + const size_t d; + const float *vmin, *vdiff; + + QuantizerTemplate (size_t d, const std::vector &trained): + d(d), vmin(trained.data()), vdiff(trained.data() + d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = (x[i] - vmin[i]) / vdiff[i]; + if (xi < 0) + xi = 0; + if (xi > 1.0) + xi = 1.0; + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin[i] + xi * vdiff[i]; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + float xi = Codec::decode_component (code, i); + return vmin[i] + xi * vdiff[i]; + } + +}; + + +#ifdef USE_AVX + +template +struct QuantizerTemplate: QuantizerTemplate { + + QuantizerTemplate (size_t d, const std::vector &trained): + QuantizerTemplate (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m256 xi = Codec::decode_8_components (code, i); + return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i); + } + + +}; + +#endif + +/******************************************************************* + * FP16 quantizer + *******************************************************************/ + +template +struct QuantizerFP16 {}; + +template<> +struct QuantizerFP16<1>: ScalarQuantizer::Quantizer { + const size_t d; + + QuantizerFP16(size_t d, const std::vector & /* unused */): + d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_fp16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_fp16(((uint16_t*)code)[i]); + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + 
return decode_fp16(((uint16_t*)code)[i]); + } + +}; + +#ifdef USE_AVX + +template<> +struct QuantizerFP16<8>: QuantizerFP16<1> { + + QuantizerFP16 (size_t d, const std::vector &trained): + QuantizerFP16<1> (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i)); + return _mm256_cvtph_ps (codei); + } + +}; + +#endif + +/******************************************************************* + * 8bit_direct quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirect {}; + +template<> +struct Quantizer8bitDirect<1>: ScalarQuantizer::Quantizer { + const size_t d; + + Quantizer8bitDirect(size_t d, const std::vector & /* unused */): + d(d) {} + + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)x[i]; + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i]; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + return code[i]; + } + +}; + +#ifdef USE_AVX + +template<> +struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> { + + Quantizer8bitDirect (size_t d, const std::vector &trained): + Quantizer8bitDirect<1> (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32 + return _mm256_cvtepi32_ps (y8); // 8 * float32 + } + +}; + +#endif + + +template +ScalarQuantizer::Quantizer *select_quantizer_1 ( + QuantizerType qtype, + size_t d, const std::vector & trained) +{ + switch(qtype) { + case ScalarQuantizer::QT_8bit: + return new QuantizerTemplate(d, trained); + case ScalarQuantizer::QT_6bit: + return new QuantizerTemplate(d, trained); + case ScalarQuantizer::QT_4bit: + return new QuantizerTemplate(d, trained); + case ScalarQuantizer::QT_8bit_uniform: + return new QuantizerTemplate(d, trained); + case ScalarQuantizer::QT_4bit_uniform: + return new QuantizerTemplate(d, trained); + case ScalarQuantizer::QT_fp16: + return new QuantizerFP16 (d, trained); + case ScalarQuantizer::QT_8bit_direct: + return new Quantizer8bitDirect (d, trained); + } + FAISS_THROW_MSG ("unknown qtype"); +} + + + + +/******************************************************************* + * Quantizer range training + */ + +static float sqr (float x) { + return x * x; +} + + +void train_Uniform(RangeStat rs, float rs_arg, + idx_t n, int k, const float *x, + std::vector & trained) +{ + trained.resize (2); + float & vmin = trained[0]; + float & vmax = trained[1]; + + if (rs == ScalarQuantizer::RS_minmax) { + vmin = HUGE_VAL; vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) vmin = x[i]; + if (x[i] > vmax) vmax = x[i]; + } + float vexp = (vmax - vmin) * rs_arg; + vmin -= vexp; + vmax += vexp; + } else if (rs == ScalarQuantizer::RS_meanstd) { + double sum = 0, sum2 = 0; + for (size_t i = 0; i < n; i++) { + sum += x[i]; + sum2 += x[i] * x[i]; + } + float mean = sum / n; + float var = sum2 / n - mean * mean; + float std = var <= 0 ? 
1.0 : sqrt(var); + + vmin = mean - std * rs_arg ; + vmax = mean + std * rs_arg ; + } else if (rs == ScalarQuantizer::RS_quantiles) { + std::vector x_copy(n); + memcpy(x_copy.data(), x, n * sizeof(*x)); + // TODO just do a qucikselect + std::sort(x_copy.begin(), x_copy.end()); + int o = int(rs_arg * n); + if (o < 0) o = 0; + if (o > n - o) o = n / 2; + vmin = x_copy[o]; + vmax = x_copy[n - 1 - o]; + + } else if (rs == ScalarQuantizer::RS_optim) { + float a, b; + float sx = 0; + { + vmin = HUGE_VAL, vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) vmin = x[i]; + if (x[i] > vmax) vmax = x[i]; + sx += x[i]; + } + b = vmin; + a = (vmax - vmin) / (k - 1); + } + int verbose = false; + int niter = 2000; + float last_err = -1; + int iter_last_err = 0; + for (int it = 0; it < niter; it++) { + float sn = 0, sn2 = 0, sxn = 0, err1 = 0; + + for (idx_t i = 0; i < n; i++) { + float xi = x[i]; + float ni = floor ((xi - b) / a + 0.5); + if (ni < 0) ni = 0; + if (ni >= k) ni = k - 1; + err1 += sqr (xi - (ni * a + b)); + sn += ni; + sn2 += ni * ni; + sxn += ni * xi; + } + + if (err1 == last_err) { + iter_last_err ++; + if (iter_last_err == 16) break; + } else { + last_err = err1; + iter_last_err = 0; + } + + float det = sqr (sn) - sn2 * n; + + b = (sn * sxn - sn2 * sx) / det; + a = (sn * sx - n * sxn) / det; + if (verbose) { + printf ("it %d, err1=%g \r", it, err1); + fflush(stdout); + } + } + if (verbose) printf("\n"); + + vmin = b; + vmax = b + a * (k - 1); + + } else { + FAISS_THROW_MSG ("Invalid qtype"); + } + vmax -= vmin; +} + +void train_NonUniform(RangeStat rs, float rs_arg, + idx_t n, int d, int k, const float *x, + std::vector & trained) +{ + + trained.resize (2 * d); + float * vmin = trained.data(); + float * vmax = trained.data() + d; + if (rs == ScalarQuantizer::RS_minmax) { + memcpy (vmin, x, sizeof(*x) * d); + memcpy (vmax, x, sizeof(*x) * d); + for (size_t i = 1; i < n; i++) { + const float *xi = x + i * d; + for (size_t j = 0; j < d; j++) { + if (xi[j] < vmin[j]) vmin[j] = xi[j]; + if (xi[j] > vmax[j]) vmax[j] = xi[j]; + } + } + float *vdiff = vmax; + for (size_t j = 0; j < d; j++) { + float vexp = (vmax[j] - vmin[j]) * rs_arg; + vmin[j] -= vexp; + vmax[j] += vexp; + vdiff [j] = vmax[j] - vmin[j]; + } + } else { + // transpose + std::vector xt(n * d); + for (size_t i = 1; i < n; i++) { + const float *xi = x + i * d; + for (size_t j = 0; j < d; j++) { + xt[j * n + i] = xi[j]; + } + } + std::vector trained_d(2); +#pragma omp parallel for + for (size_t j = 0; j < d; j++) { + train_Uniform(rs, rs_arg, + n, k, xt.data() + j * n, + trained_d); + vmin[j] = trained_d[0]; + vmax[j] = trained_d[1]; + } + } +} + + + +/******************************************************************* + * Similarity: gets vector components and computes a similarity wrt. a + * query vector stored in the object. The data fields just encapsulate + * an accumulator. 
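For orientation, a standalone sketch of what the trained range feeds into (a hypothetical helper class mirroring the RS_minmax branch above plus an 8-bit codec): components are mapped into [0, 1] over [vmin, vmin + vdiff] and stored as one byte, so the worst-case reconstruction error is about vdiff / 510 per component.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hypothetical standalone analogue of train_Uniform(RS_minmax) + an 8-bit codec.
    struct UniformSQ8 {
        float vmin = 0.f, vdiff = 1.f;

        void train(const std::vector<float>& x) {         // RS_minmax with rs_arg = 0
            auto mm = std::minmax_element(x.begin(), x.end());
            vmin  = *mm.first;
            vdiff = *mm.second - vmin;                     // assumed > 0 for this sketch
        }
        uint8_t encode(float v) const {                    // normalise to [0,1], keep 8 bits
            float xi = (v - vmin) / vdiff;
            xi = std::min(1.f, std::max(0.f, xi));
            return (uint8_t)(255.f * xi);
        }
        float decode(uint8_t c) const {                    // max error ~ vdiff / 510
            return vmin + (c + 0.5f) / 255.f * vdiff;
        }
    };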
+ */ + +template +struct SimilarityL2 {}; + + +template<> +struct SimilarityL2<1> { + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2 (const float * y): y(y) {} + + /******* scalar accumulator *******/ + + float accu; + + void begin () { + accu = 0; + yi = y; + } + + void add_component (float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; + } + + void add_component_2 (float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; + } + + float result () { + return accu; + } +}; + + +#ifdef USE_AVX +template<> +struct SimilarityL2<8> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2 (const float * y): y(y) {} + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + __m256 tmp = yiv - x; + accu8 += tmp * tmp; + } + + void add_8_components_2 (__m256 x, __m256 y) { + __m256 tmp = y - x; + accu8 += tmp * tmp; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } + +}; + +#endif + + +template +struct SimilarityIP {}; + + +template<> +struct SimilarityIP<1> { + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + const float *y, *yi; + + float accu; + + explicit SimilarityIP (const float * y): + y (y) {} + + void begin () { + accu = 0; + yi = y; + } + + void add_component (float x) { + accu += *yi++ * x; + } + + void add_component_2 (float x1, float x2) { + accu += x1 * x2; + } + + float result () { + return accu; + } +}; + +#ifdef USE_AVX + +template<> +struct SimilarityIP<8> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP (const float * y): + y (y) {} + + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + accu8 += yiv * x; + } + + void add_8_components_2 (__m256 x1, __m256 x2) { + accu8 += x1 * x2; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } +}; +#endif + + +/******************************************************************* + * DistanceComputer: combines a similarity and a quantizer to do + * code-to-vector or code-to-code comparisons + *******************************************************************/ + +template +struct DCTemplate : SQDistanceComputer {}; + +template +struct DCTemplate : SQDistanceComputer +{ + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector &trained): + quant(d, trained) + {} + + float compute_distance(const float* x, const uint8_t* code) const { + + Similarity sim(x); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float xi = quant.reconstruct_component(code, i); + sim.add_component(xi); + } + return sim.result(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const 
{ + Similarity sim(nullptr); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float x1 = quant.reconstruct_component(code1, i); + float x2 = quant.reconstruct_component(code2, i); + sim.add_component_2(x1, x2); + } + return sim.result(); + } + + void set_query (const float *x) final { + q = x; + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_distance (q, code); + } + +}; + +#ifdef USE_AVX + +template +struct DCTemplate : SQDistanceComputer +{ + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector &trained): + quant(d, trained) + {} + + float compute_distance(const float* x, const uint8_t* code) const { + + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + __m256 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + __m256 x1 = quant.reconstruct_8_components(code1, i); + __m256 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query (const float *x) final { + q = x; + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_distance (q, code); + } + +}; + +#endif + + + +/******************************************************************* + * DistanceComputerByte: computes distances in the integer domain + *******************************************************************/ + +template +struct DistanceComputerByte : SQDistanceComputer {}; + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector &): d(d), tmp(d) { + } + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query (const float *x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_code_distance (tmp.data(), code); + } + +}; + +#ifdef USE_AVX + + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector 
tmp; + + DistanceComputerByte(int d, const std::vector &): d(d), tmp(d) { + } + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + // __m256i accu = _mm256_setzero_ps (); + __m256i accu = _mm256_setzero_si256 (); + for (int i = 0; i < d; i += 16) { + // load 16 bytes, convert to 16 uint16_t + __m256i c1 = _mm256_cvtepu8_epi16 + (_mm_loadu_si128((__m128i*)(code1 + i))); + __m256i c2 = _mm256_cvtepu8_epi16 + (_mm_loadu_si128((__m128i*)(code2 + i))); + __m256i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm256_madd_epi16(c1, c2); + } else { + __m256i diff = _mm256_sub_epi16(c1, c2); + prod32 = _mm256_madd_epi16(diff, diff); + } + accu = _mm256_add_epi32 (accu, prod32); + + } + __m128i sum = _mm256_extractf128_si256(accu, 0); + sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1)); + sum = _mm_hadd_epi32 (sum, sum); + sum = _mm_hadd_epi32 (sum, sum); + return _mm_cvtsi128_si32 (sum); + } + + void set_query (const float *x) final { + /* + for (int i = 0; i < d; i += 8) { + __m256 xi = _mm256_loadu_ps (x + i); + __m256i ci = _mm256_cvtps_epi32(xi); + */ + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_code_distance (tmp.data(), code); + } + + +}; + +#endif + +/******************************************************************* + * select_distance_computer: runtime selection of template + * specialization + *******************************************************************/ + + +template +SQDistanceComputer *select_distance_computer ( + QuantizerType qtype, + size_t d, const std::vector & trained) +{ + constexpr int SIMDWIDTH = Sim::simdwidth; + switch(qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return new DCTemplate, + Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_4bit_uniform: + return new DCTemplate, + Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_8bit: + return new DCTemplate, + Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_6bit: + return new DCTemplate, + Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_4bit: + return new DCTemplate, + Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_fp16: + return new DCTemplate + , Sim, SIMDWIDTH>(d, trained); + + case ScalarQuantizer::QT_8bit_direct: + if (d % 16 == 0) { + return new DistanceComputerByte(d, trained); + } else { + return new DCTemplate + , Sim, SIMDWIDTH>(d, trained); + } + } + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + + + +} // anonymous namespace + + + +/******************************************************************* + * ScalarQuantizer implementation + ********************************************************************/ + + + +ScalarQuantizer::ScalarQuantizer + (size_t d, QuantizerType qtype): + qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d) +{ + switch (qtype) { + case QT_8bit: + case QT_8bit_uniform: + case QT_8bit_direct: + code_size = d; + break; + case QT_4bit: + case QT_4bit_uniform: + code_size = (d + 1) / 2; + break; + case QT_6bit: + code_size = (d * 6 + 7) / 8; + 
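            // e.g. for d = 128: QT_8bit -> 128 bytes, QT_4bit -> 64, QT_6bit -> 96
            // (bits are packed and rounded up to whole bytes), QT_fp16 -> 256,
            // compared with 512 bytes for the raw float32 vector.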
break; + case QT_fp16: + code_size = d * 2; + break; + } + +} + +ScalarQuantizer::ScalarQuantizer (): + qtype(QT_8bit), + rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0) +{} + +void ScalarQuantizer::train (size_t n, const float *x) +{ + int bit_per_dim = + qtype == QT_4bit_uniform ? 4 : + qtype == QT_4bit ? 4 : + qtype == QT_6bit ? 6 : + qtype == QT_8bit_uniform ? 8 : + qtype == QT_8bit ? 8 : -1; + + switch (qtype) { + case QT_4bit_uniform: case QT_8bit_uniform: + train_Uniform (rangestat, rangestat_arg, + n * d, 1 << bit_per_dim, x, trained); + break; + case QT_4bit: case QT_8bit: case QT_6bit: + train_NonUniform (rangestat, rangestat_arg, + n, d, 1 << bit_per_dim, x, trained); + break; + case QT_fp16: + case QT_8bit_direct: + // no training necessary + break; + } +} + +void ScalarQuantizer::train_residual(size_t n, + const float *x, + Index *quantizer, + bool by_residual, + bool verbose) +{ + const float * x_in = x; + + // 100k points more than enough + x = fvecs_maybe_subsample ( + d, (size_t*)&n, 100000, + x, verbose, 1234); + + ScopeDeleter del_x (x_in == x ? nullptr : x); + + if (by_residual) { + std::vector idx(n); + quantizer->assign (n, x, idx.data()); + + std::vector residuals(n * d); + quantizer->compute_residual_n (n, x, residuals.data(), idx.data()); + + train (n, residuals.data()); + } else { + train (n, x); + } +} + + +ScalarQuantizer::Quantizer *ScalarQuantizer::select_quantizer () const +{ +#ifdef USE_AVX + if (d % 8 == 0) { + return select_quantizer_1<8> (qtype, d, trained); + } else +#endif + { + return select_quantizer_1<1> (qtype, d, trained); + } +} + + +void ScalarQuantizer::compute_codes (const float * x, + uint8_t * codes, + size_t n) const +{ + std::unique_ptr squant(select_quantizer ()); + + memset (codes, 0, code_size * n); +#pragma omp parallel for + for (size_t i = 0; i < n; i++) + squant->encode_vector (x + i * d, codes + i * code_size); +} + +void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const +{ + std::unique_ptr squant(select_quantizer ()); + +#pragma omp parallel for + for (size_t i = 0; i < n; i++) + squant->decode_vector (codes + i * code_size, x + i * d); +} + + +SQDistanceComputer * +ScalarQuantizer::get_distance_computer (MetricType metric) const +{ + FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); +#ifdef USE_AVX + if (d % 8 == 0) { + if (metric == METRIC_L2) { + return select_distance_computer > + (qtype, d, trained); + } else { + return select_distance_computer > + (qtype, d, trained); + } + } else +#endif + { + if (metric == METRIC_L2) { + return select_distance_computer > + (qtype, d, trained); + } else { + return select_distance_computer > + (qtype, d, trained); + } + } +} + + +/******************************************************************* + * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object + * + * It is an InvertedListScanner, but is designed to work with + * IndexScalarQuantizer as well. 
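Putting the pieces above together, a minimal usage sketch; this is an illustration rather than code from the patch, the include path follows the vendored tree, and the public SQDistanceComputer fields are filled in by hand the way the scalar code path above expects.

    #include <faiss/impl/ScalarQuantizer.h>
    #include <memory>
    #include <vector>

    // Encode a batch of vectors with 8-bit scalar quantization, then compare a
    // query against the stored codes through the selected distance computer.
    void sq_usage_sketch(const std::vector<float>& xb,
                         const std::vector<float>& xq, size_t d) {
        size_t n = xb.size() / d;
        faiss::ScalarQuantizer sq(d, faiss::ScalarQuantizer::QT_8bit);
        sq.train(n, xb.data());                        // learns per-dimension [vmin, vmax]

        std::vector<uint8_t> codes(n * sq.code_size);  // code_size == d bytes for QT_8bit
        sq.compute_codes(xb.data(), codes.data(), n);

        std::unique_ptr<faiss::ScalarQuantizer::SQDistanceComputer> dc(
            sq.get_distance_computer(faiss::METRIC_L2));
        dc->codes = codes.data();                      // public fields of SQDistanceComputer
        dc->code_size = sq.code_size;
        dc->set_query(xq.data());
        float dist0 = (*dc)(0);                        // L2 distance: query vs. vector #0
        (void)dist0;
    }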
+ ********************************************************************/ + +namespace { + + +template +struct IVFSQScannerIP: InvertedListScanner { + DCClass dc; + bool store_pairs, by_residual; + + size_t code_size; + + idx_t list_no; /// current list (set to 0 for Flat index + float accu0; /// added to all distances + + IVFSQScannerIP(int d, const std::vector & trained, + size_t code_size, bool store_pairs, + bool by_residual): + dc(d, trained), store_pairs(store_pairs), + by_residual(by_residual), + code_size(code_size), list_no(0), accu0(0) + {} + + + void set_query (const float *query) override { + dc.set_query (query); + } + + void set_list (idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + accu0 = by_residual ? coarse_dis : 0; + } + + float distance_to_code (const uint8_t *code) const final { + return accu0 + dc.query_to_code (code); + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k) const override + { + size_t nup = 0; + + for (size_t j = 0; j < list_size; j++) { + + float accu = accu0 + dc.query_to_code (codes); + + if (accu > simi [0]) { + minheap_pop (k, simi, idxi); + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + minheap_push (k, simi, idxi, accu, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res) const override + { + for (size_t j = 0; j < list_size; j++) { + float accu = accu0 + dc.query_to_code (codes); + if (accu > radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add (accu, id); + } + codes += code_size; + } + } + + +}; + + +template +struct IVFSQScannerL2: InvertedListScanner { + + DCClass dc; + + bool store_pairs, by_residual; + size_t code_size; + const Index *quantizer; + idx_t list_no; /// current inverted list + const float *x; /// current query + + std::vector tmp; + + IVFSQScannerL2(int d, const std::vector & trained, + size_t code_size, const Index *quantizer, + bool store_pairs, bool by_residual): + dc(d, trained), store_pairs(store_pairs), by_residual(by_residual), + code_size(code_size), quantizer(quantizer), + list_no (0), x (nullptr), tmp (d) + { + } + + + void set_query (const float *query) override { + x = query; + if (!quantizer) { + dc.set_query (query); + } + } + + + void set_list (idx_t list_no, float /*coarse_dis*/) override { + if (by_residual) { + this->list_no = list_no; + // shift of x_in wrt centroid + quantizer->Index::compute_residual (x, tmp.data(), list_no); + dc.set_query (tmp.data ()); + } else { + dc.set_query (x); + } + } + + float distance_to_code (const uint8_t *code) const final { + return dc.query_to_code (code); + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k) const override + { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + + float dis = dc.query_to_code (codes); + + if (dis < simi [0]) { + maxheap_pop (k, simi, idxi); + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + maxheap_push (k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res) const override + { + for (size_t j = 0; j < list_size; j++) { + float dis = dc.query_to_code (codes); + if (dis < radius) { + int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; + res.add (dis, id); + } + codes += code_size; + } + } + + +}; + +template +InvertedListScanner* sel2_InvertedListScanner + (const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + if (DCClass::Sim::metric_type == METRIC_L2) { + return new IVFSQScannerL2(sq->d, sq->trained, sq->code_size, + quantizer, store_pairs, r); + } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { + return new IVFSQScannerIP(sq->d, sq->trained, sq->code_size, + store_pairs, r); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +template +InvertedListScanner* sel12_InvertedListScanner + (const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + using QuantizerClass = QuantizerTemplate; + using DCClass = DCTemplate; + return sel2_InvertedListScanner (sq, quantizer, store_pairs, r); +} + + + +template +InvertedListScanner* sel1_InvertedListScanner + (const ScalarQuantizer *sq, const Index *quantizer, + bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + switch(sq->qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_4bit_uniform: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_8bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_4bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_6bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_fp16: + return sel2_InvertedListScanner + , Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + case ScalarQuantizer::QT_8bit_direct: + if (sq->d % 16 == 0) { + return sel2_InvertedListScanner + > + (sq, quantizer, store_pairs, r); + } else { + return sel2_InvertedListScanner + , + Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + } + + } + + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + +template +InvertedListScanner* sel0_InvertedListScanner + (MetricType mt, const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool by_residual) +{ + if (mt == METRIC_L2) { + return sel1_InvertedListScanner > + (sq, quantizer, store_pairs, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner > + (sq, quantizer, store_pairs, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + + + +} // anonymous namespace + + +InvertedListScanner* ScalarQuantizer::select_InvertedListScanner + (MetricType mt, const Index *quantizer, + bool store_pairs, bool by_residual) const +{ +#ifdef USE_AVX + if (d % 8 == 0) { + return sel0_InvertedListScanner<8> + (mt, this, quantizer, store_pairs, by_residual); + } else +#endif + { + return sel0_InvertedListScanner<1> + (mt, this, quantizer, store_pairs, by_residual); + } +} + + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h new file mode 100644 index 0000000000..d5718b280f --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h @@ -0,0 +1,120 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#pragma once + +#include +#include + + +namespace faiss { + +/** + * The uniform quantizer has a range [vmin, vmax]. The range can be + * the same for all dimensions (uniform) or specific per dimension + * (default). + */ + +struct ScalarQuantizer { + + enum QuantizerType { + QT_8bit, ///< 8 bits per component + QT_4bit, ///< 4 bits per component + QT_8bit_uniform, ///< same, shared range for all dimensions + QT_4bit_uniform, + QT_fp16, + QT_8bit_direct, /// fast indexing of uint8s + QT_6bit, ///< 6 bits per component + }; + + QuantizerType qtype; + + /** The uniform encoder can estimate the range of representable + * values of the unform encoder using different statistics. Here + * rs = rangestat_arg */ + + // rangestat_arg. + enum RangeStat { + RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)] + RS_meanstd, ///< [mean - std * rs, mean + std * rs] + RS_quantiles, ///< [Q(rs), Q(1-rs)] + RS_optim, ///< alternate optimization of reconstruction error + }; + + RangeStat rangestat; + float rangestat_arg; + + /// dimension of input vectors + size_t d; + + /// bytes per vector + size_t code_size; + + /// trained values (including the range) + std::vector trained; + + ScalarQuantizer (size_t d, QuantizerType qtype); + ScalarQuantizer (); + + void train (size_t n, const float *x); + + /// Used by an IVF index to train based on the residuals + void train_residual (size_t n, + const float *x, + Index *quantizer, + bool by_residual, + bool verbose); + + /// same as compute_code for several vectors + void compute_codes (const float * x, + uint8_t * codes, + size_t n) const ; + + /// decode a vector from a given code (or n vectors if third argument) + void decode (const uint8_t *code, float *x, size_t n) const; + + + /***************************************************** + * Objects that provide methods for encoding/decoding, distance + * computation and inverted list scanning + *****************************************************/ + + struct Quantizer { + // encodes one vector. Assumes code is filled with 0s on input! + virtual void encode_vector(const float *x, uint8_t *code) const = 0; + virtual void decode_vector(const uint8_t *code, float *x) const = 0; + + virtual ~Quantizer() {} + }; + + Quantizer * select_quantizer() const; + + struct SQDistanceComputer: DistanceComputer { + + const float *q; + const uint8_t *codes; + size_t code_size; + + SQDistanceComputer (): q(nullptr), codes (nullptr), code_size (0) + {} + + }; + + SQDistanceComputer *get_distance_computer (MetricType metric = METRIC_L2) + const; + + InvertedListScanner *select_InvertedListScanner + (MetricType mt, const Index *quantizer, bool store_pairs, + bool by_residual=false) const; + +}; + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ThreadedIndex-inl.h b/core/src/index/thirdparty/faiss/impl/ThreadedIndex-inl.h new file mode 100644 index 0000000000..de549a0288 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ThreadedIndex-inl.h @@ -0,0 +1,192 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
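The range statistics declared above are plain public fields, so switching away from the RS_minmax default is a two-line change. A sketch (illustrative; the 1% quantile value is an arbitrary choice):

    #include <faiss/impl/ScalarQuantizer.h>

    // Train the range on quantiles instead of min/max so that a few outliers
    // do not stretch [vmin, vmax] and waste quantization resolution.
    faiss::ScalarQuantizer make_robust_sq(size_t d) {
        faiss::ScalarQuantizer sq(d, faiss::ScalarQuantizer::QT_4bit_uniform);
        sq.rangestat = faiss::ScalarQuantizer::RS_quantiles;
        sq.rangestat_arg = 0.01f;      // use [Q(0.01), Q(0.99)] instead of [min, max]
        return sq;
    }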
+ */ + +#include +#include +#include + +namespace faiss { + +template +ThreadedIndex::ThreadedIndex(bool threaded) + // 0 is default dimension + : ThreadedIndex(0, threaded) { +} + +template +ThreadedIndex::ThreadedIndex(int d, bool threaded) + : IndexT(d), + own_fields(false), + isThreaded_(threaded) { + } + +template +ThreadedIndex::~ThreadedIndex() { + for (auto& p : indices_) { + if (isThreaded_) { + // should have worker thread + FAISS_ASSERT((bool) p.second); + + // This will also flush all pending work + p.second->stop(); + p.second->waitForThreadExit(); + } else { + // should not have worker thread + FAISS_ASSERT(!(bool) p.second); + } + + if (own_fields) { + delete p.first; + } + } +} + +template +void ThreadedIndex::addIndex(IndexT* index) { + // We inherit the dimension from the first index added to us if we don't have + // a set dimension + if (indices_.empty() && this->d == 0) { + this->d = index->d; + } + + // The new index must match our set dimension + FAISS_THROW_IF_NOT_FMT(this->d == index->d, + "addIndex: dimension mismatch for " + "newly added index; expecting dim %d, " + "new index has dim %d", + this->d, index->d); + + if (!indices_.empty()) { + auto& existing = indices_.front().first; + + FAISS_THROW_IF_NOT_MSG(index->metric_type == existing->metric_type, + "addIndex: newly added index is " + "of different metric type than old index"); + + // Make sure this index is not duplicated + for (auto& p : indices_) { + FAISS_THROW_IF_NOT_MSG(p.first != index, + "addIndex: attempting to add index " + "that is already in the collection"); + } + } + + indices_.emplace_back( + std::make_pair( + index, + std::unique_ptr(isThreaded_ ? + new WorkerThread : nullptr))); + + onAfterAddIndex(index); +} + +template +void ThreadedIndex::removeIndex(IndexT* index) { + for (auto it = indices_.begin(); it != indices_.end(); ++it) { + if (it->first == index) { + // This is our index; stop the worker thread before removing it, + // to ensure that it has finished before function exit + if (isThreaded_) { + // should have worker thread + FAISS_ASSERT((bool) it->second); + it->second->stop(); + it->second->waitForThreadExit(); + } else { + // should not have worker thread + FAISS_ASSERT(!(bool) it->second); + } + + indices_.erase(it); + onAfterRemoveIndex(index); + + if (own_fields) { + delete index; + } + + return; + } + } + + // could not find our index + FAISS_THROW_MSG("IndexReplicas::removeIndex: index not found"); +} + +template +void ThreadedIndex::runOnIndex(std::function f) { + if (isThreaded_) { + std::vector> v; + + for (int i = 0; i < this->indices_.size(); ++i) { + auto& p = this->indices_[i]; + auto indexPtr = p.first; + v.emplace_back(p.second->add([f, i, indexPtr](){ f(i, indexPtr); })); + } + + waitAndHandleFutures(v); + } else { + // Multiple exceptions may be thrown; gather them as we encounter them, + // while letting everything else run to completion + std::vector> exceptions; + + for (int i = 0; i < this->indices_.size(); ++i) { + auto& p = this->indices_[i]; + try { + f(i, p.first); + } catch (...) 
{ + exceptions.emplace_back(std::make_pair(i, std::current_exception())); + } + } + + handleExceptions(exceptions); + } +} + +template +void ThreadedIndex::runOnIndex( + std::function f) const { + const_cast*>(this)->runOnIndex( + [f](int i, IndexT* idx){ f(i, idx); }); +} + +template +void ThreadedIndex::reset() { + runOnIndex([](int, IndexT* index){ index->reset(); }); + this->ntotal = 0; + this->is_trained = false; +} + +template +void +ThreadedIndex::onAfterAddIndex(IndexT* index) { +} + +template +void +ThreadedIndex::onAfterRemoveIndex(IndexT* index) { +} + +template +void +ThreadedIndex::waitAndHandleFutures(std::vector>& v) { + // Blocking wait for completion for all of the indices, capturing any + // exceptions that are generated + std::vector> exceptions; + + for (int i = 0; i < v.size(); ++i) { + auto& fut = v[i]; + + try { + fut.get(); + } catch (...) { + exceptions.emplace_back(std::make_pair(i, std::current_exception())); + } + } + + handleExceptions(exceptions); +} + +} // namespace diff --git a/core/src/index/thirdparty/faiss/impl/ThreadedIndex.h b/core/src/index/thirdparty/faiss/impl/ThreadedIndex.h new file mode 100644 index 0000000000..89f21486a6 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ThreadedIndex.h @@ -0,0 +1,80 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { + +/// A holder of indices in a collection of threads +/// The interface to this class itself is not thread safe +template +class ThreadedIndex : public IndexT { + public: + explicit ThreadedIndex(bool threaded); + explicit ThreadedIndex(int d, bool threaded); + + ~ThreadedIndex() override; + + /// override an index that is managed by ourselves. + /// WARNING: once an index is added, it becomes unsafe to touch it from any + /// other thread than that on which is managing it, until we are shut + /// down. Use runOnIndex to perform work on it instead. + void addIndex(IndexT* index); + + /// Remove an index that is managed by ourselves. + /// This will flush all pending work on that index, and then shut + /// down its managing thread, and will remove the index. + void removeIndex(IndexT* index); + + /// Run a function on all indices, in the thread that the index is + /// managed in. + /// Function arguments are (index in collection, index pointer) + void runOnIndex(std::function f); + void runOnIndex(std::function f) const; + + /// faiss::Index API + /// All indices receive the same call + void reset() override; + + /// Returns the number of sub-indices + int count() const { return indices_.size(); } + + /// Returns the i-th sub-index + IndexT* at(int i) { return indices_[i].first; } + + /// Returns the i-th sub-index (const version) + const IndexT* at(int i) const { return indices_[i].first; } + + /// Whether or not we are responsible for deleting our contained indices + bool own_fields; + + protected: + /// Called just after an index is added + virtual void onAfterAddIndex(IndexT* index); + + /// Called just after an index is removed + virtual void onAfterRemoveIndex(IndexT* index); + +protected: + static void waitAndHandleFutures(std::vector>& v); + + /// Collection of Index instances, with their managing worker thread if any + std::vector>> indices_; + + /// Is this index multi-threaded? 
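For context, a sketch of how the runOnIndex machinery is typically driven. IndexReplicas, the concrete subclass named in the removeIndex error message above, is assumed here, and the header paths and defaults are illustrative rather than taken from this patch.

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexReplicas.h>

    // ThreadedIndex keeps one worker thread per sub-index; runOnIndex() executes
    // the callback for every sub-index on the thread that manages it.
    void replicas_sketch() {
        faiss::IndexFlatL2 a(64), b(64);
        faiss::IndexReplicas replicas;      // assumed threaded by default
        replicas.addIndex(&a);              // dimension is inherited from the first index
        replicas.addIndex(&b);
        replicas.runOnIndex([](int /*i*/, faiss::Index* idx) {
            idx->reset();                   // runs on the worker thread owning this sub-index
        });
    }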
+ bool isThreaded_; +}; + +} // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/impl/index_read.cpp b/core/src/index/thirdparty/faiss/impl/index_read.cpp new file mode 100644 index 0000000000..8e977a323c --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/index_read.cpp @@ -0,0 +1,819 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + +namespace faiss { + +/************************************************************* + * I/O macros + * + * we use macros so that we have a line number to report in abort + * (). This makes debugging a lot easier. The IOReader or IOWriter is + * always called f and thus is not passed in as a macro parameter. + **************************************************************/ + + +#define READANDCHECK(ptr, n) { \ + size_t ret = (*f)(ptr, sizeof(*(ptr)), n); \ + FAISS_THROW_IF_NOT_FMT(ret == (n), \ + "read error in %s: %ld != %ld (%s)", \ + f->name.c_str(), ret, size_t(n), strerror(errno)); \ + } + +#define READ1(x) READANDCHECK(&(x), 1) + +// will fail if we write 256G of data at once... +#define READVECTOR(vec) { \ + long size; \ + READANDCHECK (&size, 1); \ + FAISS_THROW_IF_NOT (size >= 0 && size < (1L << 40)); \ + (vec).resize (size); \ + READANDCHECK ((vec).data (), size); \ + } + + + +/************************************************************* + * Read + **************************************************************/ + +static void read_index_header (Index *idx, IOReader *f) { + READ1 (idx->d); + READ1 (idx->ntotal); + Index::idx_t dummy; + READ1 (dummy); + READ1 (dummy); + READ1 (idx->is_trained); + READ1 (idx->metric_type); + if (idx->metric_type > 1) { + READ1 (idx->metric_arg); + } + idx->verbose = false; +} + +VectorTransform* read_VectorTransform (IOReader *f) { + uint32_t h; + READ1 (h); + VectorTransform *vt = nullptr; + + if (h == fourcc ("rrot") || h == fourcc ("PCAm") || + h == fourcc ("LTra") || h == fourcc ("PcAm") || + h == fourcc ("Viqm")) { + LinearTransform *lt = nullptr; + if (h == fourcc ("rrot")) { + lt = new RandomRotationMatrix (); + } else if (h == fourcc ("PCAm") || + h == fourcc ("PcAm")) { + PCAMatrix * pca = new PCAMatrix (); + READ1 (pca->eigen_power); + READ1 (pca->random_rotation); + if (h == fourcc ("PcAm")) + READ1 (pca->balanced_bins); + READVECTOR (pca->mean); + READVECTOR (pca->eigenvalues); + READVECTOR (pca->PCAMat); + lt = pca; + } else if (h == fourcc ("Viqm")) { + ITQMatrix *itqm = new ITQMatrix (); + READ1 (itqm->max_iter); + READ1 (itqm->seed); + lt = itqm; + } else if (h == fourcc ("LTra")) { + lt = new LinearTransform (); + } + READ1 (lt->have_bias); + READVECTOR (lt->A); + READVECTOR (lt->b); + FAISS_THROW_IF_NOT (lt->A.size() >= lt->d_in * lt->d_out); + FAISS_THROW_IF_NOT (!lt->have_bias || lt->b.size() >= lt->d_out); + lt->set_is_orthonormal(); + vt = lt; + } else if (h == fourcc ("RmDT")) { + RemapDimensionsTransform *rdt = new RemapDimensionsTransform (); + READVECTOR (rdt->map); + vt = rdt; + } else if (h == fourcc ("VNrm")) { + NormalizationTransform *nt = new NormalizationTransform (); + READ1 (nt->norm); 
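For readers unfamiliar with the fourcc tags used throughout these readers: every serialized object begins with four ASCII characters packed into a little-endian uint32, and the functions above switch on that value. A local stand-in (faiss ships its own fourcc() helper in the I/O utilities):

    #include <cstdint>

    // Pack four characters into the 32-bit tag that prefixes each object.
    inline uint32_t fourcc_example(const char s[4]) {
        return uint32_t(uint8_t(s[0]))         | (uint32_t(uint8_t(s[1])) << 8) |
               (uint32_t(uint8_t(s[2])) << 16) | (uint32_t(uint8_t(s[3])) << 24);
    }
    // e.g. fourcc_example("IwFl") is the tag read_index() maps to IndexIVFFlat.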
+ vt = nt; + } else if (h == fourcc ("VCnt")) { + CenteringTransform *ct = new CenteringTransform (); + READVECTOR (ct->mean); + vt = ct; + } else if (h == fourcc ("Viqt")) { + ITQTransform *itqt = new ITQTransform (); + + READVECTOR (itqt->mean); + READ1 (itqt->do_pca); + { + ITQMatrix *itqm = dynamic_cast + (read_VectorTransform (f)); + FAISS_THROW_IF_NOT(itqm); + itqt->itq = *itqm; + delete itqm; + } + { + LinearTransform *pi = dynamic_cast + (read_VectorTransform (f)); + FAISS_THROW_IF_NOT (pi); + itqt->pca_then_itq = *pi; + delete pi; + } + vt = itqt; + } else { + FAISS_THROW_MSG("fourcc not recognized"); + } + READ1 (vt->d_in); + READ1 (vt->d_out); + READ1 (vt->is_trained); + return vt; +} + + +static void read_ArrayInvertedLists_sizes ( + IOReader *f, std::vector & sizes) +{ + uint32_t list_type; + READ1(list_type); + if (list_type == fourcc("full")) { + size_t os = sizes.size(); + READVECTOR (sizes); + FAISS_THROW_IF_NOT (os == sizes.size()); + } else if (list_type == fourcc("sprs")) { + std::vector idsizes; + READVECTOR (idsizes); + for (size_t j = 0; j < idsizes.size(); j += 2) { + FAISS_THROW_IF_NOT (idsizes[j] < sizes.size()); + sizes[idsizes[j]] = idsizes[j + 1]; + } + } else { + FAISS_THROW_MSG ("invalid list_type"); + } +} + +InvertedLists *read_InvertedLists (IOReader *f, int io_flags) { + uint32_t h; + READ1 (h); + if (h == fourcc ("il00")) { + fprintf(stderr, "read_InvertedLists:" + " WARN! inverted lists not stored with IVF object\n"); + return nullptr; + } else if (h == fourcc ("iloa") && !(io_flags & IO_FLAG_MMAP)) { + size_t nlist; + size_t code_size; + std::vector list_length; + READ1(nlist); + READ1(code_size); + READVECTOR(list_length); + auto ails = new ReadOnlyArrayInvertedLists(nlist, code_size, list_length); + size_t n; + READ1(n); +// ails->readonly_ids.resize(n); +// ails->readonly_codes.resize(n*code_size); + ails->pin_readonly_ids = std::make_shared(n * sizeof(InvertedLists::idx_t)); + ails->pin_readonly_codes = std::make_shared(n * code_size * sizeof(uint8_t)); + READANDCHECK((InvertedLists::idx_t *) ails->pin_readonly_ids->data, n); + READANDCHECK((uint8_t *) ails->pin_readonly_codes->data, n * code_size); + return ails; + } else if (h == fourcc ("ilar") && !(io_flags & IO_FLAG_MMAP)) { + auto ails = new ArrayInvertedLists (0, 0); + READ1 (ails->nlist); + READ1 (ails->code_size); + ails->ids.resize (ails->nlist); + ails->codes.resize (ails->nlist); + std::vector sizes (ails->nlist); + read_ArrayInvertedLists_sizes (f, sizes); + for (size_t i = 0; i < ails->nlist; i++) { + ails->ids[i].resize (sizes[i]); + ails->codes[i].resize (sizes[i] * ails->code_size); + } + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + READANDCHECK (ails->codes[i].data(), n * ails->code_size); + READANDCHECK (ails->ids[i].data(), n); + } + } + return ails; + } else if (h == fourcc ("ilar") && (io_flags & IO_FLAG_MMAP)) { + // then we load it as an OnDiskInvertedLists + + FileIOReader *reader = dynamic_cast(f); + FAISS_THROW_IF_NOT_MSG(reader, "mmap only supported for File objects"); + FILE *fdesc = reader->f; + + auto ails = new OnDiskInvertedLists (); + READ1 (ails->nlist); + READ1 (ails->code_size); + ails->read_only = true; + ails->lists.resize (ails->nlist); + std::vector sizes (ails->nlist); + read_ArrayInvertedLists_sizes (f, sizes); + size_t o0 = ftell(fdesc), o = o0; + { // do the mmap + struct stat buf; + int ret = fstat (fileno(fdesc), &buf); + FAISS_THROW_IF_NOT_FMT (ret == 0, + "fstat failed: %s", strerror(errno)); + 
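The "sprs" branch above stores only the non-empty lists as (list id, size) pairs; a small decoded equivalent of that layout, for illustration:

    #include <cstddef>
    #include <vector>

    // idsizes = {id0, size0, id1, size1, ...}; every list not mentioned stays empty.
    std::vector<size_t> decode_sparse_sizes(size_t nlist,
                                            const std::vector<size_t>& idsizes) {
        std::vector<size_t> sizes(nlist, 0);
        for (size_t j = 0; j + 1 < idsizes.size(); j += 2) {
            sizes[idsizes[j]] = idsizes[j + 1];
        }
        return sizes;
    }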
ails->totsize = buf.st_size; + ails->ptr = (uint8_t*)mmap (nullptr, ails->totsize, + PROT_READ, MAP_SHARED, + fileno(fdesc), 0); + FAISS_THROW_IF_NOT_FMT (ails->ptr != MAP_FAILED, + "could not mmap: %s", + strerror(errno)); + } + + for (size_t i = 0; i < ails->nlist; i++) { + OnDiskInvertedLists::List & l = ails->lists[i]; + l.size = l.capacity = sizes[i]; + l.offset = o; + o += l.size * (sizeof(OnDiskInvertedLists::idx_t) + + ails->code_size); + } + FAISS_THROW_IF_NOT(o <= ails->totsize); + // resume normal reading of file + fseek (fdesc, o, SEEK_SET); + return ails; + } else if (h == fourcc ("ilod")) { + OnDiskInvertedLists *od = new OnDiskInvertedLists(); + od->read_only = io_flags & IO_FLAG_READ_ONLY; + READ1 (od->nlist); + READ1 (od->code_size); + // this is a POD object + READVECTOR (od->lists); + { + std::vector v; + READVECTOR(v); + od->slots.assign(v.begin(), v.end()); + } + { + std::vector x; + READVECTOR(x); + od->filename.assign(x.begin(), x.end()); + + if (io_flags & IO_FLAG_ONDISK_SAME_DIR) { + FileIOReader *reader = dynamic_cast(f); + FAISS_THROW_IF_NOT_MSG ( + reader, "IO_FLAG_ONDISK_SAME_DIR only supported " + "when reading from file"); + std::string indexname = reader->name; + std::string dirname = "./"; + size_t slash = indexname.find_last_of('/'); + if (slash != std::string::npos) { + dirname = indexname.substr(0, slash + 1); + } + std::string filename = od->filename; + slash = filename.find_last_of('/'); + if (slash != std::string::npos) { + filename = filename.substr(slash + 1); + } + filename = dirname + filename; + printf("IO_FLAG_ONDISK_SAME_DIR: " + "updating ondisk filename from %s to %s\n", + od->filename.c_str(), filename.c_str()); + od->filename = filename; + } + + } + READ1(od->totsize); + od->do_mmap(); + return od; + } else { + FAISS_THROW_MSG ("read_InvertedLists: unsupported invlist type"); + } +} + +static void read_InvertedLists ( + IndexIVF *ivf, IOReader *f, int io_flags) { + InvertedLists *ils = read_InvertedLists (f, io_flags); + FAISS_THROW_IF_NOT (!ils || (ils->nlist == ivf->nlist && + ils->code_size == ivf->code_size)); + ivf->invlists = ils; + ivf->own_invlists = true; +} + +static void read_ProductQuantizer (ProductQuantizer *pq, IOReader *f) { + READ1 (pq->d); + READ1 (pq->M); + READ1 (pq->nbits); + pq->set_derived_values (); + READVECTOR (pq->centroids); +} + +static void read_ScalarQuantizer (ScalarQuantizer *ivsc, IOReader *f) { + READ1 (ivsc->qtype); + READ1 (ivsc->rangestat); + READ1 (ivsc->rangestat_arg); + READ1 (ivsc->d); + READ1 (ivsc->code_size); + READVECTOR (ivsc->trained); +} + + +static void read_HNSW (HNSW *hnsw, IOReader *f) { + READVECTOR (hnsw->assign_probas); + READVECTOR (hnsw->cum_nneighbor_per_level); + READVECTOR (hnsw->levels); + READVECTOR (hnsw->offsets); + READVECTOR (hnsw->neighbors); + + READ1 (hnsw->entry_point); + READ1 (hnsw->max_level); + READ1 (hnsw->efConstruction); + READ1 (hnsw->efSearch); + READ1 (hnsw->upper_beam); +} + +ProductQuantizer * read_ProductQuantizer (const char*fname) { + FileIOReader reader(fname); + return read_ProductQuantizer(&reader); +} + +ProductQuantizer * read_ProductQuantizer (IOReader *reader) { + ProductQuantizer *pq = new ProductQuantizer(); + ScopeDeleter1 del (pq); + + read_ProductQuantizer(pq, reader); + del.release (); + return pq; +} + +static void read_ivf_header ( + IndexIVF *ivf, IOReader *f, + std::vector > *ids = nullptr) +{ + read_index_header (ivf, f); + READ1 (ivf->nlist); + READ1 (ivf->nprobe); + ivf->quantizer = read_index (f); + ivf->own_fields = true; + if (ids) { 
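            // In the legacy formats the per-list ids live in the IVF header itself;
            // newer formats pass ids == nullptr and obtain both ids and codes later
            // from the InvertedLists object loaded by read_InvertedLists.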
// used in legacy "Iv" formats + ids->resize (ivf->nlist); + for (size_t i = 0; i < ivf->nlist; i++) + READVECTOR ((*ids)[i]); + } + READ1 (ivf->maintain_direct_map); + READVECTOR (ivf->direct_map); +} + +// used for legacy formats +static ArrayInvertedLists *set_array_invlist( + IndexIVF *ivf, std::vector > &ids) +{ + ArrayInvertedLists *ail = new ArrayInvertedLists ( + ivf->nlist, ivf->code_size); + std::swap (ail->ids, ids); + ivf->invlists = ail; + ivf->own_invlists = true; + return ail; +} + +static IndexIVFPQ *read_ivfpq (IOReader *f, uint32_t h, int io_flags) +{ + bool legacy = h == fourcc ("IvQR") || h == fourcc ("IvPQ"); + + IndexIVFPQR *ivfpqr = + h == fourcc ("IvQR") || h == fourcc ("IwQR") ? + new IndexIVFPQR () : nullptr; + IndexIVFPQ * ivpq = ivfpqr ? ivfpqr : new IndexIVFPQ (); + + std::vector > ids; + read_ivf_header (ivpq, f, legacy ? &ids : nullptr); + READ1 (ivpq->by_residual); + READ1 (ivpq->code_size); + read_ProductQuantizer (&ivpq->pq, f); + + if (legacy) { + ArrayInvertedLists *ail = set_array_invlist (ivpq, ids); + for (size_t i = 0; i < ail->nlist; i++) + READVECTOR (ail->codes[i]); + } else { + read_InvertedLists (ivpq, f, io_flags); + } + + if (ivpq->is_trained) { + // precomputed table not stored. It is cheaper to recompute it + ivpq->use_precomputed_table = 0; + if (ivpq->by_residual) + ivpq->precompute_table (); + if (ivfpqr) { + read_ProductQuantizer (&ivfpqr->refine_pq, f); + READVECTOR (ivfpqr->refine_codes); + READ1 (ivfpqr->k_factor); + } + } + return ivpq; +} + +int read_old_fmt_hack = 0; + +Index *read_index (IOReader *f, int io_flags) { + Index * idx = nullptr; + uint32_t h; + READ1 (h); + if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) { + IndexFlat *idxf; + if (h == fourcc ("IxFI")) idxf = new IndexFlatIP (); + else idxf = new IndexFlatL2 (); + read_index_header (idxf, f); + READVECTOR (idxf->xb); + FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->d); + // leak! + idx = idxf; + } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) { + IndexLSH * idxl = new IndexLSH (); + read_index_header (idxl, f); + READ1 (idxl->nbits); + READ1 (idxl->rotate_data); + READ1 (idxl->train_thresholds); + READVECTOR (idxl->thresholds); + READ1 (idxl->bytes_per_vec); + if (h == fourcc("IxHE")) { + FAISS_THROW_IF_NOT_FMT (idxl->nbits % 64 == 0, + "can only read old format IndexLSH with " + "nbits multiple of 64 (got %d)", + (int) idxl->nbits); + // leak + idxl->bytes_per_vec *= 8; + } + { + RandomRotationMatrix *rrot = dynamic_cast + (read_VectorTransform (f)); + FAISS_THROW_IF_NOT_MSG(rrot, "expected a random rotation"); + idxl->rrot = *rrot; + delete rrot; + } + READVECTOR (idxl->codes); + FAISS_THROW_IF_NOT (idxl->rrot.d_in == idxl->d && + idxl->rrot.d_out == idxl->nbits); + FAISS_THROW_IF_NOT ( + idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec); + idx = idxl; + } else if (h == fourcc ("IxPQ") || h == fourcc ("IxPo") || + h == fourcc ("IxPq")) { + // IxPQ and IxPo were merged into the same IndexPQ object + IndexPQ * idxp =new IndexPQ (); + read_index_header (idxp, f); + read_ProductQuantizer (&idxp->pq, f); + READVECTOR (idxp->codes); + if (h == fourcc ("IxPo") || h == fourcc ("IxPq")) { + READ1 (idxp->search_type); + READ1 (idxp->encode_signs); + READ1 (idxp->polysemous_ht); + } + // Old versoins of PQ all had metric_type set to INNER_PRODUCT + // when they were in fact using L2. 
Therefore, we force metric type + // to L2 when the old format is detected + if (h == fourcc ("IxPQ") || h == fourcc ("IxPo")) { + idxp->metric_type = METRIC_L2; + } + idx = idxp; + } else if (h == fourcc ("IvFl") || h == fourcc("IvFL")) { // legacy + IndexIVFFlat * ivfl = new IndexIVFFlat (); + std::vector > ids; + read_ivf_header (ivfl, f, &ids); + ivfl->code_size = ivfl->d * sizeof(float); + ArrayInvertedLists *ail = set_array_invlist (ivfl, ids); + + if (h == fourcc ("IvFL")) { + for (size_t i = 0; i < ivfl->nlist; i++) { + READVECTOR (ail->codes[i]); + } + } else { // old format + for (size_t i = 0; i < ivfl->nlist; i++) { + std::vector vec; + READVECTOR (vec); + ail->codes[i].resize(vec.size() * sizeof(float)); + memcpy(ail->codes[i].data(), vec.data(), + ail->codes[i].size()); + } + } + idx = ivfl; + } else if (h == fourcc ("IwFd")) { + IndexIVFFlatDedup * ivfl = new IndexIVFFlatDedup (); + read_ivf_header (ivfl, f); + ivfl->code_size = ivfl->d * sizeof(float); + { + std::vector tab; + READVECTOR (tab); + for (long i = 0; i < tab.size(); i += 2) { + std::pair + pair (tab[i], tab[i + 1]); + ivfl->instances.insert (pair); + } + } + read_InvertedLists (ivfl, f, io_flags); + idx = ivfl; + } else if (h == fourcc ("IwFl")) { + IndexIVFFlat * ivfl = new IndexIVFFlat (); + read_ivf_header (ivfl, f); + ivfl->code_size = ivfl->d * sizeof(float); + read_InvertedLists (ivfl, f, io_flags); + idx = ivfl; + } else if (h == fourcc ("IxSQ")) { + IndexScalarQuantizer * idxs = new IndexScalarQuantizer (); + read_index_header (idxs, f); + read_ScalarQuantizer (&idxs->sq, f); + READVECTOR (idxs->codes); + idxs->code_size = idxs->sq.code_size; + idx = idxs; + } else if (h == fourcc ("IxLa")) { + int d, nsq, scale_nbit, r2; + READ1 (d); + READ1 (nsq); + READ1 (scale_nbit); + READ1 (r2); + IndexLattice *idxl = new IndexLattice (d, nsq, scale_nbit, r2); + read_index_header (idxl, f); + READVECTOR (idxl->trained); + idx = idxl; + } else if(h == fourcc ("IvSQ")) { // legacy + IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer(); + std::vector > ids; + read_ivf_header (ivsc, f, &ids); + read_ScalarQuantizer (&ivsc->sq, f); + READ1 (ivsc->code_size); + ArrayInvertedLists *ail = set_array_invlist (ivsc, ids); + for(int i = 0; i < ivsc->nlist; i++) + READVECTOR (ail->codes[i]); + idx = ivsc; + } else if(h == fourcc ("IwSQ") || h == fourcc ("IwSq")) { + IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer(); + read_ivf_header (ivsc, f); + read_ScalarQuantizer (&ivsc->sq, f); + READ1 (ivsc->code_size); + if (h == fourcc ("IwSQ")) { + ivsc->by_residual = true; + } else { + READ1 (ivsc->by_residual); + } + read_InvertedLists (ivsc, f, io_flags); + idx = ivsc; + } else if (h == fourcc("ISqH")) { + IndexIVFSQHybrid *ivfsqhbyrid = new IndexIVFSQHybrid(); + read_ivf_header(ivfsqhbyrid, f); + read_ScalarQuantizer(&ivfsqhbyrid->sq, f); + READ1 (ivfsqhbyrid->code_size); + READ1 (ivfsqhbyrid->by_residual); + read_InvertedLists(ivfsqhbyrid, f, io_flags); + idx = ivfsqhbyrid; + } else if(h == fourcc ("IwSh")) { + IndexIVFSpectralHash *ivsp = new IndexIVFSpectralHash (); + read_ivf_header (ivsp, f); + ivsp->vt = read_VectorTransform (f); + ivsp->own_fields = true; + READ1 (ivsp->nbit); + // not stored by write_ivf_header + ivsp->code_size = (ivsp->nbit + 7) / 8; + READ1 (ivsp->period); + READ1 (ivsp->threshold_type); + READVECTOR (ivsp->trained); + read_InvertedLists (ivsp, f, io_flags); + idx = ivsp; + } else if(h == fourcc ("IvPQ") || h == fourcc ("IvQR") || + h == fourcc ("IwPQ") || h == fourcc ("IwQR")) { 
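        // read_ivfpq() covers both generations of the format: the legacy "Iv.."
        // fourccs keep per-list ids and codes inline in the header, while the
        // "Iw.." fourccs go through read_InvertedLists(); the "..QR" variants
        // additionally carry the IndexIVFPQR refinement quantizer and codes.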
+ + idx = read_ivfpq (f, h, io_flags); + + } else if(h == fourcc ("IxPT")) { + IndexPreTransform * ixpt = new IndexPreTransform(); + ixpt->own_fields = true; + read_index_header (ixpt, f); + int nt; + if (read_old_fmt_hack == 2) { + nt = 1; + } else { + READ1 (nt); + } + for (int i = 0; i < nt; i++) { + ixpt->chain.push_back (read_VectorTransform (f)); + } + ixpt->index = read_index (f, io_flags); + idx = ixpt; + } else if(h == fourcc ("Imiq")) { + MultiIndexQuantizer * imiq = new MultiIndexQuantizer (); + read_index_header (imiq, f); + read_ProductQuantizer (&imiq->pq, f); + idx = imiq; + } else if(h == fourcc ("IxRF")) { + IndexRefineFlat *idxrf = new IndexRefineFlat (); + read_index_header (idxrf, f); + idxrf->base_index = read_index(f, io_flags); + idxrf->own_fields = true; + IndexFlat *rf = dynamic_cast (read_index (f, io_flags)); + std::swap (*rf, idxrf->refine_index); + delete rf; + READ1 (idxrf->k_factor); + idx = idxrf; + } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) { + bool is_map2 = h == fourcc ("IxM2"); + IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap (); + read_index_header (idxmap, f); + idxmap->index = read_index (f, io_flags); + idxmap->own_fields = true; + READVECTOR (idxmap->id_map); + if (is_map2) { + static_cast(idxmap)->construct_rev_map (); + } + idx = idxmap; + } else if (h == fourcc ("Ix2L")) { + Index2Layer * idxp = new Index2Layer (); + read_index_header (idxp, f); + idxp->q1.quantizer = read_index (f, io_flags); + READ1 (idxp->q1.nlist); + READ1 (idxp->q1.quantizer_trains_alone); + read_ProductQuantizer (&idxp->pq, f); + READ1 (idxp->code_size_1); + READ1 (idxp->code_size_2); + READ1 (idxp->code_size); + READVECTOR (idxp->codes); + idx = idxp; + } else if(h == fourcc("IHNf") || h == fourcc("IHNp") || + h == fourcc("IHNs") || h == fourcc("IHN2")) { + IndexHNSW *idxhnsw = nullptr; + if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat (); + if (h == fourcc("IHNp")) idxhnsw = new IndexHNSWPQ (); + if (h == fourcc("IHNs")) idxhnsw = new IndexHNSWSQ (); + if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level (); + read_index_header (idxhnsw, f); + read_HNSW (&idxhnsw->hnsw, f); + idxhnsw->storage = read_index (f, io_flags); + idxhnsw->own_fields = true; + if (h == fourcc("IHNp")) { + dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table (); + } + idx = idxhnsw; + } else { + FAISS_THROW_FMT("Index type 0x%08x not supported\n", h); + idx = nullptr; + } + return idx; +} + + +Index *read_index (FILE * f, int io_flags) { + FileIOReader reader(f); + return read_index(&reader, io_flags); +} + +Index *read_index (const char *fname, int io_flags) { + FileIOReader reader(fname); + Index *idx = read_index (&reader, io_flags); + return idx; +} + +VectorTransform *read_VectorTransform (const char *fname) { + FileIOReader reader(fname); + VectorTransform *vt = read_VectorTransform (&reader); + return vt; +} + + + +/************************************************************* + * Read binary indexes + **************************************************************/ + +static void read_InvertedLists ( + IndexBinaryIVF *ivf, IOReader *f, int io_flags) { + InvertedLists *ils = read_InvertedLists (f, io_flags); + FAISS_THROW_IF_NOT (!ils || (ils->nlist == ivf->nlist && + ils->code_size == ivf->code_size)); + ivf->invlists = ils; + ivf->own_invlists = true; +} + + + +static void read_index_binary_header (IndexBinary *idx, IOReader *f) { + READ1 (idx->d); + READ1 (idx->code_size); + READ1 (idx->ntotal); + READ1 (idx->is_trained); + READ1 (idx->metric_type); 
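A typical consumer of the readers above, shown as a sketch (file name handling, flag choice and error handling are illustrative): load an index from disk and recover the concrete type produced by the fourcc dispatch.

    #include <faiss/IndexIVFFlat.h>
    #include <faiss/index_io.h>

    // Deserialize an index and check that the stored type is what we expect;
    // IO_FLAG_MMAP asks read_InvertedLists() to mmap the lists instead of
    // loading them into memory.
    faiss::IndexIVFFlat* load_ivf_flat(const char* fname) {
        faiss::Index* idx = faiss::read_index(fname, faiss::IO_FLAG_MMAP);
        auto* ivf = dynamic_cast<faiss::IndexIVFFlat*>(idx);
        if (!ivf) {
            delete idx;                    // a different index type was stored
        }
        return ivf;
    }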
+ idx->verbose = false; +} + +static void read_binary_ivf_header ( + IndexBinaryIVF *ivf, IOReader *f, + std::vector > *ids = nullptr) +{ + read_index_binary_header (ivf, f); + READ1 (ivf->nlist); + READ1 (ivf->nprobe); + ivf->quantizer = read_index_binary (f); + ivf->own_fields = true; + if (ids) { // used in legacy "Iv" formats + ids->resize (ivf->nlist); + for (size_t i = 0; i < ivf->nlist; i++) + READVECTOR ((*ids)[i]); + } + READ1 (ivf->maintain_direct_map); + READVECTOR (ivf->direct_map); +} + +IndexBinary *read_index_binary (IOReader *f, int io_flags) { + IndexBinary * idx = nullptr; + uint32_t h; + READ1 (h); + if (h == fourcc ("IBxF")) { + IndexBinaryFlat *idxf = new IndexBinaryFlat (); + read_index_binary_header (idxf, f); + READVECTOR (idxf->xb); + FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->code_size); + // leak! + idx = idxf; + } else if (h == fourcc ("IBwF")) { + IndexBinaryIVF *ivf = new IndexBinaryIVF (); + read_binary_ivf_header (ivf, f); + read_InvertedLists (ivf, f, io_flags); + idx = ivf; + } else if (h == fourcc ("IBFf")) { + IndexBinaryFromFloat *idxff = new IndexBinaryFromFloat (); + read_index_binary_header (idxff, f); + idxff->own_fields = true; + idxff->index = read_index (f, io_flags); + idx = idxff; + } else if (h == fourcc ("IBHf")) { + IndexBinaryHNSW *idxhnsw = new IndexBinaryHNSW (); + read_index_binary_header (idxhnsw, f); + read_HNSW (&idxhnsw->hnsw, f); + idxhnsw->storage = read_index_binary (f, io_flags); + idxhnsw->own_fields = true; + idx = idxhnsw; + } else if(h == fourcc ("IBMp") || h == fourcc ("IBM2")) { + bool is_map2 = h == fourcc ("IBM2"); + IndexBinaryIDMap * idxmap = is_map2 ? + new IndexBinaryIDMap2 () : new IndexBinaryIDMap (); + read_index_binary_header (idxmap, f); + idxmap->index = read_index_binary (f, io_flags); + idxmap->own_fields = true; + READVECTOR (idxmap->id_map); + if (is_map2) { + static_cast(idxmap)->construct_rev_map (); + } + idx = idxmap; + } else { + FAISS_THROW_FMT("Index type 0x%08x not supported\n", h); + idx = nullptr; + } + return idx; +} + +IndexBinary *read_index_binary (FILE * f, int io_flags) { + FileIOReader reader(f); + return read_index_binary(&reader, io_flags); +} + +IndexBinary *read_index_binary (const char *fname, int io_flags) { + FileIOReader reader(fname); + IndexBinary *idx = read_index_binary (&reader, io_flags); + return idx; +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/index_write.cpp b/core/src/index/thirdparty/faiss/impl/index_write.cpp new file mode 100644 index 0000000000..10b1d07c33 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/index_write.cpp @@ -0,0 +1,581 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + +/************************************************************* + * The I/O format is the content of the class. For objects that are + * inherited, like Index, a 4-character-code (fourcc) indicates which + * child class this is an instance of. + * + * In this case, the fields of the parent class are written first, + * then the ones for the child classes. 
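+ * (For example, an IndexIVFFlat is stored as its fourcc "IwFl", followed by the common Index header,
+ * the IVF-level fields including the coarse quantizer, and finally the inverted lists.)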
Note that this requires + * classes to be serialized to have a constructor without parameters, + * so that the fields can be filled in later. The default constructor + * should set reasonable defaults for all fields. + * + * The fourccs are assigned arbitrarily. When the class changed (added + * or deprecated fields), the fourcc can be replaced. New code should + * be able to read the old fourcc and fill in new classes. + * + * TODO: serialization to strings for use in Python pickle or Torch + * serialization. + * + * TODO: in this file, the read functions that encouter errors may + * leak memory. + **************************************************************/ + + + +namespace faiss { + + +/************************************************************* + * I/O macros + * + * we use macros so that we have a line number to report in abort + * (). This makes debugging a lot easier. The IOReader or IOWriter is + * always called f and thus is not passed in as a macro parameter. + **************************************************************/ + + +#define WRITEANDCHECK(ptr, n) { \ + size_t ret = (*f)(ptr, sizeof(*(ptr)), n); \ + FAISS_THROW_IF_NOT_FMT(ret == (n), \ + "write error in %s: %ld != %ld (%s)", \ + f->name.c_str(), ret, size_t(n), strerror(errno)); \ + } + +#define WRITE1(x) WRITEANDCHECK(&(x), 1) + +#define WRITEVECTOR(vec) { \ + size_t size = (vec).size (); \ + WRITEANDCHECK (&size, 1); \ + WRITEANDCHECK ((vec).data (), size); \ + } + + + +/************************************************************* + * Write + **************************************************************/ +static void write_index_header (const Index *idx, IOWriter *f) { + WRITE1 (idx->d); + WRITE1 (idx->ntotal); + Index::idx_t dummy = 1 << 20; + WRITE1 (dummy); + WRITE1 (dummy); + WRITE1 (idx->is_trained); + WRITE1 (idx->metric_type); + if (idx->metric_type > 1) { + WRITE1 (idx->metric_arg); + } +} + +void write_VectorTransform (const VectorTransform *vt, IOWriter *f) { + if (const LinearTransform * lt = + dynamic_cast < const LinearTransform *> (vt)) { + if (dynamic_cast(lt)) { + uint32_t h = fourcc ("rrot"); + WRITE1 (h); + } else if (const PCAMatrix * pca = + dynamic_cast(lt)) { + uint32_t h = fourcc ("PcAm"); + WRITE1 (h); + WRITE1 (pca->eigen_power); + WRITE1 (pca->random_rotation); + WRITE1 (pca->balanced_bins); + WRITEVECTOR (pca->mean); + WRITEVECTOR (pca->eigenvalues); + WRITEVECTOR (pca->PCAMat); + } else if (const ITQMatrix * itqm = + dynamic_cast(lt)) { + uint32_t h = fourcc ("Viqm"); + WRITE1 (h); + WRITE1 (itqm->max_iter); + WRITE1 (itqm->seed); + } else { + // generic LinearTransform (includes OPQ) + uint32_t h = fourcc ("LTra"); + WRITE1 (h); + } + WRITE1 (lt->have_bias); + WRITEVECTOR (lt->A); + WRITEVECTOR (lt->b); + } else if (const RemapDimensionsTransform *rdt = + dynamic_cast(vt)) { + uint32_t h = fourcc ("RmDT"); + WRITE1 (h); + WRITEVECTOR (rdt->map); + } else if (const NormalizationTransform *nt = + dynamic_cast(vt)) { + uint32_t h = fourcc ("VNrm"); + WRITE1 (h); + WRITE1 (nt->norm); + } else if (const CenteringTransform *ct = + dynamic_cast(vt)) { + uint32_t h = fourcc ("VCnt"); + WRITE1 (h); + WRITEVECTOR (ct->mean); + } else if (const ITQTransform *itqt = + dynamic_cast (vt)) { + uint32_t h = fourcc ("Viqt"); + WRITE1 (h); + WRITEVECTOR (itqt->mean); + WRITE1 (itqt->do_pca); + write_VectorTransform (&itqt->itq, f); + write_VectorTransform (&itqt->pca_then_itq, f); + } else { + FAISS_THROW_MSG ("cannot serialize this"); + } + // common fields + WRITE1 (vt->d_in); + WRITE1 
(vt->d_out); + WRITE1 (vt->is_trained); +} + +void write_ProductQuantizer (const ProductQuantizer *pq, IOWriter *f) { + WRITE1 (pq->d); + WRITE1 (pq->M); + WRITE1 (pq->nbits); + WRITEVECTOR (pq->centroids); +} + +static void write_ScalarQuantizer ( + const ScalarQuantizer *ivsc, IOWriter *f) { + WRITE1 (ivsc->qtype); + WRITE1 (ivsc->rangestat); + WRITE1 (ivsc->rangestat_arg); + WRITE1 (ivsc->d); + WRITE1 (ivsc->code_size); + WRITEVECTOR (ivsc->trained); +} + +void write_InvertedLists (const InvertedLists *ils, IOWriter *f) { + if (ils == nullptr) { + uint32_t h = fourcc ("il00"); + WRITE1 (h); + } else if (const auto & ails = + dynamic_cast(ils)) { + uint32_t h = fourcc ("ilar"); + WRITE1 (h); + WRITE1 (ails->nlist); + WRITE1 (ails->code_size); + // here we store either as a full or a sparse data buffer + size_t n_non0 = 0; + for (size_t i = 0; i < ails->nlist; i++) { + if (ails->ids[i].size() > 0) + n_non0++; + } + if (n_non0 > ails->nlist / 2) { + uint32_t list_type = fourcc("full"); + WRITE1 (list_type); + std::vector sizes; + for (size_t i = 0; i < ails->nlist; i++) { + sizes.push_back (ails->ids[i].size()); + } + WRITEVECTOR (sizes); + } else { + int list_type = fourcc("sprs"); // sparse + WRITE1 (list_type); + std::vector sizes; + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + sizes.push_back (i); + sizes.push_back (n); + } + } + WRITEVECTOR (sizes); + } + // make a single contiguous data buffer (useful for mmapping) + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size); + WRITEANDCHECK (ails->ids[i].data(), n); + } + } + } else if (const auto & oa = + dynamic_cast(ils)) { + uint32_t h = fourcc("iloa"); + WRITE1 (h); + WRITE1 (oa->nlist); + WRITE1 (oa->code_size); + WRITEVECTOR(oa->readonly_length); + size_t n = oa->pin_readonly_ids->size() / sizeof(InvertedLists::idx_t); + WRITE1(n); +// WRITEANDCHECK(oa->readonly_ids.data(), n); +// WRITEANDCHECK(oa->readonly_codes.data(), n * oa->code_size); + WRITEANDCHECK((InvertedLists::idx_t *) oa->pin_readonly_ids->data, n); + WRITEANDCHECK((uint8_t *) oa->pin_readonly_codes->data, n * oa->code_size); + } else if (const auto & od = + dynamic_cast(ils)) { + uint32_t h = fourcc ("ilod"); + WRITE1 (h); + WRITE1 (ils->nlist); + WRITE1 (ils->code_size); + // this is a POD object + WRITEVECTOR (od->lists); + + { + std::vector v( + od->slots.begin(), od->slots.end()); + WRITEVECTOR(v); + } + { + std::vector x(od->filename.begin(), od->filename.end()); + WRITEVECTOR(x); + } + WRITE1(od->totsize); + + } else { + fprintf(stderr, "WARN! 
write_InvertedLists: unsupported invlist type, " + "saving null invlist\n"); + uint32_t h = fourcc ("il00"); + WRITE1 (h); + } +} + + +void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { + FileIOWriter writer(fname); + write_ProductQuantizer (pq, &writer); +} + +static void write_HNSW (const HNSW *hnsw, IOWriter *f) { + + WRITEVECTOR (hnsw->assign_probas); + WRITEVECTOR (hnsw->cum_nneighbor_per_level); + WRITEVECTOR (hnsw->levels); + WRITEVECTOR (hnsw->offsets); + WRITEVECTOR (hnsw->neighbors); + + WRITE1 (hnsw->entry_point); + WRITE1 (hnsw->max_level); + WRITE1 (hnsw->efConstruction); + WRITE1 (hnsw->efSearch); + WRITE1 (hnsw->upper_beam); +} + +static void write_ivf_header (const IndexIVF *ivf, IOWriter *f) { + write_index_header (ivf, f); + WRITE1 (ivf->nlist); + WRITE1 (ivf->nprobe); + write_index (ivf->quantizer, f); + WRITE1 (ivf->maintain_direct_map); + WRITEVECTOR (ivf->direct_map); +} + +void write_index (const Index *idx, IOWriter *f) { + if (const IndexFlat * idxf = dynamic_cast (idx)) { + uint32_t h = fourcc ( + idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" : + idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr); + WRITE1 (h); + write_index_header (idx, f); + WRITEVECTOR (idxf->xb); + } else if(const IndexLSH * idxl = dynamic_cast (idx)) { + uint32_t h = fourcc ("IxHe"); + WRITE1 (h); + write_index_header (idx, f); + WRITE1 (idxl->nbits); + WRITE1 (idxl->rotate_data); + WRITE1 (idxl->train_thresholds); + WRITEVECTOR (idxl->thresholds); + WRITE1 (idxl->bytes_per_vec); + write_VectorTransform (&idxl->rrot, f); + WRITEVECTOR (idxl->codes); + } else if(const IndexPQ * idxp = dynamic_cast (idx)) { + uint32_t h = fourcc ("IxPq"); + WRITE1 (h); + write_index_header (idx, f); + write_ProductQuantizer (&idxp->pq, f); + WRITEVECTOR (idxp->codes); + // search params -- maybe not useful to store? 
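+        // (these fields only affect query behaviour, but persisting them makes a reloaded index search the same way)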
+ WRITE1 (idxp->search_type); + WRITE1 (idxp->encode_signs); + WRITE1 (idxp->polysemous_ht); + } else if(const Index2Layer * idxp = + dynamic_cast (idx)) { + uint32_t h = fourcc ("Ix2L"); + WRITE1 (h); + write_index_header (idx, f); + write_index (idxp->q1.quantizer, f); + WRITE1 (idxp->q1.nlist); + WRITE1 (idxp->q1.quantizer_trains_alone); + write_ProductQuantizer (&idxp->pq, f); + WRITE1 (idxp->code_size_1); + WRITE1 (idxp->code_size_2); + WRITE1 (idxp->code_size); + WRITEVECTOR (idxp->codes); + } else if(const IndexScalarQuantizer * idxs = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IxSQ"); + WRITE1 (h); + write_index_header (idx, f); + write_ScalarQuantizer (&idxs->sq, f); + WRITEVECTOR (idxs->codes); + } else if(const IndexLattice * idxl = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IxLa"); + WRITE1 (h); + WRITE1 (idxl->d); + WRITE1 (idxl->nsq); + WRITE1 (idxl->scale_nbit); + WRITE1 (idxl->zn_sphere_codec.r2); + write_index_header (idx, f); + WRITEVECTOR (idxl->trained); + } else if(const IndexIVFFlatDedup * ivfl = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IwFd"); + WRITE1 (h); + write_ivf_header (ivfl, f); + { + std::vector tab (2 * ivfl->instances.size()); + long i = 0; + for (auto it = ivfl->instances.begin(); + it != ivfl->instances.end(); ++it) { + tab[i++] = it->first; + tab[i++] = it->second; + } + WRITEVECTOR (tab); + } + write_InvertedLists (ivfl->invlists, f); + } else if(const IndexIVFFlat * ivfl = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IwFl"); + WRITE1 (h); + write_ivf_header (ivfl, f); + write_InvertedLists (ivfl->invlists, f); + } else if(const IndexIVFScalarQuantizer * ivsc = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IwSq"); + WRITE1 (h); + write_ivf_header (ivsc, f); + write_ScalarQuantizer (&ivsc->sq, f); + WRITE1 (ivsc->code_size); + WRITE1 (ivsc->by_residual); + write_InvertedLists (ivsc->invlists, f); + } else if(const IndexIVFSQHybrid *ivfsqhbyrid = + dynamic_cast(idx)) { + uint32_t h = fourcc ("ISqH"); + WRITE1 (h); + write_ivf_header (ivfsqhbyrid, f); + write_ScalarQuantizer (&ivfsqhbyrid->sq, f); + WRITE1 (ivfsqhbyrid->code_size); + WRITE1 (ivfsqhbyrid->by_residual); + write_InvertedLists (ivfsqhbyrid->invlists, f); + } else if(const IndexIVFSpectralHash *ivsp = + dynamic_cast(idx)) { + uint32_t h = fourcc ("IwSh"); + WRITE1 (h); + write_ivf_header (ivsp, f); + write_VectorTransform (ivsp->vt, f); + WRITE1 (ivsp->nbit); + WRITE1 (ivsp->period); + WRITE1 (ivsp->threshold_type); + WRITEVECTOR (ivsp->trained); + write_InvertedLists (ivsp->invlists, f); + } else if(const IndexIVFPQ * ivpq = + dynamic_cast (idx)) { + const IndexIVFPQR * ivfpqr = dynamic_cast (idx); + + uint32_t h = fourcc (ivfpqr ? 
"IwQR" : "IwPQ"); + WRITE1 (h); + write_ivf_header (ivpq, f); + WRITE1 (ivpq->by_residual); + WRITE1 (ivpq->code_size); + write_ProductQuantizer (&ivpq->pq, f); + write_InvertedLists (ivpq->invlists, f); + if (ivfpqr) { + write_ProductQuantizer (&ivfpqr->refine_pq, f); + WRITEVECTOR (ivfpqr->refine_codes); + WRITE1 (ivfpqr->k_factor); + } + + } else if(const IndexPreTransform * ixpt = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IxPT"); + WRITE1 (h); + write_index_header (ixpt, f); + int nt = ixpt->chain.size(); + WRITE1 (nt); + for (int i = 0; i < nt; i++) + write_VectorTransform (ixpt->chain[i], f); + write_index (ixpt->index, f); + } else if(const MultiIndexQuantizer * imiq = + dynamic_cast (idx)) { + uint32_t h = fourcc ("Imiq"); + WRITE1 (h); + write_index_header (imiq, f); + write_ProductQuantizer (&imiq->pq, f); + } else if(const IndexRefineFlat * idxrf = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IxRF"); + WRITE1 (h); + write_index_header (idxrf, f); + write_index (idxrf->base_index, f); + write_index (&idxrf->refine_index, f); + WRITE1 (idxrf->k_factor); + } else if(const IndexIDMap * idxmap = + dynamic_cast (idx)) { + uint32_t h = + dynamic_cast (idx) ? fourcc ("IxM2") : + fourcc ("IxMp"); + // no need to store additional info for IndexIDMap2 + WRITE1 (h); + write_index_header (idxmap, f); + write_index (idxmap->index, f); + WRITEVECTOR (idxmap->id_map); + } else if(const IndexHNSW * idxhnsw = + dynamic_cast (idx)) { + uint32_t h = + dynamic_cast(idx) ? fourcc("IHNf") : + dynamic_cast(idx) ? fourcc("IHNp") : + dynamic_cast(idx) ? fourcc("IHNs") : + dynamic_cast(idx) ? fourcc("IHN2") : + 0; + FAISS_THROW_IF_NOT (h != 0); + WRITE1 (h); + write_index_header (idxhnsw, f); + write_HNSW (&idxhnsw->hnsw, f); + write_index (idxhnsw->storage, f); + } else { + FAISS_THROW_MSG ("don't know how to serialize this type of index"); + } +} + +void write_index (const Index *idx, FILE *f) { + FileIOWriter writer(f); + write_index (idx, &writer); +} + +void write_index (const Index *idx, const char *fname) { + FileIOWriter writer(fname); + write_index (idx, &writer); +} + +void write_VectorTransform (const VectorTransform *vt, const char *fname) { + FileIOWriter writer(fname); + write_VectorTransform (vt, &writer); +} + + +/************************************************************* + * Write binary indexes + **************************************************************/ + + +static void write_index_binary_header (const IndexBinary *idx, IOWriter *f) { + WRITE1 (idx->d); + WRITE1 (idx->code_size); + WRITE1 (idx->ntotal); + WRITE1 (idx->is_trained); + WRITE1 (idx->metric_type); +} + +static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) { + write_index_binary_header (ivf, f); + WRITE1 (ivf->nlist); + WRITE1 (ivf->nprobe); + write_index_binary (ivf->quantizer, f); + WRITE1 (ivf->maintain_direct_map); + WRITEVECTOR (ivf->direct_map); +} + +void write_index_binary (const IndexBinary *idx, IOWriter *f) { + if (const IndexBinaryFlat *idxf = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBxF"); + WRITE1 (h); + write_index_binary_header (idx, f); + WRITEVECTOR (idxf->xb); + } else if (const IndexBinaryIVF *ivf = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBwF"); + WRITE1 (h); + write_binary_ivf_header (ivf, f); + write_InvertedLists (ivf->invlists, f); + } else if(const IndexBinaryFromFloat * idxff = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBFf"); + WRITE1 (h); + write_index_binary_header (idxff, f); + write_index (idxff->index, f); + } else if (const 
IndexBinaryHNSW *idxhnsw = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBHf"); + WRITE1 (h); + write_index_binary_header (idxhnsw, f); + write_HNSW (&idxhnsw->hnsw, f); + write_index_binary (idxhnsw->storage, f); + } else if(const IndexBinaryIDMap * idxmap = + dynamic_cast (idx)) { + uint32_t h = + dynamic_cast (idx) ? fourcc ("IBM2") : + fourcc ("IBMp"); + // no need to store additional info for IndexIDMap2 + WRITE1 (h); + write_index_binary_header (idxmap, f); + write_index_binary (idxmap->index, f); + WRITEVECTOR (idxmap->id_map); + } else { + FAISS_THROW_MSG ("don't know how to serialize this type of index"); + } +} + +void write_index_binary (const IndexBinary *idx, FILE *f) { + FileIOWriter writer(f); + write_index_binary(idx, &writer); +} + +void write_index_binary (const IndexBinary *idx, const char *fname) { + FileIOWriter writer(fname); + write_index_binary (idx, &writer); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/io.cpp b/core/src/index/thirdparty/faiss/impl/io.cpp new file mode 100644 index 0000000000..e8ffca6bc9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/io.cpp @@ -0,0 +1,142 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include +#include + +#include +#include + + +namespace faiss { + + +/*********************************************************************** + * IO functions + ***********************************************************************/ + + +int IOReader::fileno () +{ + FAISS_THROW_MSG ("IOReader does not support memory mapping"); +} + +int IOWriter::fileno () +{ + FAISS_THROW_MSG ("IOWriter does not support memory mapping"); +} + +/*********************************************************************** + * IO Vector + ***********************************************************************/ + + + +size_t VectorIOWriter::operator()( + const void *ptr, size_t size, size_t nitems) +{ + size_t bytes = size * nitems; + if (bytes > 0) { + size_t o = data.size(); + data.resize(o + bytes); + memcpy (&data[o], ptr, size * nitems); + } + return nitems; +} + +size_t VectorIOReader::operator()( + void *ptr, size_t size, size_t nitems) +{ + if (rp >= data.size()) return 0; + size_t nremain = (data.size() - rp) / size; + if (nremain < nitems) nitems = nremain; + if (size * nitems > 0) { + memcpy (ptr, &data[rp], size * nitems); + rp += size * nitems; + } + return nitems; +} + + + + +/*********************************************************************** + * IO File + ***********************************************************************/ + + + +FileIOReader::FileIOReader(FILE *rf): f(rf) {} + +FileIOReader::FileIOReader(const char * fname) +{ + name = fname; + f = fopen(fname, "rb"); + FAISS_THROW_IF_NOT_FMT (f, "could not open %s for reading: %s", + fname, strerror(errno)); + need_close = true; +} + +FileIOReader::~FileIOReader() { + if (need_close) { + int ret = fclose(f); + if (ret != 0) {// we cannot raise and exception in the destructor + fprintf(stderr, "file %s close error: %s", + name.c_str(), strerror(errno)); + } + } +} + +size_t FileIOReader::operator()(void *ptr, size_t size, size_t nitems) { + return fread(ptr, size, nitems, f); +} + +int FileIOReader::fileno() { + return ::fileno (f); +} + + +FileIOWriter::FileIOWriter(FILE *wf): f(wf) {} + +FileIOWriter::FileIOWriter(const char * fname) +{ + name = fname; + f = fopen(fname, "wb"); + 
FAISS_THROW_IF_NOT_FMT (f, "could not open %s for writing: %s", + fname, strerror(errno)); + need_close = true; +} + +FileIOWriter::~FileIOWriter() { + if (need_close) { + int ret = fclose(f); + if (ret != 0) { + // we cannot raise and exception in the destructor + fprintf(stderr, "file %s close error: %s", + name.c_str(), strerror(errno)); + } + } +} + +size_t FileIOWriter::operator()(const void *ptr, size_t size, size_t nitems) { + return fwrite(ptr, size, nitems, f); +} + +int FileIOWriter::fileno() { + return ::fileno (f); +} + +uint32_t fourcc (const char sx[4]) { + assert(4 == strlen(sx)); + const unsigned char *x = (unsigned char*)sx; + return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24; +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/io.h b/core/src/index/thirdparty/faiss/impl/io.h new file mode 100644 index 0000000000..173d87da63 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/io.h @@ -0,0 +1,98 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/*********************************************************** + * Abstract I/O objects + ***********************************************************/ + +#pragma once + +#include +#include +#include + +#include + +namespace faiss { + + +struct IOReader { + // name that can be used in error messages + std::string name; + + // fread + virtual size_t operator()( + void *ptr, size_t size, size_t nitems) = 0; + + // return a file number that can be memory-mapped + virtual int fileno (); + + virtual ~IOReader() {} +}; + +struct IOWriter { + // name that can be used in error messages + std::string name; + + // fwrite + virtual size_t operator()( + const void *ptr, size_t size, size_t nitems) = 0; + + // return a file number that can be memory-mapped + virtual int fileno (); + + virtual ~IOWriter() {} +}; + + +struct VectorIOReader:IOReader { + std::vector data; + size_t rp = 0; + size_t operator()(void *ptr, size_t size, size_t nitems) override; +}; + +struct VectorIOWriter:IOWriter { + std::vector data; + size_t operator()(const void *ptr, size_t size, size_t nitems) override; +}; + +struct FileIOReader: IOReader { + FILE *f = nullptr; + bool need_close = false; + + FileIOReader(FILE *rf); + + FileIOReader(const char * fname); + + ~FileIOReader() override; + + size_t operator()(void *ptr, size_t size, size_t nitems) override; + + int fileno() override; +}; + +struct FileIOWriter: IOWriter { + FILE *f = nullptr; + bool need_close = false; + + FileIOWriter(FILE *wf); + + FileIOWriter(const char * fname); + + ~FileIOWriter() override; + + size_t operator()(const void *ptr, size_t size, size_t nitems) override; + + int fileno() override; +}; + +/// cast a 4-character string to a uint32_t that can be written and read easily +uint32_t fourcc (const char sx[4]); + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/lattice_Zn.cpp b/core/src/index/thirdparty/faiss/impl/lattice_Zn.cpp new file mode 100644 index 0000000000..ea3f19bd6e --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/lattice_Zn.cpp @@ -0,0 +1,712 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace faiss { + +/******************************************** + * small utility functions + ********************************************/ + +namespace { + +inline float sqr(float x) { + return x * x; +} + + +typedef std::vector point_list_t; + +struct Comb { + std::vector tab; // Pascal's triangle + int nmax; + + explicit Comb(int nmax): nmax(nmax) { + tab.resize(nmax * nmax, 0); + tab[0] = 1; + for(int i = 1; i < nmax; i++) { + tab[i * nmax] = 1; + for(int j = 1; j <= i; j++) { + tab[i * nmax + j] = + tab[(i - 1) * nmax + j] + + tab[(i - 1) * nmax + (j - 1)]; + } + + } + } + + uint64_t operator()(int n, int p) const { + assert (n < nmax && p < nmax); + if (p > n) return 0; + return tab[n * nmax + p]; + } +}; + +Comb comb(100); + + + +// compute combinations of n integer values <= v that sum up to total (squared) +point_list_t sum_of_sq (float total, int v, int n, float add = 0) { + if (total < 0) { + return point_list_t(); + } else if (n == 1) { + while (sqr(v + add) > total) v--; + if (sqr(v + add) == total) { + return point_list_t(1, v + add); + } else { + return point_list_t(); + } + } else { + point_list_t res; + while (v >= 0) { + point_list_t sub_points = + sum_of_sq (total - sqr(v + add), v, n - 1, add); + for (size_t i = 0; i < sub_points.size(); i += n - 1) { + res.push_back (v + add); + for (int j = 0; j < n - 1; j++) { + res.push_back(sub_points[i + j]); + } + } + v--; + } + return res; + } +} + +int decode_comb_1 (uint64_t *n, int k1, int r) { + while (comb(r, k1) > *n) { + r--; + } + *n -= comb(r, k1); + return r; +} + +// optimized version for < 64 bits +long repeats_encode_64 ( + const std::vector & repeats, + int dim, const float *c) +{ + uint64_t coded = 0; + int nfree = dim; + uint64_t code = 0, shift = 1; + for (auto r = repeats.begin(); r != repeats.end(); ++r) { + int rank = 0, occ = 0; + uint64_t code_comb = 0; + uint64_t tosee = ~coded; + for(;;) { + // directly jump to next available slot. 
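+            // __builtin_ctzl(tosee) is the index of the lowest set bit, i.e. the next slot that has not been coded yet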
+ int i = __builtin_ctzl(tosee); + tosee &= ~(1UL << i) ; + if (c[i] == r->val) { + code_comb += comb(rank, occ + 1); + occ++; + coded |= 1UL << i; + if (occ == r->n) break; + } + rank++; + } + uint64_t max_comb = comb(nfree, r->n); + code += shift * code_comb; + shift *= max_comb; + nfree -= r->n; + } + return code; +} + + +void repeats_decode_64( + const std::vector & repeats, + int dim, uint64_t code, float *c) +{ + uint64_t decoded = 0; + int nfree = dim; + for (auto r = repeats.begin(); r != repeats.end(); ++r) { + uint64_t max_comb = comb(nfree, r->n); + uint64_t code_comb = code % max_comb; + code /= max_comb; + + int occ = 0; + int rank = nfree; + int next_rank = decode_comb_1 (&code_comb, r->n, rank); + uint64_t tosee = ((1UL << dim) - 1) ^ decoded; + for(;;) { + int i = 63 - __builtin_clzl(tosee); + tosee &= ~(1UL << i); + rank--; + if (rank == next_rank) { + decoded |= 1UL << i; + c[i] = r->val; + occ++; + if (occ == r->n) break; + next_rank = decode_comb_1 ( + &code_comb, r->n - occ, next_rank); + } + } + nfree -= r->n; + } + +} + + + +} // anonymous namespace + +Repeats::Repeats (int dim, const float *c): dim(dim) +{ + for(int i = 0; i < dim; i++) { + int j = 0; + for(;;) { + if (j == repeats.size()) { + repeats.push_back(Repeat{c[i], 1}); + break; + } + if (repeats[j].val == c[i]) { + repeats[j].n++; + break; + } + j++; + } + } +} + + +long Repeats::count () const +{ + long accu = 1; + int remain = dim; + for (int i = 0; i < repeats.size(); i++) { + accu *= comb(remain, repeats[i].n); + remain -= repeats[i].n; + } + return accu; +} + + + +// version with a bool vector that works for > 64 dim +long Repeats::encode(const float *c) const +{ + if (dim < 64) { + return repeats_encode_64 (repeats, dim, c); + } + std::vector coded(dim, false); + int nfree = dim; + uint64_t code = 0, shift = 1; + for (auto r = repeats.begin(); r != repeats.end(); ++r) { + int rank = 0, occ = 0; + uint64_t code_comb = 0; + for (int i = 0; i < dim; i++) { + if (!coded[i]) { + if (c[i] == r->val) { + code_comb += comb(rank, occ + 1); + occ++; + coded[i] = true; + if (occ == r->n) break; + } + rank++; + } + } + uint64_t max_comb = comb(nfree, r->n); + code += shift * code_comb; + shift *= max_comb; + nfree -= r->n; + } + return code; +} + + + +void Repeats::decode(uint64_t code, float *c) const +{ + if (dim < 64) { + repeats_decode_64 (repeats, dim, code, c); + return; + } + + std::vector decoded(dim, false); + int nfree = dim; + for (auto r = repeats.begin(); r != repeats.end(); ++r) { + uint64_t max_comb = comb(nfree, r->n); + uint64_t code_comb = code % max_comb; + code /= max_comb; + + int occ = 0; + int rank = nfree; + int next_rank = decode_comb_1 (&code_comb, r->n, rank); + for (int i = dim - 1; i >= 0; i--) { + if (!decoded[i]) { + rank--; + if (rank == next_rank) { + decoded[i] = true; + c[i] = r->val; + occ++; + if (occ == r->n) break; + next_rank = decode_comb_1 ( + &code_comb, r->n - occ, next_rank); + } + } + } + nfree -= r->n; + } + +} + + + +/******************************************** + * EnumeratedVectors functions + ********************************************/ + + +void EnumeratedVectors::encode_multi(size_t n, const float *c, + uint64_t * codes) const +{ +#pragma omp parallel if (n > 1000) + { +#pragma omp for + for(int i = 0; i < n; i++) { + codes[i] = encode(c + i * dim); + } + } +} + + +void EnumeratedVectors::decode_multi(size_t n, const uint64_t * codes, + float *c) const +{ +#pragma omp parallel if (n > 1000) + { +#pragma omp for + for(int i = 0; i < n; i++) { + 
decode(codes[i], c + i * dim); + } + } +} + +void EnumeratedVectors::find_nn ( + size_t nc, const uint64_t * codes, + size_t nq, const float *xq, + long *labels, float *distances) +{ + for (long i = 0; i < nq; i++) { + distances[i] = -1e20; + labels[i] = -1; + } + + float c[dim]; + for(long i = 0; i < nc; i++) { + uint64_t code = codes[nc]; + decode(code, c); + for (long j = 0; j < nq; j++) { + const float *x = xq + j * dim; + float dis = fvec_inner_product(x, c, dim); + if (dis > distances[j]) { + distances[j] = dis; + labels[j] = i; + } + } + } + +} + + +/********************************************************** + * ZnSphereSearch + **********************************************************/ + + +ZnSphereSearch::ZnSphereSearch(int dim, int r2): dimS(dim), r2(r2) { + voc = sum_of_sq(r2, int(ceil(sqrt(r2)) + 1), dim); + natom = voc.size() / dim; +} + +float ZnSphereSearch::search(const float *x, float *c) const { + float tmp[dimS * 2]; + int tmp_int[dimS]; + return search(x, c, tmp, tmp_int); +} + +float ZnSphereSearch::search(const float *x, float *c, + float *tmp, // size 2 *dim + int *tmp_int, // size dim + int *ibest_out + ) const { + int dim = dimS; + assert (natom > 0); + int *o = tmp_int; + float *xabs = tmp; + float *xperm = tmp + dim; + + // argsort + for (int i = 0; i < dim; i++) { + o[i] = i; + xabs[i] = fabsf(x[i]); + } + std::sort(o, o + dim, [xabs](int a, int b) { + return xabs[a] > xabs[b]; + }); + for (int i = 0; i < dim; i++) { + xperm[i] = xabs[o[i]]; + } + // find best + int ibest = -1; + float dpbest = -100; + for (int i = 0; i < natom; i++) { + float dp = fvec_inner_product (voc.data() + i * dim, xperm, dim); + if (dp > dpbest) { + dpbest = dp; + ibest = i; + } + } + // revert sort + const float *cin = voc.data() + ibest * dim; + for (int i = 0; i < dim; i++) { + c[o[i]] = copysignf (cin[i], x[o[i]]); + } + if (ibest_out) { + *ibest_out = ibest; + } + return dpbest; +} + +void ZnSphereSearch::search_multi(int n, const float *x, + float *c_out, + float *dp_out) { +#pragma omp parallel if (n > 1000) + { +#pragma omp for + for(int i = 0; i < n; i++) { + dp_out[i] = search(x + i * dimS, c_out + i * dimS); + } + } +} + + +/********************************************************** + * ZnSphereCodec + **********************************************************/ + +ZnSphereCodec::ZnSphereCodec(int dim, int r2): + ZnSphereSearch(dim, r2), + EnumeratedVectors(dim) +{ + nv = 0; + for (int i = 0; i < natom; i++) { + Repeats repeats(dim, &voc[i * dim]); + CodeSegment cs(repeats); + cs.c0 = nv; + Repeat &br = repeats.repeats.back(); + cs.signbits = br.val == 0 ? 
dim - br.n : dim; + code_segments.push_back(cs); + nv += repeats.count() << cs.signbits; + } + + uint64_t nvx = nv; + code_size = 0; + while (nvx > 0) { + nvx >>= 8; + code_size++; + } +} + +uint64_t ZnSphereCodec::search_and_encode(const float *x) const { + float tmp[dim * 2]; + int tmp_int[dim]; + int ano; // atom number + float c[dim]; + search(x, c, tmp, tmp_int, &ano); + uint64_t signs = 0; + float cabs[dim]; + int nnz = 0; + for (int i = 0; i < dim; i++) { + cabs[i] = fabs(c[i]); + if (c[i] != 0) { + if (c[i] < 0) { + signs |= 1UL << nnz; + } + nnz ++; + } + } + const CodeSegment &cs = code_segments[ano]; + assert(nnz == cs.signbits); + uint64_t code = cs.c0 + signs; + code += cs.encode(cabs) << cs.signbits; + return code; +} + +uint64_t ZnSphereCodec::encode(const float *x) const +{ + return search_and_encode(x); +} + + +void ZnSphereCodec::decode(uint64_t code, float *c) const { + int i0 = 0, i1 = natom; + while (i0 + 1 < i1) { + int imed = (i0 + i1) / 2; + if (code_segments[imed].c0 <= code) i0 = imed; + else i1 = imed; + } + const CodeSegment &cs = code_segments[i0]; + code -= cs.c0; + uint64_t signs = code; + code >>= cs.signbits; + cs.decode(code, c); + + int nnz = 0; + for (int i = 0; i < dim; i++) { + if (c[i] != 0) { + if (signs & (1UL << nnz)) { + c[i] = -c[i]; + } + nnz ++; + } + } +} + + +/************************************************************** + * ZnSphereCodecRec + **************************************************************/ + +uint64_t ZnSphereCodecRec::get_nv(int ld, int r2a) const +{ + return all_nv[ld * (r2 + 1) + r2a]; +} + + +uint64_t ZnSphereCodecRec::get_nv_cum(int ld, int r2t, int r2a) const +{ + return all_nv_cum[(ld * (r2 + 1) + r2t) * (r2 + 1) + r2a]; +} + +void ZnSphereCodecRec::set_nv_cum(int ld, int r2t, int r2a, uint64_t cum) +{ + all_nv_cum[(ld * (r2 + 1) + r2t) * (r2 + 1) + r2a] = cum; +} + + +ZnSphereCodecRec::ZnSphereCodecRec(int dim, int r2): + EnumeratedVectors(dim), r2(r2) +{ + log2_dim = 0; + while (dim > (1 << log2_dim)) { + log2_dim++; + } + assert(dim == (1 << log2_dim) || + !"dimension must be a power of 2"); + + all_nv.resize((log2_dim + 1) * (r2 + 1)); + all_nv_cum.resize((log2_dim + 1) * (r2 + 1) * (r2 + 1)); + + for (int r2a = 0; r2a <= r2; r2a++) { + int r = int(sqrt(r2a)); + if (r * r == r2a) { + all_nv[r2a] = r == 0 ? 
1 : 2; + } else { + all_nv[r2a] = 0; + } + } + + for (int ld = 1; ld <= log2_dim; ld++) { + + for (int r2sub = 0; r2sub <= r2; r2sub++) { + uint64_t nv = 0; + for (int r2a = 0; r2a <= r2sub; r2a++) { + int r2b = r2sub - r2a; + set_nv_cum(ld, r2sub, r2a, nv); + nv += get_nv(ld - 1, r2a) * get_nv(ld - 1, r2b); + } + all_nv[ld * (r2 + 1) + r2sub] = nv; + } + } + nv = get_nv(log2_dim, r2); + + uint64_t nvx = nv; + code_size = 0; + while (nvx > 0) { + nvx >>= 8; + code_size++; + } + + int cache_level = std::min(3, log2_dim - 1); + decode_cache_ld = 0; + assert(cache_level <= log2_dim); + decode_cache.resize((r2 + 1)); + + for (int r2sub = 0; r2sub <= r2; r2sub++) { + int ld = cache_level; + uint64_t nvi = get_nv(ld, r2sub); + std::vector &cache = decode_cache[r2sub]; + int dimsub = (1 << cache_level); + cache.resize (nvi * dimsub); + float c[dim]; + uint64_t code0 = get_nv_cum(cache_level + 1, r2, + r2 - r2sub); + for (int i = 0; i < nvi; i++) { + decode(i + code0, c); + memcpy(&cache[i * dimsub], c + dim - dimsub, + dimsub * sizeof(*c)); + } + } + decode_cache_ld = cache_level; +} + +uint64_t ZnSphereCodecRec::encode(const float *c) const +{ + return encode_centroid(c); +} + + + +uint64_t ZnSphereCodecRec::encode_centroid(const float *c) const +{ + uint64_t codes[dim]; + int norm2s[dim]; + for(int i = 0; i < dim; i++) { + if (c[i] == 0) { + codes[i] = 0; + norm2s[i] = 0; + } else { + int r2i = int(c[i] * c[i]); + norm2s[i] = r2i; + codes[i] = c[i] >= 0 ? 0 : 1; + } + } + int dim2 = dim / 2; + for(int ld = 1; ld <= log2_dim; ld++) { + for (int i = 0; i < dim2; i++) { + int r2a = norm2s[2 * i]; + int r2b = norm2s[2 * i + 1]; + + uint64_t code_a = codes[2 * i]; + uint64_t code_b = codes[2 * i + 1]; + + codes[i] = + get_nv_cum(ld, r2a + r2b, r2a) + + code_a * get_nv(ld - 1, r2b) + + code_b; + norm2s[i] = r2a + r2b; + } + dim2 /= 2; + } + return codes[0]; +} + + + +void ZnSphereCodecRec::decode(uint64_t code, float *c) const +{ + uint64_t codes[dim]; + int norm2s[dim]; + codes[0] = code; + norm2s[0] = r2; + + int dim2 = 1; + for(int ld = log2_dim; ld > decode_cache_ld; ld--) { + for (int i = dim2 - 1; i >= 0; i--) { + int r2sub = norm2s[i]; + int i0 = 0, i1 = r2sub + 1; + uint64_t codei = codes[i]; + const uint64_t *cum = + &all_nv_cum[(ld * (r2 + 1) + r2sub) * (r2 + 1)]; + while (i1 > i0 + 1) { + int imed = (i0 + i1) / 2; + if (cum[imed] <= codei) + i0 = imed; + else + i1 = imed; + } + int r2a = i0, r2b = r2sub - i0; + codei -= cum[r2a]; + norm2s[2 * i] = r2a; + norm2s[2 * i + 1] = r2b; + + uint64_t code_a = codei / get_nv(ld - 1, r2b); + uint64_t code_b = codei % get_nv(ld - 1, r2b); + + codes[2 * i] = code_a; + codes[2 * i + 1] = code_b; + + } + dim2 *= 2; + } + + if (decode_cache_ld == 0) { + for(int i = 0; i < dim; i++) { + if (norm2s[i] == 0) { + c[i] = 0; + } else { + float r = sqrt(norm2s[i]); + assert(r * r == norm2s[i]); + c[i] = codes[i] == 0 ? r : -r; + } + } + } else { + int subdim = 1 << decode_cache_ld; + assert ((dim2 * subdim) == dim); + + for(int i = 0; i < dim2; i++) { + + const std::vector & cache = + decode_cache[norm2s[i]]; + assert(codes[i] < cache.size()); + memcpy(c + i * subdim, + &cache[codes[i] * subdim], + sizeof(*c)* subdim); + } + } +} + +// if not use_rec, instanciate an arbitrary harmless znc_rec +ZnSphereCodecAlt::ZnSphereCodecAlt (int dim, int r2): + ZnSphereCodec (dim, r2), + use_rec ((dim & (dim - 1)) == 0), + znc_rec (use_rec ? dim : 8, + use_rec ? 
r2 : 14) +{} + +uint64_t ZnSphereCodecAlt::encode(const float *x) const +{ + if (!use_rec) { + // it's ok if the vector is not normalized + return ZnSphereCodec::encode(x); + } else { + // find nearest centroid + std::vector centroid(dim); + search (x, centroid.data()); + return znc_rec.encode(centroid.data()); + } +} + +void ZnSphereCodecAlt::decode(uint64_t code, float *c) const +{ + if (!use_rec) { + ZnSphereCodec::decode (code, c); + } else { + znc_rec.decode (code, c); + } +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/lattice_Zn.h b/core/src/index/thirdparty/faiss/impl/lattice_Zn.h new file mode 100644 index 0000000000..f346d1e4c5 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/lattice_Zn.h @@ -0,0 +1,199 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- +#ifndef FAISS_LATTICE_ZN_H +#define FAISS_LATTICE_ZN_H + +#include +#include +#include + +namespace faiss { + +/** returns the nearest vertex in the sphere to a query. Returns only + * the coordinates, not an id. + * + * Algorithm: all points are derived from a one atom vector up to a + * permutation and sign changes. The search function finds the most + * appropriate atom and transformation. + */ +struct ZnSphereSearch { + int dimS, r2; + int natom; + + /// size dim * ntatom + std::vector voc; + + ZnSphereSearch(int dim, int r2); + + /// find nearest centroid. x does not need to be normalized + float search(const float *x, float *c) const; + + /// full call. Requires externally-allocated temp space + float search(const float *x, float *c, + float *tmp, // size 2 *dim + int *tmp_int, // size dim + int *ibest_out = nullptr + ) const; + + // multi-threaded + void search_multi(int n, const float *x, + float *c_out, + float *dp_out); + +}; + + +/*************************************************************************** + * Support ids as well. + * + * Limitations: ids are limited to 64 bit + ***************************************************************************/ + +struct EnumeratedVectors { + /// size of the collection + uint64_t nv; + int dim; + + explicit EnumeratedVectors(int dim): nv(0), dim(dim) {} + + /// encode a vector from a collection + virtual uint64_t encode(const float *x) const = 0; + + /// decode it + virtual void decode(uint64_t code, float *c) const = 0; + + // call encode on nc vectors + void encode_multi (size_t nc, const float *c, + uint64_t * codes) const; + + // call decode on nc codes + void decode_multi (size_t nc, const uint64_t * codes, + float *c) const; + + // find the nearest neighbor of each xq + // (decodes and computes distances) + void find_nn (size_t n, const uint64_t * codes, + size_t nq, const float *xq, + long *idx, float *dis); + + virtual ~EnumeratedVectors() {} + +}; + +struct Repeat { + float val; + int n; +}; + +/** Repeats: used to encode a vector that has n occurrences of + * val. Encodes the signs and permutation of the vector. Useful for + * atoms. + */ +struct Repeats { + int dim; + std::vector repeats; + + // initialize from a template of the atom. 
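+    // e.g. dim = 4, c = {2, 2, 1, 0} yields repeats = {(val=2, n=2), (val=1, n=1), (val=0, n=1)}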
+ Repeats(int dim = 0, const float *c = nullptr); + + // count number of possible codes for this atom + long count() const; + + long encode(const float *c) const; + + void decode(uint64_t code, float *c) const; +}; + + +/** codec that can return ids for the encoded vectors + * + * uses the ZnSphereSearch to encode the vector by encoding the + * permutation and signs. Depends on ZnSphereSearch because it uses + * the atom numbers */ +struct ZnSphereCodec: ZnSphereSearch, EnumeratedVectors { + + struct CodeSegment:Repeats { + explicit CodeSegment(const Repeats & r): Repeats(r) {} + uint64_t c0; // first code assigned to segment + int signbits; + }; + + std::vector code_segments; + uint64_t nv; + size_t code_size; + + ZnSphereCodec(int dim, int r2); + + uint64_t search_and_encode(const float *x) const; + + void decode(uint64_t code, float *c) const override; + + /// takes vectors that do not need to be centroids + uint64_t encode(const float *x) const override; + +}; + +/** recursive sphere codec + * + * Uses a recursive decomposition on the dimensions to encode + * centroids found by the ZnSphereSearch. The codes are *not* + * compatible with the ones of ZnSpehreCodec + */ +struct ZnSphereCodecRec: EnumeratedVectors { + + int r2; + + int log2_dim; + int code_size; + + ZnSphereCodecRec(int dim, int r2); + + uint64_t encode_centroid(const float *c) const; + + void decode(uint64_t code, float *c) const override; + + /// vectors need to be centroids (does not work on arbitrary + /// vectors) + uint64_t encode(const float *x) const override; + + std::vector all_nv; + std::vector all_nv_cum; + + int decode_cache_ld; + std::vector > decode_cache; + + // nb of vectors in the sphere in dim 2^ld with r2 radius + uint64_t get_nv(int ld, int r2a) const; + + // cumulative version + uint64_t get_nv_cum(int ld, int r2t, int r2a) const; + void set_nv_cum(int ld, int r2t, int r2a, uint64_t v); + +}; + + +/** Codec that uses the recursive codec if dim is a power of 2 and + * the regular one otherwise */ +struct ZnSphereCodecAlt: ZnSphereCodec { + bool use_rec; + ZnSphereCodecRec znc_rec; + + ZnSphereCodecAlt (int dim, int r2); + + uint64_t encode(const float *x) const override; + + void decode(uint64_t code, float *c) const override; + +}; + + +}; + + +#endif diff --git a/core/src/index/thirdparty/faiss/index_factory.cpp b/core/src/index/thirdparty/faiss/index_factory.cpp new file mode 100644 index 0000000000..b51cf9119e --- /dev/null +++ b/core/src/index/thirdparty/faiss/index_factory.cpp @@ -0,0 +1,412 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * implementation of Hyper-parameter auto-tuning + */ + +#include + +#include +#include /* va_list, va_start, va_arg, va_end */ + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace faiss { + + +/*************************************************************** + * index_factory + ***************************************************************/ + +namespace { + +struct VTChain { + std::vector chain; + ~VTChain () { + for (int i = 0; i < chain.size(); i++) { + delete chain[i]; + } + } +}; + + +/// what kind of training does this coarse quantizer require? 
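+/// The result is assigned to quantizer_trains_alone (a Level1Quantizer field) below; roughly speaking,
+/// 0 clusters with the quantizer as part of IVF training, 1 hands the training set to the quantizer's
+/// own train(), and 2 trains centroids separately and adds them to the quantizer afterwards.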
+char get_trains_alone(const Index *coarse_quantizer) { + return + dynamic_cast(coarse_quantizer) ? 1 : + dynamic_cast(coarse_quantizer) ? 2 : + 0; +} + + +} + +Index *index_factory (int d, const char *description_in, MetricType metric) +{ + FAISS_THROW_IF_NOT(metric == METRIC_L2 || + metric == METRIC_INNER_PRODUCT); + VTChain vts; + Index *coarse_quantizer = nullptr; + Index *index = nullptr; + bool add_idmap = false; + bool make_IndexRefineFlat = false; + + ScopeDeleter1 del_coarse_quantizer, del_index; + + char description[strlen(description_in) + 1]; + char *ptr; + memcpy (description, description_in, strlen(description_in) + 1); + + int64_t ncentroids = -1; + bool use_2layer = false; + + for (char *tok = strtok_r (description, " ,", &ptr); + tok; + tok = strtok_r (nullptr, " ,", &ptr)) { + int d_out, opq_M, nbit, M, M2, pq_m, ncent, r2; + std::string stok(tok); + nbit = 8; + + // to avoid mem leaks with exceptions: + // do all tests before any instanciation + + VectorTransform *vt_1 = nullptr; + Index *coarse_quantizer_1 = nullptr; + Index *index_1 = nullptr; + + // VectorTransforms + if (sscanf (tok, "PCA%d", &d_out) == 1) { + vt_1 = new PCAMatrix (d, d_out); + d = d_out; + } else if (sscanf (tok, "PCAR%d", &d_out) == 1) { + vt_1 = new PCAMatrix (d, d_out, 0, true); + d = d_out; + } else if (sscanf (tok, "RR%d", &d_out) == 1) { + vt_1 = new RandomRotationMatrix (d, d_out); + d = d_out; + } else if (sscanf (tok, "PCAW%d", &d_out) == 1) { + vt_1 = new PCAMatrix (d, d_out, -0.5, false); + d = d_out; + } else if (sscanf (tok, "PCAWR%d", &d_out) == 1) { + vt_1 = new PCAMatrix (d, d_out, -0.5, true); + d = d_out; + } else if (sscanf (tok, "OPQ%d_%d", &opq_M, &d_out) == 2) { + vt_1 = new OPQMatrix (d, opq_M, d_out); + d = d_out; + } else if (sscanf (tok, "OPQ%d", &opq_M) == 1) { + vt_1 = new OPQMatrix (d, opq_M); + } else if (sscanf (tok, "ITQ%d", &d_out) == 1) { + vt_1 = new ITQTransform (d, d_out, true); + d = d_out; + } else if (stok == "ITQ") { + vt_1 = new ITQTransform (d, d, false); + } else if (sscanf (tok, "Pad%d", &d_out) == 1) { + if (d_out > d) { + vt_1 = new RemapDimensionsTransform (d, d_out, false); + d = d_out; + } + } else if (stok == "L2norm") { + vt_1 = new NormalizationTransform (d, 2.0); + + // coarse quantizers + } else if (!coarse_quantizer && + sscanf (tok, "IVF%ld_HNSW%d", &ncentroids, &M) == 2) { + FAISS_THROW_IF_NOT (metric == METRIC_L2); + coarse_quantizer_1 = new IndexHNSWFlat (d, M); + + } else if (!coarse_quantizer && + sscanf (tok, "IVF%ld", &ncentroids) == 1) { + if (metric == METRIC_L2) { + coarse_quantizer_1 = new IndexFlatL2 (d); + } else { + coarse_quantizer_1 = new IndexFlatIP (d); + } + } else if (!coarse_quantizer && sscanf (tok, "IMI2x%d", &nbit) == 1) { + FAISS_THROW_IF_NOT_MSG (metric == METRIC_L2, + "MultiIndex not implemented for inner prod search"); + coarse_quantizer_1 = new MultiIndexQuantizer (d, 2, nbit); + ncentroids = 1 << (2 * nbit); + + } else if (!coarse_quantizer && + sscanf (tok, "Residual%dx%d", &M, &nbit) == 2) { + FAISS_THROW_IF_NOT_MSG (metric == METRIC_L2, + "MultiIndex not implemented for inner prod search"); + coarse_quantizer_1 = new MultiIndexQuantizer (d, M, nbit); + ncentroids = int64_t(1) << (M * nbit); + use_2layer = true; + + } else if (!coarse_quantizer && + sscanf (tok, "Residual%ld", &ncentroids) == 1) { + coarse_quantizer_1 = new IndexFlatL2 (d); + use_2layer = true; + + } else if (stok == "IDMap") { + add_idmap = true; + + // IVFs + } else if (!index && (stok == "Flat" || stok == "FlatDedup")) { + if 
(coarse_quantizer) { + // if there was an IVF in front, then it is an IVFFlat + IndexIVF *index_ivf = stok == "Flat" ? + new IndexIVFFlat ( + coarse_quantizer, d, ncentroids, metric) : + new IndexIVFFlatDedup ( + coarse_quantizer, d, ncentroids, metric); + index_ivf->quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; + del_coarse_quantizer.release (); + index_ivf->own_fields = true; + index_1 = index_ivf; + } else { + FAISS_THROW_IF_NOT_MSG (stok != "FlatDedup", + "dedup supported only for IVFFlat"); + index_1 = new IndexFlat (d, metric); + } + } else if (!index && (stok == "SQ8" || stok == "SQ4" || stok == "SQ6" || + stok == "SQfp16")) { + ScalarQuantizer::QuantizerType qt = + stok == "SQ8" ? ScalarQuantizer::QT_8bit : + stok == "SQ6" ? ScalarQuantizer::QT_6bit : + stok == "SQ4" ? ScalarQuantizer::QT_4bit : + stok == "SQfp16" ? ScalarQuantizer::QT_fp16 : + ScalarQuantizer::QT_4bit; + if (coarse_quantizer) { + FAISS_THROW_IF_NOT (!use_2layer); + IndexIVFScalarQuantizer *index_ivf = + new IndexIVFScalarQuantizer ( + coarse_quantizer, d, ncentroids, qt, metric); + index_ivf->quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + del_coarse_quantizer.release (); + index_ivf->own_fields = true; + index_1 = index_ivf; + } else { + index_1 = new IndexScalarQuantizer (d, qt, metric); + } + } else if (!index && (stok == "SQ8Hybrid" || stok == "SQ4Hybrid" || stok == "SQ6Hybrid" || + stok == "SQfp16Hybrid")) { + ScalarQuantizer::QuantizerType qt = + stok == "SQ8Hybrid" ? ScalarQuantizer::QT_8bit : + stok == "SQ6Hybrid" ? ScalarQuantizer::QT_6bit : + stok == "SQ4Hybrid" ? ScalarQuantizer::QT_4bit : + stok == "SQfp16Hybrid" ? ScalarQuantizer::QT_fp16 : + ScalarQuantizer::QT_4bit; + FAISS_THROW_IF_NOT_MSG(coarse_quantizer, + "SQ Hybrid only with an IVF"); + FAISS_THROW_IF_NOT (!use_2layer); + IndexIVFSQHybrid *index_ivf = + new IndexIVFSQHybrid ( + coarse_quantizer, d, ncentroids, qt, metric); + index_ivf->quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + del_coarse_quantizer.release (); + index_ivf->own_fields = true; + index_1 = index_ivf; + } else if (!index && sscanf (tok, "PQ%d+%d", &M, &M2) == 2) { + FAISS_THROW_IF_NOT_MSG(coarse_quantizer, + "PQ with + works only with an IVF"); + FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, + "IVFPQR not implemented for inner product search"); + IndexIVFPQR *index_ivf = new IndexIVFPQR ( + coarse_quantizer, d, ncentroids, M, 8, M2, 8); + index_ivf->quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + del_coarse_quantizer.release (); + index_ivf->own_fields = true; + index_1 = index_ivf; + } else if (!index && (sscanf (tok, "PQ%dx%d", &M, &nbit) == 2 || + sscanf (tok, "PQ%d", &M) == 1 || + sscanf (tok, "PQ%dnp", &M) == 1)) { + bool do_polysemous_training = stok.find("np") == std::string::npos; + if (coarse_quantizer) { + if (!use_2layer) { + IndexIVFPQ *index_ivf = new IndexIVFPQ ( + coarse_quantizer, d, ncentroids, M, nbit); + index_ivf->quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + index_ivf->metric_type = metric; + index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; + del_coarse_quantizer.release (); + index_ivf->own_fields = true; + index_ivf->do_polysemous_training = do_polysemous_training; + index_1 = index_ivf; + } else { + Index2Layer *index_2l = new Index2Layer + (coarse_quantizer, ncentroids, M, nbit); + index_2l->q1.quantizer_trains_alone = + get_trains_alone (coarse_quantizer); + index_2l->q1.own_fields = true; + 
index_1 = index_2l; + } + } else { + IndexPQ *index_pq = new IndexPQ (d, M, nbit, metric); + index_pq->do_polysemous_training = do_polysemous_training; + index_1 = index_pq; + } + } else if (!index && + sscanf (tok, "HNSW%d_%d+PQ%d", &M, &ncent, &pq_m) == 3) { + Index * quant = new IndexFlatL2 (d); + IndexHNSW2Level * hidx2l = new IndexHNSW2Level (quant, ncent, pq_m, M); + Index2Layer * idx2l = dynamic_cast(hidx2l->storage); + idx2l->q1.own_fields = true; + index_1 = hidx2l; + } else if (!index && + sscanf (tok, "HNSW%d_2x%d+PQ%d", &M, &nbit, &pq_m) == 3) { + Index * quant = new MultiIndexQuantizer (d, 2, nbit); + IndexHNSW2Level * hidx2l = + new IndexHNSW2Level (quant, 1 << (2 * nbit), pq_m, M); + Index2Layer * idx2l = dynamic_cast(hidx2l->storage); + idx2l->q1.own_fields = true; + idx2l->q1.quantizer_trains_alone = 1; + index_1 = hidx2l; + } else if (!index && + sscanf (tok, "HNSW%d_PQ%d", &M, &pq_m) == 2) { + index_1 = new IndexHNSWPQ (d, pq_m, M); + } else if (!index && + sscanf (tok, "HNSW%d", &M) == 1) { + index_1 = new IndexHNSWFlat (d, M); + } else if (!index && + sscanf (tok, "HNSW%d_SQ%d", &M, &pq_m) == 2 && + pq_m == 8) { + index_1 = new IndexHNSWSQ (d, ScalarQuantizer::QT_8bit, M); + } else if (!index && (stok == "LSH" || stok == "LSHr" || + stok == "LSHrt" || stok == "LSHt")) { + bool rotate_data = strstr(tok, "r") != nullptr; + bool train_thresholds = strstr(tok, "t") != nullptr; + index_1 = new IndexLSH (d, d, rotate_data, train_thresholds); + } else if (!index && + sscanf (tok, "ZnLattice%dx%d_%d", &M, &r2, &nbit) == 3) { + FAISS_THROW_IF_NOT(!coarse_quantizer); + index_1 = new IndexLattice(d, M, nbit, r2); + } else if (stok == "RFlat") { + make_IndexRefineFlat = true; + } else { + FAISS_THROW_FMT( "could not parse token \"%s\" in %s\n", + tok, description_in); + } + + if (index_1 && add_idmap) { + IndexIDMap *idmap = new IndexIDMap(index_1); + del_index.set (idmap); + idmap->own_fields = true; + index_1 = idmap; + add_idmap = false; + } + + if (vt_1) { + vts.chain.push_back (vt_1); + } + + if (coarse_quantizer_1) { + coarse_quantizer = coarse_quantizer_1; + del_coarse_quantizer.set (coarse_quantizer); + } + + if (index_1) { + index = index_1; + del_index.set (index); + } + } + + FAISS_THROW_IF_NOT_FMT(index, "description %s did not generate an index", + description_in); + + // nothing can go wrong now + del_index.release (); + del_coarse_quantizer.release (); + + if (add_idmap) { + fprintf(stderr, "index_factory: WARNING: " + "IDMap option not used\n"); + } + + if (vts.chain.size() > 0) { + IndexPreTransform *index_pt = new IndexPreTransform (index); + index_pt->own_fields = true; + // add from back + while (vts.chain.size() > 0) { + index_pt->prepend_transform (vts.chain.back ()); + vts.chain.pop_back (); + } + index = index_pt; + } + + if (make_IndexRefineFlat) { + IndexRefineFlat *index_rf = new IndexRefineFlat (index); + index_rf->own_fields = true; + index = index_rf; + } + + return index; +} + +IndexBinary *index_binary_factory(int d, const char *description) +{ + IndexBinary *index = nullptr; + + int ncentroids = -1; + int M; + + if (sscanf(description, "BIVF%d_HNSW%d", &ncentroids, &M) == 2) { + IndexBinaryIVF *index_ivf = new IndexBinaryIVF( + new IndexBinaryHNSW(d, M), d, ncentroids + ); + index_ivf->own_fields = true; + index = index_ivf; + + } else if (sscanf(description, "BIVF%d", &ncentroids) == 1) { + IndexBinaryIVF *index_ivf = new IndexBinaryIVF( + new IndexBinaryFlat(d), d, ncentroids + ); + index_ivf->own_fields = true; + index = index_ivf; + + } else if 
(sscanf(description, "BHNSW%d", &M) == 1) { + IndexBinaryHNSW *index_hnsw = new IndexBinaryHNSW(d, M); + index = index_hnsw; + + } else if (std::string(description) == "BFlat") { + index = new IndexBinaryFlat(d); + + } else { + FAISS_THROW_IF_NOT_FMT(index, "description %s did not generate an index", + description); + } + + return index; +} + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/index_factory.h b/core/src/index/thirdparty/faiss/index_factory.h new file mode 100644 index 0000000000..005a53c7fa --- /dev/null +++ b/core/src/index/thirdparty/faiss/index_factory.h @@ -0,0 +1,25 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include +#include + +namespace faiss { + +/** Build and index with the sequence of processing steps described in + * the string. */ +Index *index_factory (int d, const char *description, + MetricType metric = METRIC_L2); + +IndexBinary *index_binary_factory (int d, const char *description); + + +} diff --git a/core/src/index/thirdparty/faiss/index_io.h b/core/src/index/thirdparty/faiss/index_io.h new file mode 100644 index 0000000000..5aef62c87b --- /dev/null +++ b/core/src/index/thirdparty/faiss/index_io.h @@ -0,0 +1,75 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +// I/O code for indexes + +#ifndef FAISS_INDEX_IO_H +#define FAISS_INDEX_IO_H + + +#include + +/** I/O functions can read/write to a filename, a file handle or to an + * object that abstracts the medium. + * + * The read functions return objects that should be deallocated with + * delete. All references within these objectes are owned by the + * object. + */ + +namespace faiss { + +struct Index; +struct IndexBinary; +struct VectorTransform; +struct ProductQuantizer; +struct IOReader; +struct IOWriter; +struct InvertedLists; + +void write_index (const Index *idx, const char *fname); +void write_index (const Index *idx, FILE *f); +void write_index (const Index *idx, IOWriter *writer); + +void write_index_binary (const IndexBinary *idx, const char *fname); +void write_index_binary (const IndexBinary *idx, FILE *f); +void write_index_binary (const IndexBinary *idx, IOWriter *writer); + +// The read_index flags are implemented only for a subset of index types. 
+const int IO_FLAG_MMAP = 1; // try to memmap if possible +const int IO_FLAG_READ_ONLY = 2; +// strip directory component from ondisk filename, and assume it's in +// the same directory as the index file +const int IO_FLAG_ONDISK_SAME_DIR = 4; + +Index *read_index (const char *fname, int io_flags = 0); +Index *read_index (FILE * f, int io_flags = 0); +Index *read_index (IOReader *reader, int io_flags = 0); + +IndexBinary *read_index_binary (const char *fname, int io_flags = 0); +IndexBinary *read_index_binary (FILE * f, int io_flags = 0); +IndexBinary *read_index_binary (IOReader *reader, int io_flags = 0); + +void write_VectorTransform (const VectorTransform *vt, const char *fname); +VectorTransform *read_VectorTransform (const char *fname); + +ProductQuantizer * read_ProductQuantizer (const char*fname); +ProductQuantizer * read_ProductQuantizer (IOReader *reader); + +void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname); +void write_ProductQuantizer (const ProductQuantizer*pq, IOWriter *f); + +void write_InvertedLists (const InvertedLists *ils, IOWriter *f); +InvertedLists *read_InvertedLists (IOReader *reader, int io_flags = 0); + + +} // namespace faiss + + +#endif diff --git a/core/src/index/thirdparty/faiss/makefile.inc.in b/core/src/index/thirdparty/faiss/makefile.inc.in new file mode 100644 index 0000000000..2aaaf3cd19 --- /dev/null +++ b/core/src/index/thirdparty/faiss/makefile.inc.in @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@ +CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@ +CPUFLAGS = -mavx2 -mf16c @ARCH_CPUFLAGS@ +LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@ +LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@ +PYTHONCFLAGS = @PYTHON_CFLAGS@ -I@NUMPY_INCLUDE@ +SWIGFLAGS = -DSWIGWORDSIZE64 + +NVCC = @NVCC@ +CUDA_ROOT = @CUDA_PREFIX@ +CUDA_ARCH = @CUDA_ARCH@ +NVCCFLAGS = -I $(CUDA_ROOT)/targets/x86_64-linux/include/ \ +-O0 -g \ +-Xcompiler -fPIC \ +-Xcudafe --diag_suppress=unrecognized_attribute \ +$(CUDA_ARCH) \ +-lineinfo \ +-ccbin $(CXX) -DFAISS_USE_FLOAT16 + +OS = $(shell uname -s) + +SHAREDEXT = so +SHAREDFLAGS = -shared + +ifeq ($(OS),Darwin) + SHAREDEXT = dylib + SHAREDFLAGS = -dynamiclib -undefined dynamic_lookup + SWIGFLAGS = +endif + +MKDIR_P = @MKDIR_P@ +PYTHON = @PYTHON@ +SWIG = @SWIG@ +AR ?= ar + +prefix ?= @prefix@ +exec_prefix ?= @exec_prefix@ +libdir = @libdir@ +includedir = @includedir@ diff --git a/core/src/index/thirdparty/faiss/misc/test_blas.cpp b/core/src/index/thirdparty/faiss/misc/test_blas.cpp new file mode 100644 index 0000000000..be2536497e --- /dev/null +++ b/core/src/index/thirdparty/faiss/misc/test_blas.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#undef FINTEGER +#define FINTEGER long + + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, FINTEGER * + ldb, float *beta, float *c, FINTEGER *ldc); + +/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */ + +int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda, + float *tau, float *work, FINTEGER *lwork, FINTEGER *info); + +} + +float *new_random_vec(int size) +{ + float *x = new float[size]; + for (int i = 0; i < size; i++) + x[i] = drand48(); + return x; +} + + +int main() { + + FINTEGER m = 10, n = 20, k = 30; + float *a = new_random_vec(m * k), *b = new_random_vec(n * k), *c = new float[n * m]; + float one = 1.0, zero = 0.0; + + printf("BLAS test\n"); + + sgemm_("Not transposed", "Not transposed", + &m, &n, &k, &one, a, &m, b, &k, &zero, c, &m); + + printf("errors=\n"); + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float accu = 0; + for (int l = 0; l < k; l++) + accu += a[i + l * m] * b[l + j * k]; + printf ("%6.3f ", accu - c[i + j * m]); + } + printf("\n"); + } + + long info = 0x64bL << 32; + long mi = 0x64bL << 32 | m; + float *tau = new float[m]; + FINTEGER lwork = -1; + + float work1; + + printf("Intentional Lapack error (appears only for 64-bit INTEGER):\n"); + sgeqrf_ (&mi, &n, c, &m, tau, &work1, &lwork, (FINTEGER*)&info); + + // sgeqrf_ (&m, &n, c, &zeroi, tau, &work1, &lwork, (FINTEGER*)&info); + printf("info=%016lx\n", info); + + if(info >> 32 == 0x64b) { + printf("Lapack uses 32-bit integers\n"); + } else { + printf("Lapack uses 64-bit integers\n"); + } + + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/python/Makefile b/core/src/index/thirdparty/faiss/python/Makefile new file mode 100644 index 0000000000..2836568253 --- /dev/null +++ b/core/src/index/thirdparty/faiss/python/Makefile @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include ../makefile.inc + +ifneq ($(strip $(NVCC)),) + SWIGFLAGS += -DGPU_WRAPPER +endif + +all: build + +# Also silently generates swigfaiss.py. +swigfaiss.cpp: swigfaiss.swig ../libfaiss.a + $(SWIG) -python -c++ -Doverride= -I../ $(SWIGFLAGS) -o $@ $< + +swigfaiss_avx2.cpp: swigfaiss.swig ../libfaiss.a + $(SWIG) -python -c++ -Doverride= -module swigfaiss_avx2 -I../ $(SWIGFLAGS) -o $@ $< + +%.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) $(PYTHONCFLAGS) \ + -I../ -c $< -o $@ + +# Extension is .so even on OSX. +_%.so: %.o ../libfaiss.a + $(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) + +build: _swigfaiss.so faiss.py + $(PYTHON) setup.py build + +install: build + $(PYTHON) setup.py install + +clean: + rm -f swigfaiss*.cpp swigfaiss*.o swigfaiss*.py _swigfaiss*.so + rm -rf build/ + +.PHONY: all build clean install diff --git a/core/src/index/thirdparty/faiss/python/faiss.py b/core/src/index/thirdparty/faiss/python/faiss.py new file mode 100644 index 0000000000..3adbbf4a87 --- /dev/null +++ b/core/src/index/thirdparty/faiss/python/faiss.py @@ -0,0 +1,718 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#@nolint + +# not linting this file because it imports * form swigfaiss, which +# causes a ton of useless warnings. + +from __future__ import print_function + +import numpy as np +import sys +import inspect +import pdb +import platform +import subprocess + + +def instruction_set(): + if platform.system() == "Darwin": + if subprocess.check_output(["/usr/sbin/sysctl", "hw.optional.avx2_0"])[-1] == '1': + return "AVX2" + else: + return "default" + elif platform.system() == "Linux": + import numpy.distutils.cpuinfo + if "avx2" in numpy.distutils.cpuinfo.cpu.info[0]['flags']: + return "AVX2" + else: + return "default" + + +try: + instr_set = instruction_set() + if instr_set == "AVX2": + print("Loading faiss with AVX2 support.", file=sys.stderr) + from .swigfaiss_avx2 import * + else: + print("Loading faiss.", file=sys.stderr) + from .swigfaiss import * + +except ImportError: + # we import * so that the symbol X can be accessed as faiss.X + print("Loading faiss.", file=sys.stderr) + from .swigfaiss import * + + +__version__ = "%d.%d.%d" % (FAISS_VERSION_MAJOR, + FAISS_VERSION_MINOR, + FAISS_VERSION_PATCH) + +################################################################## +# The functions below add or replace some methods for classes +# this is to be able to pass in numpy arrays directly +# The C++ version of the classnames will be suffixed with _c +################################################################## + + +def replace_method(the_class, name, replacement, ignore_missing=False): + try: + orig_method = getattr(the_class, name) + except AttributeError: + if ignore_missing: + return + raise + if orig_method.__name__ == 'replacement_' + name: + # replacement was done in parent class + return + setattr(the_class, name + '_c', orig_method) + setattr(the_class, name, replacement) + + +def handle_Clustering(): + def replacement_train(self, x, index): + assert x.flags.contiguous + n, d = x.shape + assert d == self.d + self.train_c(n, swig_ptr(x), index) + replace_method(Clustering, 'train', replacement_train) + + +handle_Clustering() + + +def handle_Quantizer(the_class): + + def replacement_train(self, x): + n, d = x.shape + assert d == self.d + self.train_c(n, swig_ptr(x)) + + def replacement_compute_codes(self, x): + n, d = x.shape + assert d == self.d + codes = np.empty((n, self.code_size), dtype='uint8') + self.compute_codes_c(swig_ptr(x), swig_ptr(codes), n) + return codes + + def replacement_decode(self, codes): + n, cs = codes.shape + assert cs == self.code_size + x = np.empty((n, self.d), dtype='float32') + self.decode_c(swig_ptr(codes), swig_ptr(x), n) + return x + + replace_method(the_class, 'train', replacement_train) + replace_method(the_class, 'compute_codes', replacement_compute_codes) + replace_method(the_class, 'decode', replacement_decode) + + +handle_Quantizer(ProductQuantizer) +handle_Quantizer(ScalarQuantizer) + + +def handle_Index(the_class): + + def replacement_add(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d == self.d + self.add_c(n, swig_ptr(x)) + + def replacement_add_with_ids(self, x, ids): + n, d = x.shape + assert d == self.d + assert ids.shape == (n, ), 'not same nb of vectors as ids' + self.add_with_ids_c(n, swig_ptr(x), swig_ptr(ids)) + + def replacement_assign(self, x, k): + n, d = x.shape + assert d == self.d + labels = np.empty((n, k), dtype=np.int64) + self.assign_c(n, swig_ptr(x), swig_ptr(labels), k) + return labels + + def replacement_train(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d == self.d + 
self.train_c(n, swig_ptr(x)) + + def replacement_search(self, x, k): + n, d = x.shape + assert d == self.d + distances = np.empty((n, k), dtype=np.float32) + labels = np.empty((n, k), dtype=np.int64) + self.search_c(n, swig_ptr(x), + k, swig_ptr(distances), + swig_ptr(labels)) + return distances, labels + + def replacement_search_and_reconstruct(self, x, k): + n, d = x.shape + assert d == self.d + distances = np.empty((n, k), dtype=np.float32) + labels = np.empty((n, k), dtype=np.int64) + recons = np.empty((n, k, d), dtype=np.float32) + self.search_and_reconstruct_c(n, swig_ptr(x), + k, swig_ptr(distances), + swig_ptr(labels), + swig_ptr(recons)) + return distances, labels, recons + + def replacement_remove_ids(self, x): + if isinstance(x, IDSelector): + sel = x + else: + assert x.ndim == 1 + sel = IDSelectorBatch(x.size, swig_ptr(x)) + return self.remove_ids_c(sel) + + def replacement_reconstruct(self, key): + x = np.empty(self.d, dtype=np.float32) + self.reconstruct_c(key, swig_ptr(x)) + return x + + def replacement_reconstruct_n(self, n0, ni): + x = np.empty((ni, self.d), dtype=np.float32) + self.reconstruct_n_c(n0, ni, swig_ptr(x)) + return x + + def replacement_update_vectors(self, keys, x): + n = keys.size + assert keys.shape == (n, ) + assert x.shape == (n, self.d) + self.update_vectors_c(n, swig_ptr(keys), swig_ptr(x)) + + def replacement_range_search(self, x, thresh): + n, d = x.shape + assert d == self.d + res = RangeSearchResult(n) + self.range_search_c(n, swig_ptr(x), thresh, res) + # get pointers and copy them + lims = rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = rev_swig_ptr(res.distances, nd).copy() + I = rev_swig_ptr(res.labels, nd).copy() + return lims, D, I + + def replacement_sa_encode(self, x): + n, d = x.shape + assert d == self.d + codes = np.empty((n, self.sa_code_size()), dtype='uint8') + self.sa_encode_c(n, swig_ptr(x), swig_ptr(codes)) + return codes + + def replacement_sa_decode(self, codes): + n, cs = codes.shape + assert cs == self.sa_code_size() + x = np.empty((n, self.d), dtype='float32') + self.sa_decode_c(n, swig_ptr(codes), swig_ptr(x)) + return x + + replace_method(the_class, 'add', replacement_add) + replace_method(the_class, 'add_with_ids', replacement_add_with_ids) + replace_method(the_class, 'assign', replacement_assign) + replace_method(the_class, 'train', replacement_train) + replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'remove_ids', replacement_remove_ids) + replace_method(the_class, 'reconstruct', replacement_reconstruct) + replace_method(the_class, 'reconstruct_n', replacement_reconstruct_n) + replace_method(the_class, 'range_search', replacement_range_search) + replace_method(the_class, 'update_vectors', replacement_update_vectors, + ignore_missing=True) + replace_method(the_class, 'search_and_reconstruct', + replacement_search_and_reconstruct, ignore_missing=True) + replace_method(the_class, 'sa_encode', replacement_sa_encode) + replace_method(the_class, 'sa_decode', replacement_sa_decode) + +def handle_IndexBinary(the_class): + + def replacement_add(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d * 8 == self.d + self.add_c(n, swig_ptr(x)) + + def replacement_add_with_ids(self, x, ids): + n, d = x.shape + assert d * 8 == self.d + assert ids.shape == (n, ), 'not same nb of vectors as ids' + self.add_with_ids_c(n, swig_ptr(x), swig_ptr(ids)) + + def replacement_train(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d * 8 == self.d + self.train_c(n, 
swig_ptr(x)) + + def replacement_reconstruct(self, key): + x = np.empty(self.d // 8, dtype=np.uint8) + self.reconstruct_c(key, swig_ptr(x)) + return x + + def replacement_search(self, x, k): + n, d = x.shape + assert d * 8 == self.d + distances = np.empty((n, k), dtype=np.int32) + labels = np.empty((n, k), dtype=np.int64) + self.search_c(n, swig_ptr(x), + k, swig_ptr(distances), + swig_ptr(labels)) + return distances, labels + + def replacement_remove_ids(self, x): + if isinstance(x, IDSelector): + sel = x + else: + assert x.ndim == 1 + sel = IDSelectorBatch(x.size, swig_ptr(x)) + return self.remove_ids_c(sel) + + replace_method(the_class, 'add', replacement_add) + replace_method(the_class, 'add_with_ids', replacement_add_with_ids) + replace_method(the_class, 'train', replacement_train) + replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'reconstruct', replacement_reconstruct) + replace_method(the_class, 'remove_ids', replacement_remove_ids) + + +def handle_VectorTransform(the_class): + + def apply_method(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d == self.d_in + y = np.empty((n, self.d_out), dtype=np.float32) + self.apply_noalloc(n, swig_ptr(x), swig_ptr(y)) + return y + + def replacement_reverse_transform(self, x): + n, d = x.shape + assert d == self.d_out + y = np.empty((n, self.d_in), dtype=np.float32) + self.reverse_transform_c(n, swig_ptr(x), swig_ptr(y)) + return y + + def replacement_vt_train(self, x): + assert x.flags.contiguous + n, d = x.shape + assert d == self.d_in + self.train_c(n, swig_ptr(x)) + + replace_method(the_class, 'train', replacement_vt_train) + # apply is reserved in Pyton... + the_class.apply_py = apply_method + replace_method(the_class, 'reverse_transform', + replacement_reverse_transform) + + +def handle_AutoTuneCriterion(the_class): + def replacement_set_groundtruth(self, D, I): + if D: + assert I.shape == D.shape + self.nq, self.gt_nnn = I.shape + self.set_groundtruth_c( + self.gt_nnn, swig_ptr(D) if D else None, swig_ptr(I)) + + def replacement_evaluate(self, D, I): + assert I.shape == D.shape + assert I.shape == (self.nq, self.nnn) + return self.evaluate_c(swig_ptr(D), swig_ptr(I)) + + replace_method(the_class, 'set_groundtruth', replacement_set_groundtruth) + replace_method(the_class, 'evaluate', replacement_evaluate) + + +def handle_ParameterSpace(the_class): + def replacement_explore(self, index, xq, crit): + assert xq.shape == (crit.nq, index.d) + ops = OperatingPoints() + self.explore_c(index, crit.nq, swig_ptr(xq), + crit, ops) + return ops + replace_method(the_class, 'explore', replacement_explore) + + +def handle_MatrixStats(the_class): + original_init = the_class.__init__ + + def replacement_init(self, m): + assert len(m.shape) == 2 + original_init(self, m.shape[0], m.shape[1], swig_ptr(m)) + + the_class.__init__ = replacement_init + +handle_MatrixStats(MatrixStats) + + +this_module = sys.modules[__name__] + + +for symbol in dir(this_module): + obj = getattr(this_module, symbol) + # print symbol, isinstance(obj, (type, types.ClassType)) + if inspect.isclass(obj): + the_class = obj + if issubclass(the_class, Index): + handle_Index(the_class) + + if issubclass(the_class, IndexBinary): + handle_IndexBinary(the_class) + + if issubclass(the_class, VectorTransform): + handle_VectorTransform(the_class) + + if issubclass(the_class, AutoTuneCriterion): + handle_AutoTuneCriterion(the_class) + + if issubclass(the_class, ParameterSpace): + handle_ParameterSpace(the_class) + + 
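The handlers above are applied to every wrapped `Index`, `IndexBinary`, `VectorTransform`, `AutoTuneCriterion` and `ParameterSpace` subclass at import time by the `dir(this_module)` loop, so users call the numpy-facing methods and never the raw SWIG ones. A minimal sketch of the resulting behaviour (illustrative only; the index type and array sizes are arbitrary):

```python
# Illustrative sketch of the numpy-friendly API produced by the replace_method
# wrappers above; IndexFlatL2 and the shapes are just an example.
import numpy as np
import faiss

d, nb, nq, k = 64, 1000, 10, 5
xb = np.random.rand(nb, d).astype('float32')
xq = np.random.rand(nq, d).astype('float32')

index = faiss.IndexFlatL2(d)
index.add(xb)                    # replacement_add: shape check, then add_c(n, swig_ptr(xb))
D, I = index.search(xq, k)       # replacement_search: allocates float32 D and int64 I

assert D.shape == (nq, k) and I.dtype == np.int64
orig_add = index.add_c           # the original SWIG method stays reachable under the *_c suffix
```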
+########################################### +# Add Python references to objects +# we do this at the Python class wrapper level. +########################################### + +def add_ref_in_constructor(the_class, parameter_no): + # adds a reference to parameter parameter_no in self + # so that that parameter does not get deallocated before self + original_init = the_class.__init__ + + def replacement_init(self, *args): + original_init(self, *args) + self.referenced_objects = [args[parameter_no]] + + def replacement_init_multiple(self, *args): + original_init(self, *args) + pset = parameter_no[len(args)] + self.referenced_objects = [args[no] for no in pset] + + if type(parameter_no) == dict: + # a list of parameters to keep, depending on the number of arguments + the_class.__init__ = replacement_init_multiple + else: + the_class.__init__ = replacement_init + +def add_ref_in_method(the_class, method_name, parameter_no): + original_method = getattr(the_class, method_name) + def replacement_method(self, *args): + ref = args[parameter_no] + if not hasattr(self, 'referenced_objects'): + self.referenced_objects = [ref] + else: + self.referenced_objects.append(ref) + return original_method(self, *args) + setattr(the_class, method_name, replacement_method) + +def add_ref_in_function(function_name, parameter_no): + # assumes the function returns an object + original_function = getattr(this_module, function_name) + def replacement_function(*args): + result = original_function(*args) + ref = args[parameter_no] + result.referenced_objects = [ref] + return result + setattr(this_module, function_name, replacement_function) + +add_ref_in_constructor(IndexIVFFlat, 0) +add_ref_in_constructor(IndexIVFFlatDedup, 0) +add_ref_in_constructor(IndexPreTransform, {2: [0, 1], 1: [0]}) +add_ref_in_method(IndexPreTransform, 'prepend_transform', 0) +add_ref_in_constructor(IndexIVFPQ, 0) +add_ref_in_constructor(IndexIVFPQR, 0) +add_ref_in_constructor(Index2Layer, 0) +add_ref_in_constructor(Level1Quantizer, 0) +add_ref_in_constructor(IndexIVFScalarQuantizer, 0) +add_ref_in_constructor(IndexIDMap, 0) +add_ref_in_constructor(IndexIDMap2, 0) +add_ref_in_constructor(IndexHNSW, 0) +add_ref_in_method(IndexShards, 'add_shard', 0) +add_ref_in_method(IndexBinaryShards, 'add_shard', 0) +add_ref_in_constructor(IndexRefineFlat, 0) +add_ref_in_constructor(IndexBinaryIVF, 0) +add_ref_in_constructor(IndexBinaryFromFloat, 0) +add_ref_in_constructor(IndexBinaryIDMap, 0) +add_ref_in_constructor(IndexBinaryIDMap2, 0) + +add_ref_in_method(IndexReplicas, 'addIndex', 0) +add_ref_in_method(IndexBinaryReplicas, 'addIndex', 0) + +# seems really marginal... +# remove_ref_from_method(IndexReplicas, 'removeIndex', 0) + +if hasattr(this_module, 'GpuIndexFlat'): + # handle all the GPUResources refs + add_ref_in_function('index_cpu_to_gpu', 0) + add_ref_in_constructor(GpuIndexFlat, 0) + add_ref_in_constructor(GpuIndexFlatIP, 0) + add_ref_in_constructor(GpuIndexFlatL2, 0) + add_ref_in_constructor(GpuIndexIVFFlat, 0) + add_ref_in_constructor(GpuIndexIVFScalarQuantizer, 0) + add_ref_in_constructor(GpuIndexIVFPQ, 0) + add_ref_in_constructor(GpuIndexBinaryFlat, 0) + + + +########################################### +# GPU functions +########################################### + + +def index_cpu_to_gpu_multiple_py(resources, index, co=None): + """builds the C++ vectors for the GPU indices and the + resources. 
Handles the common case where the resources are assigned to + the first len(resources) GPUs""" + vres = GpuResourcesVector() + vdev = IntVector() + for i, res in enumerate(resources): + vdev.push_back(i) + vres.push_back(res) + index = index_cpu_to_gpu_multiple(vres, vdev, index, co) + index.referenced_objects = resources + return index + +def index_cpu_to_all_gpus(index, co=None, ngpu=-1): + if ngpu == -1: + ngpu = get_num_gpus() + res = [StandardGpuResources() for i in range(ngpu)] + index2 = index_cpu_to_gpu_multiple_py(res, index, co) + return index2 + + +########################################### +# numpy array / std::vector conversions +########################################### + +# mapping from vector names in swigfaiss.swig and the numpy dtype names +vector_name_map = { + 'Float': 'float32', + 'Byte': 'uint8', + 'Char': 'int8', + 'Uint64': 'uint64', + 'Long': 'int64', + 'Int': 'int32', + 'Double': 'float64' + } + +def vector_to_array(v): + """ convert a C++ vector to a numpy array """ + classname = v.__class__.__name__ + assert classname.endswith('Vector') + dtype = np.dtype(vector_name_map[classname[:-6]]) + a = np.empty(v.size(), dtype=dtype) + if v.size() > 0: + memcpy(swig_ptr(a), v.data(), a.nbytes) + return a + + +def vector_float_to_array(v): + return vector_to_array(v) + + +def copy_array_to_vector(a, v): + """ copy a numpy array to a vector """ + n, = a.shape + classname = v.__class__.__name__ + assert classname.endswith('Vector') + dtype = np.dtype(vector_name_map[classname[:-6]]) + assert dtype == a.dtype, ( + 'cannot copy a %s array to a %s (should be %s)' % ( + a.dtype, classname, dtype)) + v.resize(n) + if n > 0: + memcpy(v.data(), swig_ptr(a), a.nbytes) + + +########################################### +# Wrapper for a few functions +########################################### + +def kmin(array, k): + """return k smallest values (and their indices) of the lines of a + float32 array""" + m, n = array.shape + I = np.zeros((m, k), dtype='int64') + D = np.zeros((m, k), dtype='float32') + ha = float_maxheap_array_t() + ha.ids = swig_ptr(I) + ha.val = swig_ptr(D) + ha.nh = m + ha.k = k + ha.heapify() + ha.addn(n, swig_ptr(array)) + ha.reorder() + return D, I + + +def kmax(array, k): + """return k largest values (and their indices) of the lines of a + float32 array""" + m, n = array.shape + I = np.zeros((m, k), dtype='int64') + D = np.zeros((m, k), dtype='float32') + ha = float_minheap_array_t() + ha.ids = swig_ptr(I) + ha.val = swig_ptr(D) + ha.nh = m + ha.k = k + ha.heapify() + ha.addn(n, swig_ptr(array)) + ha.reorder() + return D, I + + +def pairwise_distances(xq, xb, mt=METRIC_L2, metric_arg=0): + """compute the whole pairwise distance matrix between two sets of + vectors""" + nq, d = xq.shape + nb, d2 = xb.shape + assert d == d2 + dis = np.empty((nq, nb), dtype='float32') + if mt == METRIC_L2: + pairwise_L2sqr( + d, nq, swig_ptr(xq), + nb, swig_ptr(xb), + swig_ptr(dis)) + else: + pairwise_extra_distances( + d, nq, swig_ptr(xq), + nb, swig_ptr(xb), + mt, metric_arg, + swig_ptr(dis)) + return dis + + + + +def rand(n, seed=12345): + res = np.empty(n, dtype='float32') + float_rand(swig_ptr(res), res.size, seed) + return res + + +def randint(n, seed=12345, vmax=None): + res = np.empty(n, dtype='int64') + if vmax is None: + int64_rand(swig_ptr(res), res.size, seed) + else: + int64_rand_max(swig_ptr(res), res.size, vmax, seed) + return res + +lrand = randint + +def randn(n, seed=12345): + res = np.empty(n, dtype='float32') + float_randn(swig_ptr(res), res.size, seed) + 
return res + + +def eval_intersection(I1, I2): + """ size of intersection between each line of two result tables""" + n = I1.shape[0] + assert I2.shape[0] == n + k1, k2 = I1.shape[1], I2.shape[1] + ninter = 0 + for i in range(n): + ninter += ranklist_intersection_size( + k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i])) + return ninter + + +def normalize_L2(x): + fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x)) + +# MapLong2Long interface + +def replacement_map_add(self, keys, vals): + n, = keys.shape + assert (n,) == keys.shape + self.add_c(n, swig_ptr(keys), swig_ptr(vals)) + +def replacement_map_search_multiple(self, keys): + n, = keys.shape + vals = np.empty(n, dtype='int64') + self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals)) + return vals + +replace_method(MapLong2Long, 'add', replacement_map_add) +replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple) + + +########################################### +# Kmeans object +########################################### + + +class Kmeans: + """shallow wrapper around the Clustering object. The important method + is train().""" + + def __init__(self, d, k, **kwargs): + """d: input dimension, k: nb of centroids. Additional + parameters are passed on the ClusteringParameters object, + including niter=25, verbose=False, spherical = False + """ + self.d = d + self.k = k + self.gpu = False + self.cp = ClusteringParameters() + for k, v in kwargs.items(): + if k == 'gpu': + self.gpu = v + else: + # if this raises an exception, it means that it is a non-existent field + getattr(self.cp, k) + setattr(self.cp, k, v) + self.centroids = None + + def train(self, x): + n, d = x.shape + assert d == self.d + clus = Clustering(d, self.k, self.cp) + if self.cp.spherical: + self.index = IndexFlatIP(d) + else: + self.index = IndexFlatL2(d) + if self.gpu: + if self.gpu == True: + ngpu = -1 + else: + ngpu = self.gpu + self.index = index_cpu_to_all_gpus(self.index, ngpu=ngpu) + clus.train(x, self.index) + centroids = vector_float_to_array(clus.centroids) + self.centroids = centroids.reshape(self.k, d) + self.obj = vector_float_to_array(clus.obj) + return self.obj[-1] if self.obj.size > 0 else 0.0 + + def assign(self, x): + assert self.centroids is not None, "should train before assigning" + self.index.reset() + self.index.add(self.centroids) + D, I = self.index.search(x, 1) + return D.ravel(), I.ravel() + +# IndexProxy was renamed to IndexReplicas, remap the old name for any old code +# people may have +IndexProxy = IndexReplicas +ConcatenatedInvertedLists = HStackInvertedLists + +########################################### +# serialization of indexes to byte arrays +########################################### + +def serialize_index(index): + """ convert an index to a numpy uint8 array """ + writer = VectorIOWriter() + write_index(index, writer) + return vector_to_array(writer.data) + +def deserialize_index(data): + reader = VectorIOReader() + copy_array_to_vector(data, reader.data) + return read_index(reader) diff --git a/core/src/index/thirdparty/faiss/python/setup.py b/core/src/index/thirdparty/faiss/python/setup.py new file mode 100644 index 0000000000..592f3730ea --- /dev/null +++ b/core/src/index/thirdparty/faiss/python/setup.py @@ -0,0 +1,50 @@ +from __future__ import print_function +from setuptools import setup, find_packages +import os +import shutil + +here = os.path.abspath(os.path.dirname(__file__)) + +check_fpath = os.path.join("_swigfaiss.so") +if not os.path.exists(check_fpath): + print("Could not find 
{}".format(check_fpath)) + print("Have you run `make` and `make -C python`?") + +# make the faiss python package dir +shutil.rmtree("faiss", ignore_errors=True) +os.mkdir("faiss") +shutil.copyfile("faiss.py", "faiss/__init__.py") +shutil.copyfile("swigfaiss.py", "faiss/swigfaiss.py") +shutil.copyfile("_swigfaiss.so", "faiss/_swigfaiss.so") +try: + shutil.copyfile("swigfaiss_avx2.py", "faiss/swigfaiss_avx2.py") + shutil.copyfile("_swigfaiss_avx2.so", "faiss/_swigfaiss_avx2.so") +except: + pass + +long_description=""" +Faiss is a library for efficient similarity search and clustering of dense +vectors. It contains algorithms that search in sets of vectors of any size, + up to ones that possibly do not fit in RAM. It also contains supporting +code for evaluation and parameter tuning. Faiss is written in C++ with +complete wrappers for Python/numpy. Some of the most useful algorithms +are implemented on the GPU. It is developed by Facebook AI Research. +""" +setup( + name='faiss', + version='1.6.0', + description='A library for efficient similarity search and clustering of dense vectors', + long_description=long_description, + url='https://github.com/facebookresearch/faiss', + author='Matthijs Douze, Jeff Johnson, Herve Jegou, Lucas Hosseini', + author_email='matthijs@fb.com', + license='MIT', + keywords='search nearest neighbors', + + install_requires=['numpy'], + packages=['faiss'], + package_data={ + 'faiss': ['*.so'], + }, + +) diff --git a/core/src/index/thirdparty/faiss/python/swigfaiss.swig b/core/src/index/thirdparty/faiss/python/swigfaiss.swig new file mode 100644 index 0000000000..726823bee4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/python/swigfaiss.swig @@ -0,0 +1,983 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- C++ -*- + +// This file describes the C++-scripting language bridge for both Lua +// and Python It contains mainly includes and a few macros. There are +// 3 preprocessor macros of interest: +// SWIGLUA: Lua-specific code +// SWIGPYTHON: Python-specific code +// GPU_WRAPPER: also compile interfaces for GPU. + +%module swigfaiss; + +// fbode SWIG fails on warnings, so make them non fatal +#pragma SWIG nowarn=321 +#pragma SWIG nowarn=403 +#pragma SWIG nowarn=325 +#pragma SWIG nowarn=389 +#pragma SWIG nowarn=341 +#pragma SWIG nowarn=512 + +%include +typedef int64_t size_t; + +#define __restrict + + +/******************************************************************* + * Copied verbatim to wrapper. Contains the C++-visible includes, and + * the language includes for their respective matrix libraries. 
+ *******************************************************************/ + +%{ + + +#include +#include + + +#ifdef SWIGLUA + +#include + +extern "C" { + +#include +#include +#undef THTensor + +} + +#endif + + +#ifdef SWIGPYTHON + +#undef popcount64 + +#define SWIG_FILE_WITH_INIT +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include + +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include + + +%} + +/******************************************************** + * GIL manipulation and exception handling + ********************************************************/ + +#ifdef SWIGPYTHON +// %catches(faiss::FaissException); + + +// Python-specific: release GIL by default for all functions +%exception { + Py_BEGIN_ALLOW_THREADS + try { + $action + } catch(faiss::FaissException & e) { + PyEval_RestoreThread(_save); + + if (PyErr_Occurred()) { + // some previous code already set the error type. + } else { + PyErr_SetString(PyExc_RuntimeError, e.what()); + } + SWIG_fail; + } catch(std::bad_alloc & ba) { + PyEval_RestoreThread(_save); + PyErr_SetString(PyExc_MemoryError, "std::bad_alloc"); + SWIG_fail; + } + Py_END_ALLOW_THREADS +} + +#endif + +#ifdef SWIGLUA + +%exception { + try { + $action + } catch(faiss::FaissException & e) { + SWIG_Lua_pushferrstring(L, "C++ exception: %s", e.what()); \ + goto fail; + } +} + +#endif + + +/******************************************************************* + * Types of vectors we want to manipulate at the scripting language + * level. + *******************************************************************/ + +// simplified interface for vector +namespace std { + + template + class vector { + public: + vector(); + void push_back(T); + void clear(); + T * data(); + size_t size(); + T at (size_t n) const; + void resize (size_t n); + void swap (vector & other); + }; +}; + + + +%template(FloatVector) std::vector; +%template(DoubleVector) std::vector; +%template(ByteVector) std::vector; +%template(CharVector) std::vector; +// NOTE(hoss): Using unsigned long instead of uint64_t because OSX defines +// uint64_t as unsigned long long, which SWIG is not aware of. 
+%template(Uint64Vector) std::vector; +%template(LongVector) std::vector; +%template(IntVector) std::vector; +%template(FloatVectorVector) std::vector >; +%template(ByteVectorVector) std::vector >; +%template(LongVectorVector) std::vector >; +%template(VectorTransformVector) std::vector; +%template(OperatingPointVector) std::vector; +%template(InvertedListsPtrVector) std::vector; +%template(RepeatVector) std::vector; + +#ifdef GPU_WRAPPER +%template(GpuResourcesVector) std::vector; +#endif + +%include + +// produces an error on the Mac +%ignore faiss::hamming; + +/******************************************************************* + * Parse headers + *******************************************************************/ + + +%ignore *::cmp; + +%include +%include + +int get_num_gpus(); +void gpu_profiler_start(); +void gpu_profiler_stop(); +void gpu_sync_all_devices(); + +#ifdef GPU_WRAPPER + +%{ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int get_num_gpus() +{ + return faiss::gpu::getNumDevices(); +} + +void gpu_profiler_start() +{ + return faiss::gpu::profilerStart(); +} + +void gpu_profiler_stop() +{ + return faiss::gpu::profilerStop(); +} + +void gpu_sync_all_devices() +{ + return faiss::gpu::synchronizeAllDevices(); +} + +%} + +// causes weird wrapper bug +%ignore *::getMemoryManager; +%ignore *::getMemoryManagerCurrentDevice; + +%include +%include + +#else + +%{ +int get_num_gpus() +{ + return 0; +} + +void gpu_profiler_start() +{ +} + +void gpu_profiler_stop() +{ +} + +void gpu_sync_all_devices() +{ +} +%} + + +#endif + +// order matters because includes are not recursive + +%include +%include +%include + +%include +%include + +%include + +%ignore faiss::ProductQuantizer::get_centroids(size_t,size_t) const; + +%include + +%include +%include +%include +%include +%include +%include +%include +%ignore InvertedListScanner; +%ignore BinaryInvertedListScanner; +%include +// NOTE(hoss): SWIG (wrongly) believes the overloaded const version shadows the +// non-const one. 
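The `std::vector` instantiations above (`FloatVector`, `LongVector`, and friends) are the types that the `vector_to_array` and `copy_array_to_vector` helpers in `python/faiss.py` convert to and from numpy. A small illustrative sketch, not part of the patch:

```python
# Illustrative sketch: moving data between the wrapped C++ vectors declared above
# and numpy arrays, using the helpers defined in python/faiss.py.
import numpy as np
import faiss

v = faiss.FloatVector()
for x in (1.0, 2.5, -3.0):
    v.push_back(x)

a = faiss.vector_to_array(v)          # copies the C++ vector into a float32 numpy array
assert a.dtype == np.float32 and a.shape == (3,)

b = np.arange(5, dtype='float32')
faiss.copy_array_to_vector(b, v)      # resize() + memcpy back into the same C++ vector
assert v.size() == 5
```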
+%warnfilter(509) extract_index_ivf; +%include +%include +%include +%include +%include +%include +%include +%include + +%include +%include + +%ignore faiss::IndexIVFPQ::alloc_type; +%include +%include +%include + +%include +%include +%include +%include +%include + + + + // %ignore faiss::IndexReplicas::at(int) const; + +%include +%template(ThreadedIndexBase) faiss::ThreadedIndex; +%template(ThreadedIndexBaseBinary) faiss::ThreadedIndex; + +%include +%template(IndexShards) faiss::IndexShardsTemplate; +%template(IndexBinaryShards) faiss::IndexShardsTemplate; + +%include +%template(IndexReplicas) faiss::IndexReplicasTemplate; +%template(IndexBinaryReplicas) faiss::IndexReplicasTemplate; + +%include +%template(IndexIDMap) faiss::IndexIDMapTemplate; +%template(IndexBinaryIDMap) faiss::IndexIDMapTemplate; +%template(IndexIDMap2) faiss::IndexIDMap2Template; +%template(IndexBinaryIDMap2) faiss::IndexIDMap2Template; + +#ifdef GPU_WRAPPER + +// quiet SWIG warnings +%ignore faiss::gpu::GpuIndexIVF::GpuIndexIVF; + +%include +%include +%include +%include +%include +%include +%include +%include +%include +%include +%include + +#ifdef SWIGLUA + +/// in Lua, swigfaiss_gpu is known as swigfaiss +%luacode { +local swigfaiss = swigfaiss_gpu +} + +#endif + + +#endif + + + + +/******************************************************************* + * Lua-specific: support async execution of searches in an index + * Python equivalent is just to use Python threads. + *******************************************************************/ + + +#ifdef SWIGLUA + +%{ + + +namespace faiss { + +struct AsyncIndexSearchC { + typedef Index::idx_t idx_t; + const Index * index; + + idx_t n; + const float *x; + idx_t k; + float *distances; + idx_t *labels; + + bool is_finished; + + pthread_t thread; + + + AsyncIndexSearchC (const Index *index, + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels): + index(index), n(n), x(x), k(k), distances(distances), + labels(labels) + { + is_finished = false; + pthread_create (&thread, NULL, &AsyncIndexSearchC::callback, + this); + } + + static void *callback (void *arg) + { + AsyncIndexSearchC *aidx = (AsyncIndexSearchC *)arg; + aidx->do_search(); + return NULL; + } + + void do_search () + { + index->search (n, x, k, distances, labels); + } + void join () + { + pthread_join (thread, NULL); + } + +}; + +} + +%} + +// re-decrlare only what we need +namespace faiss { + +struct AsyncIndexSearchC { + typedef Index::idx_t idx_t; + bool is_finished; + AsyncIndexSearchC (const Index *index, + idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels); + + + void join (); +}; + +} + + +#endif + + + + +/******************************************************************* + * downcast return of some functions so that the sub-class is used + * instead of the generic upper-class. 
+ *******************************************************************/ + +#ifdef SWIGLUA + +%define DOWNCAST(subclass) + if (dynamic_cast ($1)) { + SWIG_NewPointerObj(L,$1,SWIGTYPE_p_faiss__ ## subclass, $owner); + } else +%enddef + +%define DOWNCAST2(subclass, longname) + if (dynamic_cast ($1)) { + SWIG_NewPointerObj(L,$1,SWIGTYPE_p_faiss__ ## longname, $owner); + } else +%enddef + +%define DOWNCAST_GPU(subclass) + if (dynamic_cast ($1)) { + SWIG_NewPointerObj(L,$1,SWIGTYPE_p_faiss__gpu__ ## subclass, $owner); + } else +%enddef + +#endif + + +#ifdef SWIGPYTHON + +%define DOWNCAST(subclass) + if (dynamic_cast ($1)) { + $result = SWIG_NewPointerObj($1,SWIGTYPE_p_faiss__ ## subclass,$owner); + } else +%enddef + +%define DOWNCAST2(subclass, longname) + if (dynamic_cast ($1)) { + $result = SWIG_NewPointerObj($1,SWIGTYPE_p_faiss__ ## longname,$owner); + } else +%enddef + +%define DOWNCAST_GPU(subclass) + if (dynamic_cast ($1)) { + $result = SWIG_NewPointerObj($1,SWIGTYPE_p_faiss__gpu__ ## subclass,$owner); + } else +%enddef + +#endif + +%newobject read_index; +%newobject read_index_binary; +%newobject read_VectorTransform; +%newobject read_ProductQuantizer; +%newobject clone_index; +%newobject clone_VectorTransform; + +// Subclasses should appear before their parent +%typemap(out) faiss::Index * { + DOWNCAST2 ( IndexIDMap, IndexIDMapTemplateT_faiss__Index_t ) + DOWNCAST2 ( IndexIDMap2, IndexIDMap2TemplateT_faiss__Index_t ) + DOWNCAST2 ( IndexShards, IndexShardsTemplateT_faiss__Index_t ) + DOWNCAST2 ( IndexReplicas, IndexReplicasTemplateT_faiss__Index_t ) + DOWNCAST ( IndexIVFPQR ) + DOWNCAST ( IndexIVFPQ ) + DOWNCAST ( IndexIVFSpectralHash ) + DOWNCAST ( IndexIVFScalarQuantizer ) + DOWNCAST ( IndexIVFFlatDedup ) + DOWNCAST ( IndexIVFFlat ) + DOWNCAST ( IndexIVF ) + DOWNCAST ( IndexFlat ) + DOWNCAST ( IndexPQ ) + DOWNCAST ( IndexScalarQuantizer ) + DOWNCAST ( IndexLSH ) + DOWNCAST ( IndexLattice ) + DOWNCAST ( IndexPreTransform ) + DOWNCAST ( MultiIndexQuantizer ) + DOWNCAST ( IndexHNSWFlat ) + DOWNCAST ( IndexHNSWPQ ) + DOWNCAST ( IndexHNSWSQ ) + DOWNCAST ( IndexHNSW2Level ) + DOWNCAST ( Index2Layer ) +#ifdef GPU_WRAPPER + DOWNCAST_GPU ( GpuIndexIVFPQ ) + DOWNCAST_GPU ( GpuIndexIVFFlat ) + DOWNCAST_GPU ( GpuIndexIVFScalarQuantizer ) + DOWNCAST_GPU ( GpuIndexFlat ) +#endif + // default for non-recognized classes + DOWNCAST ( Index ) + if ($1 == NULL) + { +#ifdef SWIGPYTHON + $result = SWIG_Py_Void(); +#endif + // Lua does not need a push for nil + } else { + assert(false); + } +#ifdef SWIGLUA + SWIG_arg++; +#endif +} + +%typemap(out) faiss::IndexBinary * { + DOWNCAST2 ( IndexBinaryReplicas, IndexReplicasTemplateT_faiss__IndexBinary_t ) + DOWNCAST2 ( IndexBinaryIDMap, IndexIDMapTemplateT_faiss__IndexBinary_t ) + DOWNCAST2 ( IndexBinaryIDMap2, IndexIDMap2TemplateT_faiss__IndexBinary_t ) + DOWNCAST ( IndexBinaryIVF ) + DOWNCAST ( IndexBinaryFlat ) + DOWNCAST ( IndexBinaryFromFloat ) + DOWNCAST ( IndexBinaryHNSW ) +#ifdef GPU_WRAPPER + DOWNCAST_GPU ( GpuIndexBinaryFlat ) +#endif + // default for non-recognized classes + DOWNCAST ( IndexBinary ) + if ($1 == NULL) + { +#ifdef SWIGPYTHON + $result = SWIG_Py_Void(); +#endif + // Lua does not need a push for nil + } else { + assert(false); + } +#ifdef SWIGLUA + SWIG_arg++; +#endif +} + +%typemap(out) faiss::VectorTransform * { + DOWNCAST (RemapDimensionsTransform) + DOWNCAST (OPQMatrix) + DOWNCAST (PCAMatrix) + DOWNCAST (RandomRotationMatrix) + DOWNCAST (LinearTransform) + DOWNCAST (NormalizationTransform) + DOWNCAST (CenteringTransform) + DOWNCAST 
(VectorTransform) + { + assert(false); + } +#ifdef SWIGLUA + SWIG_arg++; +#endif +} + +%typemap(out) faiss::InvertedLists * { + DOWNCAST (ArrayInvertedLists) + DOWNCAST (OnDiskInvertedLists) + DOWNCAST (VStackInvertedLists) + DOWNCAST (HStackInvertedLists) + DOWNCAST (MaskedInvertedLists) + DOWNCAST (InvertedLists) + { + assert(false); + } +#ifdef SWIGLUA + SWIG_arg++; +#endif +} + +// just to downcast pointers that come from elsewhere (eg. direct +// access to object fields) +%inline %{ +faiss::Index * downcast_index (faiss::Index *index) +{ + return index; +} +faiss::VectorTransform * downcast_VectorTransform (faiss::VectorTransform *vt) +{ + return vt; +} +faiss::IndexBinary * downcast_IndexBinary (faiss::IndexBinary *index) +{ + return index; +} +faiss::InvertedLists * downcast_InvertedLists (faiss::InvertedLists *il) +{ + return il; +} +%} + +%include +%include +%include + +%newobject index_factory; +%newobject index_binary_factory; + +%include +%include +%include + + +#ifdef GPU_WRAPPER + +%include + +%newobject index_gpu_to_cpu; +%newobject index_cpu_to_gpu; +%newobject index_cpu_to_gpu_multiple; + +%include + +#endif + +// Python-specific: do not release GIL any more, as functions below +// use the Python/C API +#ifdef SWIGPYTHON +%exception; +#endif + + + + + +/******************************************************************* + * Python specific: numpy array <-> C++ pointer interface + *******************************************************************/ + +#ifdef SWIGPYTHON + +%{ +PyObject *swig_ptr (PyObject *a) +{ + if(!PyArray_Check(a)) { + PyErr_SetString(PyExc_ValueError, "input not a numpy array"); + return NULL; + } + PyArrayObject *ao = (PyArrayObject *)a; + + if(!PyArray_ISCONTIGUOUS(ao)) { + PyErr_SetString(PyExc_ValueError, "array is not C-contiguous"); + return NULL; + } + void * data = PyArray_DATA(ao); + if(PyArray_TYPE(ao) == NPY_FLOAT32) { + return SWIG_NewPointerObj(data, SWIGTYPE_p_float, 0); + } + if(PyArray_TYPE(ao) == NPY_FLOAT64) { + return SWIG_NewPointerObj(data, SWIGTYPE_p_double, 0); + } + if(PyArray_TYPE(ao) == NPY_INT32) { + return SWIG_NewPointerObj(data, SWIGTYPE_p_int, 0); + } + if(PyArray_TYPE(ao) == NPY_UINT8) { + return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_char, 0); + } + if(PyArray_TYPE(ao) == NPY_INT8) { + return SWIG_NewPointerObj(data, SWIGTYPE_p_char, 0); + } + if(PyArray_TYPE(ao) == NPY_UINT64) { +#ifdef SWIGWORDSIZE64 + return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0); +#else + return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long_long, 0); +#endif + } + if(PyArray_TYPE(ao) == NPY_INT64) { +#ifdef SWIGWORDSIZE64 + return SWIG_NewPointerObj(data, SWIGTYPE_p_long, 0); +#else + return SWIG_NewPointerObj(data, SWIGTYPE_p_long_long, 0); +#endif + } + PyErr_SetString(PyExc_ValueError, "did not recognize array type"); + return NULL; +} + + +struct PythonInterruptCallback: faiss::InterruptCallback { + + bool want_interrupt () override { + int err; + { + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + err = PyErr_CheckSignals(); + PyGILState_Release(gstate); + } + return err == -1; + } + +}; + + +%} + + +%init %{ + /* needed, else crash at runtime */ + import_array(); + + faiss::InterruptCallback::instance.reset(new PythonInterruptCallback()); + +%} + +// return a pointer usable as input for functions that expect pointers +PyObject *swig_ptr (PyObject *a); + +%define REV_SWIG_PTR(ctype, numpytype) + +%{ +PyObject * rev_swig_ptr(ctype *src, npy_intp size) { + return PyArray_SimpleNewFromData(1, &size, numpytype, 
src); +} +%} + +PyObject * rev_swig_ptr(ctype *src, size_t size); + +%enddef + +REV_SWIG_PTR(float, NPY_FLOAT32); +REV_SWIG_PTR(int, NPY_INT32); +REV_SWIG_PTR(unsigned char, NPY_UINT8); +REV_SWIG_PTR(int64_t, NPY_INT64); +REV_SWIG_PTR(uint64_t, NPY_UINT64); + +#endif + + + +/******************************************************************* + * Lua specific: Torch tensor <-> C++ pointer interface + *******************************************************************/ + +#ifdef SWIGLUA + + +// provide a XXX_ptr function to convert Lua XXXTensor -> C++ XXX* + +%define TYPE_CONVERSION(ctype, tensortype) + +// typemap for the *_ptr_from_cdata function +%typemap(in) ctype** { + if(lua_type(L, $input) != 10) { + fprintf(stderr, "not cdata input\n"); + SWIG_fail; + } + $1 = (ctype**)lua_topointer(L, $input); +} + + +// SWIG and C declaration for the *_ptr_from_cdata function +%{ +ctype * ctype ## _ptr_from_cdata(ctype **x, long ofs) { + return *x + ofs; +} +%} +ctype * ctype ## _ptr_from_cdata(ctype **x, long ofs); + +// the *_ptr function +%luacode { + +function swigfaiss. ctype ## _ptr(tensor) + assert(tensor:type() == "torch." .. # tensortype, "need a " .. # tensortype) + assert(tensor:isContiguous(), "requires contiguous tensor") + return swigfaiss. ctype ## _ptr_from_cdata( + tensor:storage():data(), + tensor:storageOffset() - 1) +end + +} + +%enddef + +TYPE_CONVERSION (int, IntTensor) +TYPE_CONVERSION (float, FloatTensor) +TYPE_CONVERSION (long, LongTensor) +TYPE_CONVERSION (uint64_t, LongTensor) +TYPE_CONVERSION (uint8_t, ByteTensor) + +#endif + +/******************************************************************* + * How should the template objects apprear in the scripting language? + *******************************************************************/ + +// answer: the same as the C++ typedefs, but we still have to redefine them + +%template() faiss::CMin; +%template() faiss::CMin; +%template() faiss::CMax; +%template() faiss::CMax; + +%template(float_minheap_array_t) faiss::HeapArray >; +%template(int_minheap_array_t) faiss::HeapArray >; + +%template(float_maxheap_array_t) faiss::HeapArray >; +%template(int_maxheap_array_t) faiss::HeapArray >; + + +/******************************************************************* + * Expose a few basic functions + *******************************************************************/ + + +void omp_set_num_threads (int num_threads); +int omp_get_max_threads (); +void *memcpy(void *dest, const void *src, size_t n); + + +/******************************************************************* + * For Faiss/Pytorch interop via pointers encoded as longs + *******************************************************************/ + +%inline %{ +float * cast_integer_to_float_ptr (long x) { + return (float*)x; +} + +long * cast_integer_to_long_ptr (long x) { + return (long*)x; +} + +int * cast_integer_to_int_ptr (long x) { + return (int*)x; +} + +%} + + + +/******************************************************************* + * Range search interface + *******************************************************************/ + +%ignore faiss::BufferList::Buffer; +%ignore faiss::RangeSearchPartialResult::QueryResult; +%ignore faiss::IDSelectorBatch::set; +%ignore faiss::IDSelectorBatch::bloom; + +%ignore faiss::InterruptCallback::instance; +%ignore faiss::InterruptCallback::lock; +%include + +%{ +// may be useful for lua code launched in background from shell + +#include +void ignore_SIGTTIN() { + signal(SIGTTIN, SIG_IGN); +} +%} + +void ignore_SIGTTIN(); + + +%inline %{ + 
+// numpy misses a hash table implementation, hence this class. It +// represents not found values as -1 like in the Index implementation + +struct MapLong2Long { + std::unordered_map map; + + void add(size_t n, const int64_t *keys, const int64_t *vals) { + map.reserve(map.size() + n); + for (size_t i = 0; i < n; i++) { + map[keys[i]] = vals[i]; + } + } + + long search(int64_t key) { + if (map.count(key) == 0) { + return -1; + } else { + return map[key]; + } + } + + void search_multiple(size_t n, int64_t *keys, int64_t * vals) { + for (size_t i = 0; i < n; i++) { + vals[i] = search(keys[i]); + } + } +}; + +%} + +%inline %{ + void wait() { + // in gdb, use return to get out of this function + for(int i = 0; i == 0; i += 0); + } + %} + +// End of file... diff --git a/core/src/index/thirdparty/faiss/tests/Makefile b/core/src/index/thirdparty/faiss/tests/Makefile new file mode 100644 index 0000000000..684100de70 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/Makefile @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include ../makefile.inc + +TESTS_SRC = $(wildcard *.cpp) +TESTS_OBJ = $(TESTS_SRC:.cpp=.o) + + +all: run + +run: tests + ./tests + +tests: $(TESTS_OBJ) ../libfaiss.a gtest/make/gtest_main.a + $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS) + +%.o: %.cpp gtest + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c -o $@ $< -Igtest/include -I.. + +gtest/make/gtest_main.a: gtest + $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest_main.a + +gtest: + curl -L https://github.com/google/googletest/archive/release-1.8.0.tar.gz | tar xz && \ + mv googletest-release-1.8.0/googletest gtest && \ + rm -rf googletest-release-1.8.0 + +clean: + rm -f tests + rm -f $(TESTS_OBJ) + rm -rf gtest + + +.PHONY: all clean run diff --git a/core/src/index/thirdparty/faiss/tests/common.py b/core/src/index/thirdparty/faiss/tests/common.py new file mode 100644 index 0000000000..b6bc37ef17 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/common.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +# a few common functions for the tests + +import numpy as np +import faiss + +# reduce number of threads to avoid excessive nb of threads in opt +# mode (recuces runtime from 100s to 4s!) 
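`MapLong2Long` above is a small int64-to-int64 hash map exposed because numpy lacks one; misses are reported as -1, matching the Index convention. Its numpy wrappers (`add`, `search_multiple`) were installed in `python/faiss.py` earlier in this patch. A usage sketch, illustrative only:

```python
# Illustrative sketch of the MapLong2Long helper declared in the %inline block above.
import numpy as np
import faiss

m = faiss.MapLong2Long()
m.add(np.array([10, 20, 30], dtype='int64'),
      np.array([1, 2, 3], dtype='int64'))        # wrapper from faiss.py: add(keys, vals)

assert m.search(20) == 2                          # single lookup goes straight to the C++ method
assert m.search_multiple(np.array([20, 40], dtype='int64')).tolist() == [2, -1]  # 40 is missing
```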
+faiss.omp_set_num_threads(4) + + +def random_unitary(n, d, seed): + x = faiss.randn(n * d, seed).reshape(n, d) + faiss.normalize_L2(x) + return x + + +class Randu10k: + + def __init__(self): + self.nb = 10000 + self.nq = 1000 + self.nt = 10000 + self.d = 128 + + self.xb = random_unitary(self.nb, self.d, 1) + self.xt = random_unitary(self.nt, self.d, 2) + self.xq = random_unitary(self.nq, self.d, 3) + + dotprods = np.dot(self.xq, self.xb.T) + self.gt = dotprods.argmax(1) + self.k = 100 + + def launch(self, name, index): + if not index.is_trained: + index.train(self.xt) + index.add(self.xb) + return index.search(self.xq, self.k) + + def evalres(self, DI): + D, I = DI + e = {} + for rank in 1, 10, 100: + e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() / + float(self.nq)) + print("1-recalls: %s" % e) + return e + + +class Randu10kUnbalanced(Randu10k): + + def __init__(self): + Randu10k.__init__(self) + + weights = 0.95 ** np.arange(self.d) + rs = np.random.RandomState(123) + weights = weights[rs.permutation(self.d)] + self.xb *= weights + self.xb /= np.linalg.norm(self.xb, axis=1)[:, np.newaxis] + self.xq *= weights + self.xq /= np.linalg.norm(self.xq, axis=1)[:, np.newaxis] + self.xt *= weights + self.xt /= np.linalg.norm(self.xt, axis=1)[:, np.newaxis] + + dotprods = np.dot(self.xq, self.xb.T) + self.gt = dotprods.argmax(1) + self.k = 100 + + +def get_dataset(d, nb, nt, nq): + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + xt = rs.rand(nt, d).astype('float32') + xq = rs.rand(nq, d).astype('float32') + + return (xt, xb, xq) + + +def get_dataset_2(d, nt, nb, nq): + """A dataset that is not completely random but still challenging to + index + """ + d1 = 10 # intrinsic dimension (more or less) + n = nb + nt + nq + rs = np.random.RandomState(1338) + x = rs.normal(size=(n, d1)) + x = np.dot(x, rs.rand(d1, d)) + # now we have a d1-dim ellipsoid in d-dimensional space + # higher factor (>4) -> higher frequency -> less linear + x = x * (rs.rand(d) * 4 + 0.1) + x = np.sin(x) + x = x.astype('float32') + return x[:nt], x[nt:nt + nb], x[nt + nb:] diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_factory.py b/core/src/index/thirdparty/faiss/tests/test_binary_factory.py new file mode 100644 index 0000000000..dfe618cc38 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_binary_factory.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +import unittest +import faiss + + +class TestBinaryFactory(unittest.TestCase): + + def test_factory_IVF(self): + + index = faiss.index_binary_factory(16, "BIVF10") + assert index.invlists is not None + assert index.nlist == 10 + assert index.code_size == 2 + + def test_factory_Flat(self): + + index = faiss.index_binary_factory(16, "BFlat") + assert index.code_size == 2 + + def test_factory_HNSW(self): + + index = faiss.index_binary_factory(256, "BHNSW32") + assert index.code_size == 32 + + def test_factory_IVF_HNSW(self): + + index = faiss.index_binary_factory(256, "BIVF1024_BHNSW32") + assert index.code_size == 32 + assert index.nlist == 1024 diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_flat.cpp b/core/src/index/thirdparty/faiss/tests/test_binary_flat.cpp new file mode 100644 index 0000000000..eb20cee87b --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_binary_flat.cpp @@ -0,0 +1,64 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include +#include + +TEST(BinaryFlat, accuracy) { + // dimension of the vectors to index + int d = 64; + + // size of the database we plan to index + size_t nb = 1000; + + // make the index object and train it + faiss::IndexBinaryFlat index(d); + + srand(35); + + std::vector database(nb * (d / 8)); + for (size_t i = 0; i < nb * (d / 8); i++) { + database[i] = rand() % 0x100; + } + + { // populating the database + index.add(nb, database.data()); + } + + size_t nq = 200; + + { // searching the database + + std::vector queries(nq * (d / 8)); + for (size_t i = 0; i < nq * (d / 8); i++) { + queries[i] = rand() % 0x100; + } + + int k = 5; + std::vector nns(k * nq); + std::vector dis(k * nq); + + index.search(nq, queries.data(), k, dis.data(), nns.data()); + + for (size_t i = 0; i < nq; ++i) { + faiss::HammingComputer8 hc(queries.data() + i * (d / 8), d / 8); + hamdis_t dist_min = hc.hamming(database.data()); + for (size_t j = 1; j < nb; ++j) { + hamdis_t dist = hc.hamming(database.data() + j * (d / 8)); + if (dist < dist_min) { + dist_min = dist; + } + } + EXPECT_EQ(dist_min, dis[k * i]); + } + } +} diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_io.py b/core/src/index/thirdparty/faiss/tests/test_binary_io.py new file mode 100644 index 0000000000..8cdc91df7a --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_binary_io.py @@ -0,0 +1,217 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +"""Binary indexes (de)serialization""" + +import numpy as np +import unittest +import faiss +import os +import tempfile + +def make_binary_dataset(d, nb, nt, nq): + assert d % 8 == 0 + x = np.random.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') + return x[:nt], x[nt:-nq], x[-nq:] + + +class TestBinaryFlat(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 0 + nb = 1500 + nq = 500 + + (_, self.xb, self.xq) = make_binary_dataset(d, nb, nt, nq) + + def test_flat(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + _, tmpnam = tempfile.mkstemp() + try: + faiss.write_index_binary(index, tmpnam) + + index2 = faiss.read_index_binary(tmpnam) + + D2, I2 = index2.search(self.xq, 3) + + assert (I2 == I).all() + assert (D2 == D).all() + + finally: + os.remove(tmpnam) + + +class TestBinaryIVF(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 200 + nb = 1500 + nq = 500 + + (self.xt, self.xb, self.xq) = make_binary_dataset(d, nb, nt, nq) + + def test_ivf_flat(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + _, tmpnam = tempfile.mkstemp() + + try: + faiss.write_index_binary(index, tmpnam) + + index2 = faiss.read_index_binary(tmpnam) + + D2, I2 = index2.search(self.xq, 3) + + assert (I2 == I).all() + assert (D2 == D).all() + + finally: + os.remove(tmpnam) + + +class TestObjectOwnership(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 200 + nb = 1500 + nq = 500 + + (self.xt, self.xb, self.xq) = make_binary_dataset(d, nb, nt, nq) + + def test_read_index_ownership(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + + _, tmpnam = tempfile.mkstemp() + try: + faiss.write_index_binary(index, tmpnam) + + index2 = faiss.read_index_binary(tmpnam) + + assert index2.thisown + finally: + os.remove(tmpnam) + + +class TestBinaryFromFloat(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 200 + nb = 1500 + nq = 500 + + (self.xt, self.xb, self.xq) = make_binary_dataset(d, nb, nt, nq) + + def test_binary_from_float(self): + d = self.xq.shape[1] * 8 + + float_index = faiss.IndexHNSWFlat(d, 16) + index = faiss.IndexBinaryFromFloat(float_index) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + _, tmpnam = tempfile.mkstemp() + + try: + faiss.write_index_binary(index, tmpnam) + + index2 = faiss.read_index_binary(tmpnam) + + D2, I2 = index2.search(self.xq, 3) + + assert (I2 == I).all() + assert (D2 == D).all() + + finally: + os.remove(tmpnam) + + +class TestBinaryHNSW(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 200 + nb = 1500 + nq = 500 + + (self.xt, self.xb, self.xq) = make_binary_dataset(d, nb, nt, nq) + + def test_hnsw(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryHNSW(d) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + _, tmpnam = tempfile.mkstemp() + + try: + faiss.write_index_binary(index, tmpnam) + + index2 = 
faiss.read_index_binary(tmpnam) + + D2, I2 = index2.search(self.xq, 3) + + assert (I2 == I).all() + assert (D2 == D).all() + + finally: + os.remove(tmpnam) + + def test_ivf_hnsw(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryHNSW(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + _, tmpnam = tempfile.mkstemp() + + try: + faiss.write_index_binary(index, tmpnam) + + index2 = faiss.read_index_binary(tmpnam) + + D2, I2 = index2.search(self.xq, 3) + + assert (I2 == I).all() + assert (D2 == D).all() + + finally: + os.remove(tmpnam) + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_build_blocks.py b/core/src/index/thirdparty/faiss/tests/test_build_blocks.py new file mode 100644 index 0000000000..2c31bf7aeb --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_build_blocks.py @@ -0,0 +1,489 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +import numpy as np + +import faiss +import unittest + + +class TestClustering(unittest.TestCase): + + def test_clustering(self): + d = 64 + n = 1000 + rs = np.random.RandomState(123) + x = rs.uniform(size=(n, d)).astype('float32') + + x *= 10 + + km = faiss.Kmeans(d, 32, niter=10) + err32 = km.train(x) + + # check that objective is decreasing + prev = 1e50 + for o in km.obj: + self.assertGreater(prev, o) + prev = o + + km = faiss.Kmeans(d, 64, niter=10) + err64 = km.train(x) + + # check that 64 centroids give a lower quantization error than 32 + self.assertGreater(err32, err64) + + km = faiss.Kmeans(d, 32, niter=10, int_centroids=True) + err_int = km.train(x) + + # check that integer centoids are not as good as float ones + self.assertGreater(err_int, err32) + self.assertTrue(np.all(km.centroids == np.floor(km.centroids))) + + + def test_nasty_clustering(self): + d = 2 + rs = np.random.RandomState(123) + x = np.zeros((100, d), dtype='float32') + for i in range(5): + x[i * 20:i * 20 + 20] = rs.uniform(size=d) + + # we have 5 distinct points but ask for 10 centroids... 
+ km = faiss.Kmeans(d, 10, niter=10, verbose=True) + km.train(x) + + def test_redo(self): + d = 64 + n = 1000 + + rs = np.random.RandomState(123) + x = rs.uniform(size=(n, d)).astype('float32') + + clus = faiss.Clustering(d, 20) + clus.nredo = 1 + clus.train(x, faiss.IndexFlatL2(d)) + obj1 = faiss.vector_to_array(clus.obj) + + clus = faiss.Clustering(d, 20) + clus.nredo = 10 + clus.train(x, faiss.IndexFlatL2(d)) + obj10 = faiss.vector_to_array(clus.obj) + + self.assertGreater(obj1[-1], obj10[-1]) + + def test_1ptpercluster(self): + # https://github.com/facebookresearch/faiss/issues/842 + X = np.random.randint(0, 1, (5, 10)).astype('float32') + k = 5 + niter = 10 + verbose = True + kmeans = faiss.Kmeans(X.shape[1], k, niter=niter, verbose=verbose) + kmeans.train(X) + l2_distances, I = kmeans.index.search(X, 1) + + +class TestPCA(unittest.TestCase): + + def test_pca(self): + d = 64 + n = 1000 + np.random.seed(123) + x = np.random.random(size=(n, d)).astype('float32') + + pca = faiss.PCAMatrix(d, 10) + pca.train(x) + y = pca.apply_py(x) + + # check that energy per component is decreasing + column_norm2 = (y**2).sum(0) + + prev = 1e50 + for o in column_norm2: + self.assertGreater(prev, o) + prev = o + + +class TestProductQuantizer(unittest.TestCase): + + def test_pq(self): + d = 64 + n = 2000 + cs = 4 + np.random.seed(123) + x = np.random.random(size=(n, d)).astype('float32') + pq = faiss.ProductQuantizer(d, cs, 8) + pq.train(x) + codes = pq.compute_codes(x) + x2 = pq.decode(codes) + diff = ((x - x2)**2).sum() + + # print "diff=", diff + # diff= 4418.0562 + self.assertGreater(5000, diff) + + pq10 = faiss.ProductQuantizer(d, cs, 10) + assert pq10.code_size == 5 + pq10.verbose = True + pq10.cp.verbose = True + pq10.train(x) + codes = pq10.compute_codes(x) + + x10 = pq10.decode(codes) + diff10 = ((x - x10)**2).sum() + self.assertGreater(diff, diff10) + + def do_test_codec(self, nbit): + pq = faiss.ProductQuantizer(16, 2, nbit) + + # simulate training + rs = np.random.RandomState(123) + centroids = rs.rand(2, 1 << nbit, 8).astype('float32') + faiss.copy_array_to_vector(centroids.ravel(), pq.centroids) + + idx = rs.randint(1 << nbit, size=(100, 2)) + # can be encoded exactly + x = np.hstack(( + centroids[0, idx[:, 0]], + centroids[1, idx[:, 1]] + )) + + # encode / decode + codes = pq.compute_codes(x) + xr = pq.decode(codes) + assert np.all(xr == x) + + # encode w/ external index + assign_index = faiss.IndexFlatL2(8) + pq.assign_index = assign_index + codes2 = np.empty((100, pq.code_size), dtype='uint8') + pq.compute_codes_with_assign_index( + faiss.swig_ptr(x), faiss.swig_ptr(codes2), 100) + assert np.all(codes == codes2) + + def test_codec(self): + for i in range(16): + print("Testing nbits=%d" % (i + 1)) + self.do_test_codec(i + 1) + + +class TestRevSwigPtr(unittest.TestCase): + + def test_rev_swig_ptr(self): + + index = faiss.IndexFlatL2(4) + xb0 = np.vstack([ + i * 10 + np.array([1, 2, 3, 4], dtype='float32') + for i in range(5)]) + index.add(xb0) + xb = faiss.rev_swig_ptr(index.xb.data(), 4 * 5).reshape(5, 4) + self.assertEqual(np.abs(xb0 - xb).sum(), 0) + + +class TestException(unittest.TestCase): + + def test_exception(self): + + index = faiss.IndexFlatL2(10) + + a = np.zeros((5, 10), dtype='float32') + b = np.zeros(5, dtype='int64') + + try: + # an unsupported operation for IndexFlat + index.add_with_ids(a, b) + except RuntimeError as e: + assert 'add_with_ids not implemented' in str(e) + else: + assert False, 'exception did not fire???' 
+ + def test_exception_2(self): + + try: + faiss.index_factory(12, 'IVF256,Flat,PQ8') + except RuntimeError as e: + assert 'could not parse' in str(e) + else: + assert False, 'exception did not fire???' + +class TestMapLong2Long(unittest.TestCase): + + def test_maplong2long(self): + keys = np.array([13, 45, 67]) + vals = np.array([3, 8, 2]) + + m = faiss.MapLong2Long() + m.add(keys, vals) + + assert np.all(m.search_multiple(keys) == vals) + + assert m.search(12343) == -1 + + +class TestOrthognalReconstruct(unittest.TestCase): + + def test_recons_orthonormal(self): + lt = faiss.LinearTransform(20, 10, True) + rs = np.random.RandomState(10) + A, _ = np.linalg.qr(rs.randn(20, 20)) + A = A[:10].astype('float32') + faiss.copy_array_to_vector(A.ravel(), lt.A) + faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b) + + lt.set_is_orthonormal() + lt.is_trained = True + assert lt.is_orthonormal + + x = rs.rand(30, 20).astype('float32') + xt = lt.apply_py(x) + xtt = lt.reverse_transform(xt) + xttt = lt.apply_py(xtt) + + err = ((xt - xttt)**2).sum() + + self.assertGreater(1e-5, err) + + def test_recons_orthogona_impossible(self): + lt = faiss.LinearTransform(20, 10, True) + rs = np.random.RandomState(10) + A = rs.randn(10 * 20).astype('float32') + faiss.copy_array_to_vector(A.ravel(), lt.A) + faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b) + lt.is_trained = True + + lt.set_is_orthonormal() + assert not lt.is_orthonormal + + x = rs.rand(30, 20).astype('float32') + xt = lt.apply_py(x) + try: + lt.reverse_transform(xt) + except Exception: + pass + else: + self.assertFalse('should do an exception') + + +class TestMAdd(unittest.TestCase): + + def test_1(self): + # try with dimensions that are multiples of 16 or not + rs = np.random.RandomState(123) + swig_ptr = faiss.swig_ptr + for dim in 16, 32, 20, 25: + for repeat in 1, 2, 3, 4, 5: + a = rs.rand(dim).astype('float32') + b = rs.rand(dim).astype('float32') + c = np.zeros(dim, dtype='float32') + bf = rs.uniform(5.0) - 2.5 + idx = faiss.fvec_madd_and_argmin( + dim, swig_ptr(a), bf, swig_ptr(b), + swig_ptr(c)) + ref_c = a + b * bf + assert np.abs(c - ref_c).max() < 1e-5 + assert idx == ref_c.argmin() + + +class TestNyFuncs(unittest.TestCase): + + def test_l2(self): + rs = np.random.RandomState(123) + swig_ptr = faiss.swig_ptr + for d in 1, 2, 4, 8, 12, 16: + x = rs.rand(d).astype('float32') + for ny in 128, 129, 130: + print("d=%d ny=%d" % (d, ny)) + y = rs.rand(ny, d).astype('float32') + ref = ((x - y) ** 2).sum(1) + new = np.zeros(ny, dtype='float32') + faiss.fvec_L2sqr_ny(swig_ptr(new), swig_ptr(x), + swig_ptr(y), d, ny) + assert np.abs(ref - new).max() < 1e-4 + + def test_IP(self): + # this one is not optimized with SIMD but just in case + rs = np.random.RandomState(123) + swig_ptr = faiss.swig_ptr + for d in 1, 2, 4, 8, 12, 16: + x = rs.rand(d).astype('float32') + for ny in 128, 129, 130: + print("d=%d ny=%d" % (d, ny)) + y = rs.rand(ny, d).astype('float32') + ref = (x * y).sum(1) + new = np.zeros(ny, dtype='float32') + faiss.fvec_inner_products_ny( + swig_ptr(new), swig_ptr(x), swig_ptr(y), d, ny) + assert np.abs(ref - new).max() < 1e-4 + + +class TestMatrixStats(unittest.TestCase): + + def test_0s(self): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + m[5:10] = 0 + comments = faiss.MatrixStats(m).comments + print comments + assert 'has 5 copies' in comments + assert '5 null vectors' in comments + + def test_copies(self): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + 
m[::2] = m[1::2] + comments = faiss.MatrixStats(m).comments + print comments + assert '20 vectors are distinct' in comments + + def test_dead_dims(self): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + m[:, 5:10] = 0 + comments = faiss.MatrixStats(m).comments + print comments + assert '5 dimensions are constant' in comments + + def test_rogue_means(self): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + m[:, 5:10] += 12345 + comments = faiss.MatrixStats(m).comments + print comments + assert '5 dimensions are too large wrt. their variance' in comments + + def test_normalized(self): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + faiss.normalize_L2(m) + comments = faiss.MatrixStats(m).comments + print comments + assert 'vectors are normalized' in comments + + +class TestScalarQuantizer(unittest.TestCase): + + def test_8bit_equiv(self): + rs = np.random.RandomState(123) + for it in range(20): + for d in 13, 16, 24: + x = np.floor(rs.rand(5, d) * 256).astype('float32') + x[0] = 0 + x[1] = 255 + + # make sure to test extreme cases + x[2, 0] = 0 + x[3, 0] = 255 + x[2, 1] = 255 + x[3, 1] = 0 + + ref_index = faiss.IndexScalarQuantizer( + d, faiss.ScalarQuantizer.QT_8bit) + ref_index.train(x[:2]) + ref_index.add(x[2:3]) + + index = faiss.IndexScalarQuantizer( + d, faiss.ScalarQuantizer.QT_8bit_direct) + assert index.is_trained + index.add(x[2:3]) + + assert np.all( + faiss.vector_to_array(ref_index.codes) == + faiss.vector_to_array(index.codes)) + + # Note that distances are not the same because ref_index + # reconstructs x as x + 0.5 + D, I = index.search(x[3:], 1) + + # assert D[0, 0] == Dref[0, 0] + print(D[0, 0], ((x[3] - x[2]) ** 2).sum()) + assert D[0, 0] == ((x[3] - x[2]) ** 2).sum() + + def test_6bit_equiv(self): + rs = np.random.RandomState(123) + for d in 3, 6, 8, 16, 36: + trainset = np.zeros((2, d), dtype='float32') + trainset[0, :] = 0 + trainset[0, :] = 63 + + index = faiss.IndexScalarQuantizer( + d, faiss.ScalarQuantizer.QT_6bit) + index.train(trainset) + + print('cs=', index.code_size) + + x = rs.randint(64, size=(100, d)).astype('float32') + + # verify encoder / decoder + index.add(x) + x2 = index.reconstruct_n(0, x.shape[0]) + assert np.all(x == x2 - 0.5) + + # verify AVX decoder (used only for search) + y = 63 * rs.rand(20, d).astype('float32') + + D, I = index.search(y, 10) + for i in range(20): + for j in range(10): + dis = ((y[i] - x2[I[i, j]]) ** 2).sum() + print(dis, D[i, j]) + assert abs(D[i, j] - dis) / dis < 1e-5 + +class TestRandom(unittest.TestCase): + + def test_rand(self): + x = faiss.rand(2000) + assert np.all(x >= 0) and np.all(x < 1) + h, _ = np.histogram(x, np.arange(0, 1, 0.1)) + assert h.min() > 160 and h.max() < 240 + + def test_randint(self): + x = faiss.randint(20000, vmax=100) + assert np.all(x >= 0) and np.all(x < 100) + c = np.bincount(x, minlength=100) + print(c) + assert c.max() - c.min() < 50 * 2 + + +class TestPairwiseDis(unittest.TestCase): + + def test_L2(self): + swig_ptr = faiss.swig_ptr + x = faiss.rand((100, 10), seed=1) + y = faiss.rand((200, 10), seed=2) + ix = faiss.randint(50, vmax=100) + iy = faiss.randint(50, vmax=200) + dis = np.empty(50, dtype='float32') + faiss.pairwise_indexed_L2sqr( + 10, 50, + swig_ptr(x), swig_ptr(ix), + swig_ptr(y), swig_ptr(iy), + swig_ptr(dis)) + + for i in range(50): + assert np.allclose( + dis[i], ((x[ix[i]] - y[iy[i]]) ** 2).sum()) + + def test_IP(self): + swig_ptr = faiss.swig_ptr + x = faiss.rand((100, 10), seed=1) + y = 
faiss.rand((200, 10), seed=2) + ix = faiss.randint(50, vmax=100) + iy = faiss.randint(50, vmax=200) + dis = np.empty(50, dtype='float32') + faiss.pairwise_indexed_inner_product( + 10, 50, + swig_ptr(x), swig_ptr(ix), + swig_ptr(y), swig_ptr(iy), + swig_ptr(dis)) + + for i in range(50): + assert np.allclose( + dis[i], np.dot(x[ix[i]], y[iy[i]])) + + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_dealloc_invlists.cpp b/core/src/index/thirdparty/faiss/tests/test_dealloc_invlists.cpp new file mode 100644 index 0000000000..d77cd242ac --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_dealloc_invlists.cpp @@ -0,0 +1,183 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +using namespace faiss; + +namespace { + +typedef Index::idx_t idx_t; + + +// dimension of the vectors to index +int d = 32; + +// nb of training vectors +size_t nt = 5000; + +// size of the database points per window step +size_t nb = 1000; + +// nb of queries +size_t nq = 200; + + +std::vector make_data(size_t n) +{ + std::vector database (n * d); + for (size_t i = 0; i < n * d; i++) { + database[i] = drand48(); + } + return database; +} + +std::unique_ptr make_trained_index(const char *index_type) +{ + auto index = std::unique_ptr(index_factory(d, index_type)); + auto xt = make_data(nt * d); + index->train(nt, xt.data()); + ParameterSpace().set_index_parameter (index.get(), "nprobe", 4); + return index; +} + +std::vector search_index(Index *index, const float *xq) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + index->search (nq, xq, k, D.data(), I.data()); + return I; +} + + + + + +/************************************************************* + * Test functions for a given index type + *************************************************************/ + +struct EncapsulateInvertedLists: InvertedLists { + + const InvertedLists *il; + + EncapsulateInvertedLists(const InvertedLists *il): + InvertedLists(il->nlist, il->code_size), + il(il) + {} + + static void * memdup (const void *m, size_t size) { + if (size == 0) return nullptr; + return memcpy (malloc(size), m, size); + } + + size_t list_size(size_t list_no) const override { + return il->list_size (list_no); + } + + const uint8_t * get_codes (size_t list_no) const override { + return (uint8_t*)memdup (il->get_codes(list_no), + list_size(list_no) * code_size); + } + + const idx_t * get_ids (size_t list_no) const override { + return (idx_t*)memdup (il->get_ids(list_no), + list_size(list_no) * sizeof(idx_t)); + } + + void release_codes (size_t, const uint8_t *codes) const override { + free ((void*)codes); + } + + void release_ids (size_t, const idx_t *ids) const override { + free ((void*)ids); + } + + const uint8_t * get_single_code (size_t list_no, size_t offset) + const override { + return (uint8_t*)memdup (il->get_single_code(list_no, offset), + code_size); + } + + size_t add_entries(size_t, size_t, const idx_t*, const uint8_t*) override { + assert(!"not implemented"); + return 0; + } + + void update_entries(size_t, size_t, size_t, const idx_t*, const uint8_t*) + override { + assert(!"not implemented"); + } + + void resize(size_t, size_t) override { + assert(!"not implemented"); + } + + ~EncapsulateInvertedLists() override {} +}; + + + 
+int test_dealloc_invlists (const char *index_key) { + + std::unique_ptr index = make_trained_index(index_key); + IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get()); + + auto xb = make_data (nb * d); + index->add(nb, xb.data()); + + auto xq = make_data (nq * d); + + auto ref_res = search_index (index.get(), xq.data()); + + EncapsulateInvertedLists eil(index_ivf->invlists); + + index_ivf->own_invlists = false; + index_ivf->replace_invlists (&eil, false); + + // TEST: this could crash or leak mem + auto new_res = search_index (index.get(), xq.data()); + + // delete explicitly + delete eil.il; + + // just to make sure + EXPECT_EQ (ref_res, new_res); + return 0; +} + +} // anonymous namespace + + + +/************************************************************* + * Test entry points + *************************************************************/ + +TEST(TestIvlistDealloc, IVFFlat) { + test_dealloc_invlists ("IVF32,Flat"); +} + +TEST(TestIvlistDealloc, IVFSQ) { + test_dealloc_invlists ("IVF32,SQ8"); +} + +TEST(TestIvlistDealloc, IVFPQ) { + test_dealloc_invlists ("IVF32,PQ4np"); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_extra_distances.py b/core/src/index/thirdparty/faiss/tests/test_extra_distances.py new file mode 100644 index 0000000000..3d87669a2a --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_extra_distances.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 +# noqa E741 + +import numpy as np + +import faiss +import unittest + +from common import get_dataset_2 + +import scipy.spatial.distance + + +class TestExtraDistances(unittest.TestCase): + """ check wrt. 
the scipy implementation """ + + def make_example(self): + rs = np.random.RandomState(123) + x = rs.rand(5, 32).astype('float32') + y = rs.rand(3, 32).astype('float32') + return x, y + + def run_simple_dis_test(self, ref_func, metric_type): + xq, yb = self.make_example() + ref_dis = np.array([ + [ref_func(x, y) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, metric_type) + self.assertTrue(np.allclose(ref_dis, new_dis)) + + def test_L1(self): + self.run_simple_dis_test(scipy.spatial.distance.cityblock, + faiss.METRIC_L1) + + def test_Linf(self): + self.run_simple_dis_test(scipy.spatial.distance.chebyshev, + faiss.METRIC_Linf) + + def test_L2(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [scipy.spatial.distance.sqeuclidean(x, y) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_L2) + self.assertTrue(np.allclose(ref_dis, new_dis)) + + ref_dis = np.array([ + [scipy.spatial.distance.euclidean(x, y) for y in yb] + for x in xq + ]) + new_dis = np.sqrt(new_dis) # post processing + self.assertTrue(np.allclose(ref_dis, new_dis)) + + def test_Lp(self): + p = 1.5 + xq, yb = self.make_example() + ref_dis = np.array([ + [scipy.spatial.distance.minkowski(x, y, p) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Lp, p) + new_dis = new_dis ** (1 / p) # post processing + self.assertTrue(np.allclose(ref_dis, new_dis)) + + def test_canberra(self): + self.run_simple_dis_test(scipy.spatial.distance.canberra, + faiss.METRIC_Canberra) + + def test_braycurtis(self): + self.run_simple_dis_test(scipy.spatial.distance.braycurtis, + faiss.METRIC_BrayCurtis) + + def xx_test_jensenshannon(self): + # this distance does not seem to be implemented in scipy + # vectors should probably be L1 normalized + self.run_simple_dis_test(scipy.spatial.distance.jensenshannon, + faiss.METRIC_JensenShannon) + + +class TestKNN(unittest.TestCase): + """ test that the knn search gives the same as distance matrix + argmin """ + + def do_test_knn(self, mt): + d = 10 + nb = 100 + nq = 50 + nt = 0 + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + index = faiss.IndexFlat(d, mt) + index.add(xb) + + D, I = index.search(xq, 10) + + dis = faiss.pairwise_distances(xq, xb, mt) + o = dis.argsort(axis=1) + assert np.all(I == o[:, :10]) + + for q in range(nq): + assert np.all(D[q] == dis[q, I[q]]) + + def test_L1(self): + self.do_test_knn(faiss.METRIC_L1) + + def test_Linf(self): + self.do_test_knn(faiss.METRIC_Linf) + + +class TestHNSW(unittest.TestCase): + """ since it has a distance computer, HNSW should work """ + + def test_hnsw(self): + + d = 10 + nb = 1000 + nq = 100 + nt = 0 + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + mt = faiss.METRIC_L1 + + index = faiss.IndexHNSW(faiss.IndexFlat(d, mt)) + index.add(xb) + + D, I = index.search(xq, 10) + + dis = faiss.pairwise_distances(xq, xb, mt) + + for q in range(nq): + assert np.all(D[q] == dis[q, I[q]]) diff --git a/core/src/index/thirdparty/faiss/tests/test_factory.py b/core/src/index/thirdparty/faiss/tests/test_factory.py new file mode 100644 index 0000000000..968d52ceaa --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_factory.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +import unittest +import faiss + + +class TestFactory(unittest.TestCase): + + def test_factory_1(self): + + index = faiss.index_factory(12, "IVF10,PQ4") + assert index.do_polysemous_training + + index = faiss.index_factory(12, "IVF10,PQ4np") + assert not index.do_polysemous_training + + index = faiss.index_factory(12, "PQ4") + assert index.do_polysemous_training + + index = faiss.index_factory(12, "PQ4np") + assert not index.do_polysemous_training + + try: + index = faiss.index_factory(10, "PQ4") + except RuntimeError: + pass + else: + assert False, "should do a runtime error" + + def test_factory_2(self): + + index = faiss.index_factory(12, "SQ8") + assert index.code_size == 12 + + def test_factory_3(self): + + index = faiss.index_factory(12, "IVF10,PQ4") + faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3) + assert index.nprobe == 3 + + index = faiss.index_factory(12, "PCAR8,IVF10,PQ4") + faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3) + assert faiss.downcast_index(index.index).nprobe == 3 + + def test_factory_4(self): + index = faiss.index_factory(12, "IVF10,FlatDedup") + assert index.instances is not None diff --git a/core/src/index/thirdparty/faiss/tests/test_index.py b/core/src/index/thirdparty/faiss/tests/test_index.py new file mode 100644 index 0000000000..429ba1fb0d --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_index.py @@ -0,0 +1,574 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +"""this is a basic test script for simple indices work""" + +import numpy as np +import unittest +import faiss +import tempfile +import os +import re + + +from common import get_dataset, get_dataset_2 + +class TestModuleInterface(unittest.TestCase): + + def test_version_attribute(self): + assert hasattr(faiss, '__version__') + assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__) + + + +class EvalIVFPQAccuracy(unittest.TestCase): + + def test_IndexIVFPQ(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset_2(d, nt, nb, nq) + d = xt.shape[1] + + gt_index = faiss.IndexFlatL2(d) + gt_index.add(xb) + D, gt_nns = gt_index.search(xq, 1) + + coarse_quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.train(xt) + index.add(xb) + index.nprobe = 4 + D, nns = index.search(xq, 10) + n_ok = (nns == gt_nns).sum() + nq = xq.shape[0] + + self.assertGreater(n_ok, nq * 0.66) + + # check that and Index2Layer gives the same reconstruction + # this is a bit fragile: it assumes 2 runs of training give + # the exact same result. 
+ index2 = faiss.Index2Layer(coarse_quantizer, 32, 8) + if True: + index2.train(xt) + else: + index2.pq = index.pq + index2.is_trained = True + index2.add(xb) + ref_recons = index.reconstruct_n(0, nb) + new_recons = index2.reconstruct_n(0, nb) + self.assertTrue(np.all(ref_recons == new_recons)) + + + def test_IMI(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset_2(d, nt, nb, nq) + d = xt.shape[1] + + gt_index = faiss.IndexFlatL2(d) + gt_index.add(xb) + D, gt_nns = gt_index.search(xq, 1) + + nbits = 5 + coarse_quantizer = faiss.MultiIndexQuantizer(d, 2, nbits) + index = faiss.IndexIVFPQ(coarse_quantizer, d, (1 << nbits) ** 2, 8, 8) + index.quantizer_trains_alone = 1 + index.train(xt) + index.add(xb) + index.nprobe = 100 + D, nns = index.search(xq, 10) + n_ok = (nns == gt_nns).sum() + + # Should return 166 on mac, and 170 on linux. + self.assertGreater(n_ok, 165) + + ############# replace with explicit assignment indexes + nbits = 5 + pq = coarse_quantizer.pq + centroids = faiss.vector_to_array(pq.centroids) + centroids = centroids.reshape(pq.M, pq.ksub, pq.dsub) + ai0 = faiss.IndexFlatL2(pq.dsub) + ai0.add(centroids[0]) + ai1 = faiss.IndexFlatL2(pq.dsub) + ai1.add(centroids[1]) + + coarse_quantizer_2 = faiss.MultiIndexQuantizer2(d, nbits, ai0, ai1) + coarse_quantizer_2.pq = pq + coarse_quantizer_2.is_trained = True + + index.quantizer = coarse_quantizer_2 + + index.reset() + index.add(xb) + + D, nns = index.search(xq, 10) + n_ok = (nns == gt_nns).sum() + + # should return the same result + self.assertGreater(n_ok, 165) + + + def test_IMI_2(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset_2(d, nt, nb, nq) + d = xt.shape[1] + + gt_index = faiss.IndexFlatL2(d) + gt_index.add(xb) + D, gt_nns = gt_index.search(xq, 1) + + ############# redo including training + nbits = 5 + ai0 = faiss.IndexFlatL2(int(d / 2)) + ai1 = faiss.IndexFlatL2(int(d / 2)) + + coarse_quantizer = faiss.MultiIndexQuantizer2(d, nbits, ai0, ai1) + index = faiss.IndexIVFPQ(coarse_quantizer, d, (1 << nbits) ** 2, 8, 8) + index.quantizer_trains_alone = 1 + index.train(xt) + index.add(xb) + index.nprobe = 100 + D, nns = index.search(xq, 10) + n_ok = (nns == gt_nns).sum() + + # should return the same result + self.assertGreater(n_ok, 165) + + + + + +class TestMultiIndexQuantizer(unittest.TestCase): + + def test_search_k1(self): + + # verify codepath for k = 1 and k > 1 + + d = 64 + nb = 0 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + miq = faiss.MultiIndexQuantizer(d, 2, 6) + + miq.train(xt) + + D1, I1 = miq.search(xq, 1) + + D5, I5 = miq.search(xq, 5) + + self.assertEqual(np.abs(I1[:, :1] - I5[:, :1]).max(), 0) + self.assertEqual(np.abs(D1[:, :1] - D5[:, :1]).max(), 0) + + +class TestScalarQuantizer(unittest.TestCase): + + def test_4variants_ivf(self): + d = 32 + nt = 2500 + nq = 400 + nb = 5000 + + (xt, xb, xq) = get_dataset_2(d, nt, nb, nq) + + # common quantizer + quantizer = faiss.IndexFlatL2(d) + + ncent = 64 + + index_gt = faiss.IndexFlatL2(d) + index_gt.add(xb) + D, I_ref = index_gt.search(xq, 10) + + nok = {} + + index = faiss.IndexIVFFlat(quantizer, d, ncent, + faiss.METRIC_L2) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum() + + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + qtype = getattr(faiss.ScalarQuantizer, qname) + index = 
faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, + qtype, faiss.METRIC_L2) + + index.nprobe = 4 + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + + nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() + print(nok, nq) + + self.assertGreaterEqual(nok['flat'], nq * 0.6) + # The tests below are a bit fragile, it happens that the + # ordering between uniform and non-uniform are reverted, + # probably because the dataset is small, which introduces + # jitter + self.assertGreaterEqual(nok['flat'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) + self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) + self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) + self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + + def test_4variants(self): + d = 32 + nt = 2500 + nq = 400 + nb = 5000 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + index_gt = faiss.IndexFlatL2(d) + index_gt.add(xb) + D_ref, I_ref = index_gt.search(xq, 10) + + nok = {} + + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + qtype = getattr(faiss.ScalarQuantizer, qname) + index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() + + print(nok, nq) + + self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9) + self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) + self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) + self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) + self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + + +class TestRangeSearch(unittest.TestCase): + + def test_range_search(self): + d = 4 + nt = 100 + nq = 10 + nb = 50 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + + Dref, Iref = index.search(xq, 5) + + thresh = 0.1 # *squared* distance + lims, D, I = index.range_search(xq, thresh) + + for i in range(nq): + Iline = I[lims[i]:lims[i + 1]] + Dline = D[lims[i]:lims[i + 1]] + for j, dis in zip(Iref[i], Dref[i]): + if dis < thresh: + li, = np.where(Iline == j) + self.assertTrue(li.size == 1) + idx = li[0] + self.assertGreaterEqual(1e-4, abs(Dline[idx] - dis)) + + +class TestSearchAndReconstruct(unittest.TestCase): + + def run_search_and_reconstruct(self, index, xb, xq, k=10, eps=None): + n, d = xb.shape + assert xq.shape[1] == d + assert index.d == d + + D_ref, I_ref = index.search(xq, k) + R_ref = index.reconstruct_n(0, n) + D, I, R = index.search_and_reconstruct(xq, k) + + self.assertTrue((D == D_ref).all()) + self.assertTrue((I == I_ref).all()) + self.assertEqual(R.shape[:2], I.shape) + self.assertEqual(R.shape[2], d) + + # (n, k, ..) -> (n * k, ..) 
+ I_flat = I.reshape(-1) + R_flat = R.reshape(-1, d) + # Filter out -1s when not enough results + R_flat = R_flat[I_flat >= 0] + I_flat = I_flat[I_flat >= 0] + + recons_ref_err = np.mean(np.linalg.norm(R_flat - R_ref[I_flat])) + self.assertLessEqual(recons_ref_err, 1e-6) + + def norm1(x): + return np.sqrt((x ** 2).sum(axis=1)) + + recons_err = np.mean(norm1(R_flat - xb[I_flat])) + + print('Reconstruction error = %.3f' % recons_err) + if eps is not None: + self.assertLessEqual(recons_err, eps) + + return D, I, R + + def test_IndexFlat(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + + self.run_search_and_reconstruct(index, xb, xq, eps=0.0) + + def test_IndexIVFFlat(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFFlat(quantizer, d, 32, faiss.METRIC_L2) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(xt) + index.add(xb) + + self.run_search_and_reconstruct(index, xb, xq, eps=0.0) + + def test_IndexIVFPQ(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(quantizer, d, 32, 8, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(xt) + index.add(xb) + + self.run_search_and_reconstruct(index, xb, xq, eps=1.0) + + def test_MultiIndex(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + index = faiss.index_factory(d, "IMI2x5,PQ8np") + faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4) + index.train(xt) + index.add(xb) + + self.run_search_and_reconstruct(index, xb, xq, eps=1.0) + + def test_IndexTransform(self): + d = 32 + nb = 1000 + nt = 1500 + nq = 200 + + (xt, xb, xq) = get_dataset(d, nb, nt, nq) + + index = faiss.index_factory(d, "L2norm,PCA8,IVF32,PQ8np") + faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4) + index.train(xt) + index.add(xb) + + self.run_search_and_reconstruct(index, xb, xq) + + +class TestHNSW(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 0 + nb = 1500 + nq = 500 + + (_, self.xb, self.xq) = get_dataset_2(d, nt, nb, nq) + index = faiss.IndexFlatL2(d) + index.add(self.xb) + Dref, Iref = index.search(self.xq, 1) + self.Iref = Iref + + def test_hnsw(self): + d = self.xq.shape[1] + + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + Dhnsw, Ihnsw = index.search(self.xq, 1) + + self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460) + + self.io_and_retest(index, Dhnsw, Ihnsw) + + def test_hnsw_unbounded_queue(self): + d = self.xq.shape[1] + + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + index.search_bounded_queue = False + Dhnsw, Ihnsw = index.search(self.xq, 1) + + self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460) + + self.io_and_retest(index, Dhnsw, Ihnsw) + + def io_and_retest(self, index, Dhnsw, Ihnsw): + _, tmpfile = tempfile.mkstemp() + try: + faiss.write_index(index, tmpfile) + index2 = faiss.read_index(tmpfile) + finally: + if os.path.exists(tmpfile): + os.unlink(tmpfile) + + Dhnsw2, Ihnsw2 = index2.search(self.xq, 1) + + self.assertTrue(np.all(Dhnsw2 == Dhnsw)) + self.assertTrue(np.all(Ihnsw2 == Ihnsw)) + + # also test clone + index3 = faiss.clone_index(index) + Dhnsw3, Ihnsw3 = index3.search(self.xq, 1) + 
+ self.assertTrue(np.all(Dhnsw3 == Dhnsw)) + self.assertTrue(np.all(Ihnsw3 == Ihnsw)) + + + def test_hnsw_2level(self): + d = self.xq.shape[1] + + quant = faiss.IndexFlatL2(d) + + index = faiss.IndexHNSW2Level(quant, 256, 8, 8) + index.train(self.xb) + index.add(self.xb) + Dhnsw, Ihnsw = index.search(self.xq, 1) + + self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 310) + + self.io_and_retest(index, Dhnsw, Ihnsw) + + def test_add_0_vecs(self): + index = faiss.IndexHNSWFlat(10, 16) + zero_vecs = np.zeros((0, 10), dtype='float32') + # infinite loop + index.add(zero_vecs) + + +class TestIOError(unittest.TestCase): + + def test_io_error(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + # should be fine + faiss.read_index(fname) + + # now damage file + data = open(fname, 'rb').read() + data = data[:int(len(data) / 2)] + open(fname, 'wb').write(data) + + # should make a nice readable exception that mentions the + try: + faiss.read_index(fname) + except RuntimeError as e: + if fname not in str(e): + raise + else: + raise + + finally: + if os.path.exists(fname): + os.unlink(fname) + + +class TestDistancesPositive(unittest.TestCase): + + def test_l2_pos(self): + """ + roundoff errors occur only with the L2 decomposition used + with BLAS, ie. in IndexFlatL2 and with + n > distance_compute_blas_threshold = 20 + """ + + d = 128 + n = 100 + + rs = np.random.RandomState(1234) + x = rs.rand(n, d).astype('float32') + + index = faiss.IndexFlatL2(d) + index.add(x) + + D, I = index.search(x, 10) + + assert np.all(D >= 0) + + +class TestReconsException(unittest.TestCase): + + def test_recons(self): + + d = 64 # dimension + nb = 1000 + rs = np.random.RandomState(1234) + xb = rs.rand(nb, d).astype('float32') + nlist = 10 + quantizer = faiss.IndexFlatL2(d) # the other index + index = faiss.IndexIVFFlat(quantizer, d, nlist) + index.train(xb) + index.add(xb) + index.make_direct_map() + + index.reconstruct(9) + + try: + index.reconstruct(100001) + except RuntimeError: + pass + else: + assert False, "should raise an exception" + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py b/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py new file mode 100644 index 0000000000..41244da326 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py @@ -0,0 +1,643 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 +# noqa E741 +# translation of test_knn.lua + +import numpy as np +import unittest +import faiss + +from common import Randu10k, get_dataset_2, Randu10kUnbalanced + +ev = Randu10k() + +d = ev.d + +# Parameters inverted indexes +ncentroids = int(4 * np.sqrt(ev.nb)) +kprobe = int(np.sqrt(ncentroids)) + +# Parameters for LSH +nbits = d + +# Parameters for indexes involving PQ +M = int(d / 8) # for PQ: #subquantizers +nbits_per_index = 8 # for PQ + + +class IndexAccuracy(unittest.TestCase): + + def test_IndexFlatIP(self): + q = faiss.IndexFlatIP(d) # Ask inner product + res = ev.launch('FLAT / IP', q) + e = ev.evalres(res) + assert e[1] == 1.0 + + def test_IndexFlatL2(self): + q = faiss.IndexFlatL2(d) + res = ev.launch('FLAT / L2', q) + e = ev.evalres(res) + assert e[1] == 1.0 + + def test_ivf_kmeans(self): + ivfk = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, ncentroids) + ivfk.nprobe = kprobe + res = ev.launch('IndexIVFFlat', ivfk) + e = ev.evalres(res) + # should give 0.260 0.260 0.260 + assert e[1] > 0.2 + + # test parallel mode + Dref, Iref = ivfk.search(ev.xq, 100) + ivfk.parallel_mode = 1 + Dnew, Inew = ivfk.search(ev.xq, 100) + print((Iref != Inew).sum(), Iref.size) + assert (Iref != Inew).sum() < Iref.size / 5000.0 + assert np.all(Dref == Dnew) + + def test_indexLSH(self): + q = faiss.IndexLSH(d, nbits) + res = ev.launch('FLAT / LSH Cosine', q) + e = ev.evalres(res) + # should give 0.070 0.250 0.580 + assert e[10] > 0.2 + + def test_IndexLSH_32_48(self): + # CHECK: the difference between 32 and 48 does not make much sense + for nbits2 in 32, 48: + q = faiss.IndexLSH(d, nbits2) + res = ev.launch('LSH half size', q) + e = ev.evalres(res) + # should give 0.003 0.019 0.108 + assert e[10] > 0.018 + + def test_IndexPQ(self): + q = faiss.IndexPQ(d, M, nbits_per_index) + res = ev.launch('FLAT / PQ L2', q) + e = ev.evalres(res) + # should give 0.070 0.230 0.260 + assert e[10] > 0.2 + + # Approximate search module: PQ with inner product distance + def test_IndexPQ_ip(self): + q = faiss.IndexPQ(d, M, nbits_per_index, faiss.METRIC_INNER_PRODUCT) + res = ev.launch('FLAT / PQ IP', q) + e = ev.evalres(res) + # should give 0.070 0.230 0.260 + #(same result as regular PQ on normalized distances) + assert e[10] > 0.2 + + def test_IndexIVFPQ(self): + ivfpq = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, ncentroids, M, 8) + ivfpq.nprobe = kprobe + res = ev.launch('IVF PQ', ivfpq) + e = ev.evalres(res) + # should give 0.070 0.230 0.260 + assert e[10] > 0.2 + + # TODO: translate evaluation of nested + + # Approximate search: PQ with full vector refinement + def test_IndexPQ_refined(self): + q = faiss.IndexPQ(d, M, nbits_per_index) + res = ev.launch('PQ non-refined', q) + e = ev.evalres(res) + q.reset() + + rq = faiss.IndexRefineFlat(q) + res = ev.launch('PQ refined', rq) + e2 = ev.evalres(res) + assert e2[10] >= e[10] + rq.k_factor = 4 + + res = ev.launch('PQ refined*4', rq) + e3 = ev.evalres(res) + assert e3[10] >= e2[10] + + def test_polysemous(self): + index = faiss.IndexPQ(d, M, nbits_per_index) + index.do_polysemous_training = True + # reduce nb iterations to speed up training for the test + index.polysemous_training.n_iter = 50000 + index.polysemous_training.n_redo = 1 + res = ev.launch('normal PQ', index) + e_baseline = ev.evalres(res) + index.search_type = faiss.IndexPQ.ST_polysemous + + index.polysemous_ht = int(M / 16. 
* 58) + + stats = faiss.cvar.indexPQ_stats + stats.reset() + + res = ev.launch('Polysemous ht=%d' % index.polysemous_ht, + index) + e_polysemous = ev.evalres(res) + print(e_baseline, e_polysemous, index.polysemous_ht) + print(stats.n_hamming_pass, stats.ncode) + # The randu dataset is difficult, so we are not too picky on + # the results. Here we assert that we have < 10 % loss when + # computing full PQ on fewer than 20% of the data. + assert stats.n_hamming_pass < stats.ncode / 5 + # Test disabled because difference is 0.17 on aarch64 + # TODO check why??? + # assert e_polysemous[10] > e_baseline[10] - 0.1 + + def test_ScalarQuantizer(self): + quantizer = faiss.IndexFlatL2(d) + ivfpq = faiss.IndexIVFScalarQuantizer( + quantizer, d, ncentroids, + faiss.ScalarQuantizer.QT_8bit) + ivfpq.nprobe = kprobe + res = ev.launch('IVF SQ', ivfpq) + e = ev.evalres(res) + # should give 0.234 0.236 0.236 + assert e[10] > 0.235 + + + +class TestSQFlavors(unittest.TestCase): + """ tests IP in addition to L2, non multiple of 8 dimensions + """ + + def add2columns(self, x): + return np.hstack(( + x, np.zeros((x.shape[0], 2), dtype='float32') + )) + + def subtest_add2col(self, xb, xq, index, qname): + """Test with 2 additional dimensions to take also the non-SIMD + codepath. We don't retrain anything but add 2 dims to the + queries, the centroids and the trained ScalarQuantizer. + """ + nb, d = xb.shape + + d2 = d + 2 + xb2 = self.add2columns(xb) + xq2 = self.add2columns(xq) + + nlist = index.nlist + quantizer = faiss.downcast_index(index.quantizer) + quantizer2 = faiss.IndexFlat(d2, index.metric_type) + centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d) + centroids2 = self.add2columns(centroids) + quantizer2.add(centroids2) + index2 = faiss.IndexIVFScalarQuantizer( + quantizer2, d2, index.nlist, index.sq.qtype, + index.metric_type) + index2.nprobe = 4 + if qname in ('8bit', '4bit'): + trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1) + nt = trained.shape[1] + # 2 lines: vmins and vdiffs + new_nt = int(nt * d2 / d) + trained2 = np.hstack(( + trained, + np.zeros((2, new_nt - nt), dtype='float32') + )) + trained2[1, nt:] = 1.0 # set vdiff to 1 to avoid div by 0 + faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained) + else: + index2.sq.trained = index.sq.trained + + index2.is_trained = True + index2.add(xb2) + return index2.search(xq2, 10) + + + # run on Sept 18, 2018 with nprobe=4 + 4 bit bugfix + ref_results = { + (0, '8bit'): 984, + (0, '4bit'): 978, + (0, '8bit_uniform'): 985, + (0, '4bit_uniform'): 979, + (0, 'fp16'): 985, + (1, '8bit'): 979, + (1, '4bit'): 973, + (1, '8bit_uniform'): 979, + (1, '4bit_uniform'): 972, + (1, 'fp16'): 979, + # added 2019-06-26 + (0, '6bit'): 985, + (1, '6bit'): 987, + } + + def subtest(self, mt): + d = 32 + xt, xb, xq = get_dataset_2(d, 2000, 1000, 200) + nlist = 64 + + gt_index = faiss.IndexFlat(d, mt) + gt_index.add(xb) + gt_D, gt_I = gt_index.search(xq, 10) + quantizer = faiss.IndexFlat(d, mt) + for qname in '8bit 4bit 8bit_uniform 4bit_uniform fp16 6bit'.split(): + qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname) + index = faiss.IndexIVFScalarQuantizer( + quantizer, d, nlist, qtype, mt) + index.train(xt) + index.add(xb) + index.nprobe = 4 # hopefully more robust than 1 + D, I = index.search(xq, 10) + ninter = faiss.eval_intersection(I, gt_I) + print('(%d, %s): %d, ' % (mt, repr(qname), ninter)) + assert abs(ninter - self.ref_results[(mt, qname)]) <= 10 + + if qname == '6bit': + # the test below fails triggers ASAN. 
TODO check what's wrong + continue + + D2, I2 = self.subtest_add2col(xb, xq, index, qname) + assert np.all(I2 == I) + + # also test range search + + if mt == faiss.METRIC_INNER_PRODUCT: + radius = float(D[:, -1].max()) + else: + radius = float(D[:, -1].min()) + print('radius', radius) + + lims, D3, I3 = index.range_search(xq, radius) + ntot = ndiff = 0 + for i in range(len(xq)): + l0, l1 = lims[i], lims[i + 1] + Inew = set(I3[l0:l1]) + if mt == faiss.METRIC_INNER_PRODUCT: + mask = D2[i] > radius + else: + mask = D2[i] < radius + Iref = set(I2[i, mask]) + ndiff += len(Inew ^ Iref) + ntot += len(Iref) + print('ndiff %d / %d' % (ndiff, ntot)) + assert ndiff < ntot * 0.01 + + for pm in 1, 2: + print('parallel_mode=%d' % pm) + index.parallel_mode = pm + lims4, D4, I4 = index.range_search(xq, radius) + print('sizes', lims4[1:] - lims4[:-1]) + for qno in range(len(lims) - 1): + Iref = I3[lims[qno]: lims[qno+1]] + Inew = I4[lims4[qno]: lims4[qno+1]] + assert set(Iref) == set(Inew), "q %d ref %s new %s" % ( + qno, Iref, Inew) + + def test_SQ_IP(self): + self.subtest(faiss.METRIC_INNER_PRODUCT) + + def test_SQ_L2(self): + self.subtest(faiss.METRIC_L2) + + +class TestSQByte(unittest.TestCase): + + def subtest_8bit_direct(self, metric_type, d): + xt, xb, xq = get_dataset_2(d, 500, 1000, 30) + + # rescale everything to get integer + tmin, tmax = xt.min(), xt.max() + + def rescale(x): + x = np.floor((x - tmin) * 256 / (tmax - tmin)) + x[x < 0] = 0 + x[x > 255] = 255 + return x + + xt = rescale(xt) + xb = rescale(xb) + xq = rescale(xq) + + gt_index = faiss.IndexFlat(d, metric_type) + gt_index.add(xb) + Dref, Iref = gt_index.search(xq, 10) + + index = faiss.IndexScalarQuantizer( + d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type) + index.add(xb) + D, I = index.search(xq, 10) + + assert np.all(I == Iref) + assert np.all(D == Dref) + + # same, with IVF + + nlist = 64 + quantizer = faiss.IndexFlat(d, metric_type) + + gt_index = faiss.IndexIVFFlat(quantizer, d, nlist, metric_type) + gt_index.nprobe = 4 + gt_index.train(xt) + gt_index.add(xb) + Dref, Iref = gt_index.search(xq, 10) + + index = faiss.IndexIVFScalarQuantizer( + quantizer, d, nlist, + faiss.ScalarQuantizer.QT_8bit_direct, metric_type) + index.nprobe = 4 + index.by_residual = False + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + + assert np.all(I == Iref) + assert np.all(D == Dref) + + def test_8bit_direct(self): + for d in 13, 16, 24: + for metric_type in faiss.METRIC_L2, faiss.METRIC_INNER_PRODUCT: + self.subtest_8bit_direct(metric_type, d) + + + +class TestPQFlavors(unittest.TestCase): + + # run on Dec 14, 2018 + ref_results = { + (1, True): 800, + (1, True, 20): 794, + (1, False): 769, + (0, True): 831, + (0, True, 20): 828, + (0, False): 829, + } + + def test_IVFPQ_IP(self): + self.subtest(faiss.METRIC_INNER_PRODUCT) + + def test_IVFPQ_L2(self): + self.subtest(faiss.METRIC_L2) + + def subtest(self, mt): + d = 32 + xt, xb, xq = get_dataset_2(d, 2000, 1000, 200) + nlist = 64 + + gt_index = faiss.IndexFlat(d, mt) + gt_index.add(xb) + gt_D, gt_I = gt_index.search(xq, 10) + quantizer = faiss.IndexFlat(d, mt) + for by_residual in True, False: + + index = faiss.IndexIVFPQ( + quantizer, d, nlist, 4, 8) + index.metric_type = mt + index.by_residual = by_residual + if by_residual: + # perform cheap polysemous training + index.do_polysemous_training = True + pt = faiss.PolysemousTraining() + pt.n_iter = 50000 + pt.n_redo = 1 + index.polysemous_training = pt + + index.train(xt) + index.add(xb) + index.nprobe = 4 + D, I = 
index.search(xq, 10) + + ninter = faiss.eval_intersection(I, gt_I) + print('(%d, %s): %d, ' % (mt, by_residual, ninter)) + + assert abs(ninter - self.ref_results[mt, by_residual]) <= 3 + + index.use_precomputed_table = 0 + D2, I2 = index.search(xq, 10) + assert np.all(I == I2) + + if by_residual: + + index.use_precomputed_table = 1 + index.polysemous_ht = 20 + D, I = index.search(xq, 10) + ninter = faiss.eval_intersection(I, gt_I) + print('(%d, %s, %d): %d, ' % ( + mt, by_residual, index.polysemous_ht, ninter)) + + # polysemous behaves bizarrely on ARM + assert (ninter >= self.ref_results[ + mt, by_residual, index.polysemous_ht] - 4) + + # also test range search + + if mt == faiss.METRIC_INNER_PRODUCT: + radius = float(D[:, -1].max()) + else: + radius = float(D[:, -1].min()) + print('radius', radius) + + lims, D3, I3 = index.range_search(xq, radius) + ntot = ndiff = 0 + for i in range(len(xq)): + l0, l1 = lims[i], lims[i + 1] + Inew = set(I3[l0:l1]) + if mt == faiss.METRIC_INNER_PRODUCT: + mask = D2[i] > radius + else: + mask = D2[i] < radius + Iref = set(I2[i, mask]) + ndiff += len(Inew ^ Iref) + ntot += len(Iref) + print('ndiff %d / %d' % (ndiff, ntot)) + assert ndiff < ntot * 0.02 + + + + +class TestFlat1D(unittest.TestCase): + + def test_flat_1d(self): + rs = np.random.RandomState(123545) + k = 10 + xb = rs.uniform(size=(100, 1)).astype('float32') + # make sure to test below and above + xq = rs.uniform(size=(1000, 1)).astype('float32') * 1.1 - 0.05 + + ref = faiss.IndexFlatL2(1) + ref.add(xb) + ref_D, ref_I = ref.search(xq, k) + + new = faiss.IndexFlat1D() + new.add(xb) + + new_D, new_I = new.search(xq, 10) + + ndiff = (np.abs(ref_I - new_I) != 0).sum() + + assert(ndiff < 100) + new_D = new_D ** 2 + max_diff_D = np.abs(ref_D - new_D).max() + assert(max_diff_D < 1e-5) + + +class OPQRelativeAccuracy(unittest.TestCase): + # translated from test_opq.lua + + def test_OPQ(self): + + M = 4 + + ev = Randu10kUnbalanced() + d = ev.d + index = faiss.IndexPQ(d, M, 8) + + res = ev.launch('PQ', index) + e_pq = ev.evalres(res) + + index_pq = faiss.IndexPQ(d, M, 8) + opq_matrix = faiss.OPQMatrix(d, M) + # opq_matrix.verbose = true + opq_matrix.niter = 10 + opq_matrix.niter_pq = 4 + index = faiss.IndexPreTransform(opq_matrix, index_pq) + + res = ev.launch('OPQ', index) + e_opq = ev.evalres(res) + + print('e_pq=%s' % e_pq) + print('e_opq=%s' % e_opq) + + # verify that OPQ better than PQ + for r in 1, 10, 100: + assert(e_opq[r] > e_pq[r]) + + def test_OIVFPQ(self): + # Parameters inverted indexes + ncentroids = 50 + M = 4 + + ev = Randu10kUnbalanced() + d = ev.d + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(quantizer, d, ncentroids, M, 8) + index.nprobe = 5 + + res = ev.launch('IVFPQ', index) + e_ivfpq = ev.evalres(res) + + quantizer = faiss.IndexFlatL2(d) + index_ivfpq = faiss.IndexIVFPQ(quantizer, d, ncentroids, M, 8) + index_ivfpq.nprobe = 5 + opq_matrix = faiss.OPQMatrix(d, M) + opq_matrix.niter = 10 + index = faiss.IndexPreTransform(opq_matrix, index_ivfpq) + + res = ev.launch('O+IVFPQ', index) + e_oivfpq = ev.evalres(res) + + # verify same on OIVFPQ + for r in 1, 10, 100: + print(e_oivfpq[r], e_ivfpq[r]) + assert(e_oivfpq[r] >= e_ivfpq[r]) + + +class TestRoundoff(unittest.TestCase): + + def test_roundoff(self): + # params that force use of BLAS implementation + nb = 100 + nq = 25 + d = 4 + xb = np.zeros((nb, d), dtype='float32') + + xb[:, 0] = np.arange(nb) + 12345 + xq = xb[:nq] + 0.3 + + index = faiss.IndexFlat(d) + index.add(xb) + + D, I = index.search(xq, 1) + + # this does not 
work + assert not np.all(I.ravel() == np.arange(nq)) + + index = faiss.IndexPreTransform( + faiss.CenteringTransform(d), + faiss.IndexFlat(d)) + + index.train(xb) + index.add(xb) + + D, I = index.search(xq, 1) + + # this works + assert np.all(I.ravel() == np.arange(nq)) + + +class TestSpectralHash(unittest.TestCase): + + # run on 2019-04-02 + ref_results = { + (32, 'global', 10): 505, + (32, 'centroid', 10): 524, + (32, 'centroid_half', 10): 21, + (32, 'median', 10): 510, + (32, 'global', 1): 8, + (32, 'centroid', 1): 20, + (32, 'centroid_half', 1): 26, + (32, 'median', 1): 14, + (64, 'global', 10): 768, + (64, 'centroid', 10): 767, + (64, 'centroid_half', 10): 21, + (64, 'median', 10): 765, + (64, 'global', 1): 28, + (64, 'centroid', 1): 21, + (64, 'centroid_half', 1): 20, + (64, 'median', 1): 29, + (128, 'global', 10): 968, + (128, 'centroid', 10): 945, + (128, 'centroid_half', 10): 21, + (128, 'median', 10): 958, + (128, 'global', 1): 271, + (128, 'centroid', 1): 279, + (128, 'centroid_half', 1): 171, + (128, 'median', 1): 253, + } + + def test_sh(self): + d = 32 + xt, xb, xq = get_dataset_2(d, 2000, 1000, 200) + nlist, nprobe = 1, 1 + + gt_index = faiss.IndexFlatL2(d) + gt_index.add(xb) + gt_D, gt_I = gt_index.search(xq, 10) + + for nbit in 32, 64, 128: + quantizer = faiss.IndexFlatL2(d) + + index_lsh = faiss.IndexLSH(d, nbit, True) + index_lsh.add(xb) + D, I = index_lsh.search(xq, 10) + ninter = faiss.eval_intersection(I, gt_I) + + print('LSH baseline: %d' % ninter) + + for period in 10.0, 1.0: + + for tt in 'global centroid centroid_half median'.split(): + index = faiss.IndexIVFSpectralHash(quantizer, d, nlist, + nbit, period) + index.nprobe = nprobe + index.threshold_type = getattr( + faiss.IndexIVFSpectralHash, + 'Thresh_' + tt + ) + + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + + ninter = faiss.eval_intersection(I, gt_I) + key = (nbit, tt, period) + + print('(%d, %s, %g): %d, ' % (nbit, repr(tt), period, ninter)) + assert abs(ninter - self.ref_results[key]) <= 4 + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_index_binary.py b/core/src/index/thirdparty/faiss/tests/test_index_binary.py new file mode 100644 index 0000000000..046e2bb3e9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_index_binary.py @@ -0,0 +1,324 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +"""this is a basic test script for simple indices work""" + +import numpy as np +import unittest +import faiss + + +def make_binary_dataset(d, nt, nb, nq): + assert d % 8 == 0 + rs = np.random.RandomState(123) + x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') + return x[:nt], x[nt:-nq], x[-nq:] + + +def binary_to_float(x): + n, d = x.shape + x8 = x.reshape(n * d, -1) + c8 = 2 * ((x8 >> np.arange(8)) & 1).astype('int8') - 1 + return c8.astype('float32').reshape(n, d * 8) + + +def binary_dis(x, y): + return sum(faiss.popcount64(int(xi ^ yi)) for xi, yi in zip(x, y)) + + +class TestBinaryPQ(unittest.TestCase): + """ Use a PQ that mimicks a binary encoder """ + + def test_encode_to_binary(self): + d = 256 + nt = 256 + nb = 1500 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nt, nb, nq) + pq = faiss.ProductQuantizer(d, int(d / 8), 8) + + centroids = binary_to_float( + np.tile(np.arange(256), int(d / 8)).astype('uint8').reshape(-1, 1)) + + faiss.copy_array_to_vector(centroids.ravel(), pq.centroids) + pq.is_trained = True + + codes = pq.compute_codes(binary_to_float(xb)) + + assert np.all(codes == xb) + + indexpq = faiss.IndexPQ(d, int(d / 8), 8) + indexpq.pq = pq + indexpq.is_trained = True + + indexpq.add(binary_to_float(xb)) + D, I = indexpq.search(binary_to_float(xq), 3) + + for i in range(nq): + for j, dj in zip(I[i], D[i]): + ref_dis = binary_dis(xq[i], xb[j]) + assert 4 * ref_dis == dj + + nlist = 32 + quantizer = faiss.IndexFlatL2(d) + # pretext class for training + iflat = faiss.IndexIVFFlat(quantizer, d, nlist) + iflat.train(binary_to_float(xt)) + + indexivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, int(d / 8), 8) + + indexivfpq.pq = pq + indexivfpq.is_trained = True + indexivfpq.by_residual = False + + indexivfpq.add(binary_to_float(xb)) + indexivfpq.nprobe = 4 + + D, I = indexivfpq.search(binary_to_float(xq), 3) + + for i in range(nq): + for j, dj in zip(I[i], D[i]): + ref_dis = binary_dis(xq[i], xb[j]) + assert 4 * ref_dis == dj + + +class TestBinaryFlat(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 0 + nb = 1500 + nq = 500 + + (_, self.xb, self.xq) = make_binary_dataset(d, nt, nb, nq) + + def test_flat(self): + d = self.xq.shape[1] * 8 + nq = self.xq.shape[0] + + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + D, I = index.search(self.xq, 3) + + for i in range(nq): + for j, dj in zip(I[i], D[i]): + ref_dis = binary_dis(self.xq[i], self.xb[j]) + assert dj == ref_dis + + # test reconstruction + assert np.all(index.reconstruct(12) == self.xb[12]) + + def test_empty_flat(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryFlat(d) + + for use_heap in [True, False]: + index.use_heap = use_heap + Dflat, Iflat = index.search(self.xq, 10) + + assert(np.all(Iflat == -1)) + assert(np.all(Dflat == 2147483647)) # NOTE(hoss): int32_t max + + +class TestBinaryIVF(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 200 + nb = 1500 + nq = 500 + + (self.xt, self.xb, self.xq) = make_binary_dataset(d, nt, nb, nq) + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + Dref, Iref = index.search(self.xq, 10) + self.Dref = Dref + + def test_ivf_flat_exhaustive(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 8 + index.train(self.xt) + 
index.add(self.xb) + Divfflat, _ = index.search(self.xq, 10) + + np.testing.assert_array_equal(self.Dref, Divfflat) + + def test_ivf_flat2(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + index.add(self.xb) + Divfflat, _ = index.search(self.xq, 10) + + self.assertEqual((self.Dref == Divfflat).sum(), 4122) + + def test_ivf_flat_empty(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryIVF(faiss.IndexBinaryFlat(d), d, 8) + index.train(self.xt) + + for use_heap in [True, False]: + index.use_heap = use_heap + Divfflat, Iivfflat = index.search(self.xq, 10) + + assert(np.all(Iivfflat == -1)) + assert(np.all(Divfflat == 2147483647)) # NOTE(hoss): int32_t max + +class TestHNSW(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 0 + nb = 1500 + nq = 500 + + (_, self.xb, self.xq) = make_binary_dataset(d, nt, nb, nq) + + def test_hnsw_exact_distances(self): + d = self.xq.shape[1] * 8 + nq = self.xq.shape[0] + + index = faiss.IndexBinaryHNSW(d, 16) + index.add(self.xb) + Dists, Ids = index.search(self.xq, 3) + + for i in range(nq): + for j, dj in zip(Ids[i], Dists[i]): + ref_dis = binary_dis(self.xq[i], self.xb[j]) + self.assertEqual(dj, ref_dis) + + def test_hnsw(self): + d = self.xq.shape[1] * 8 + + # NOTE(hoss): Ensure the HNSW construction is deterministic. + nthreads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + index_hnsw_float = faiss.IndexHNSWFlat(d, 16) + index_hnsw_ref = faiss.IndexBinaryFromFloat(index_hnsw_float) + + index_hnsw_bin = faiss.IndexBinaryHNSW(d, 16) + + index_hnsw_ref.add(self.xb) + index_hnsw_bin.add(self.xb) + + faiss.omp_set_num_threads(nthreads) + + Dref, Iref = index_hnsw_ref.search(self.xq, 3) + Dbin, Ibin = index_hnsw_bin.search(self.xq, 3) + + self.assertTrue((Dref == Dbin).all()) + + +def compare_binary_result_lists(D1, I1, D2, I2): + """comparing result lists is difficult because there are many + ties. Here we sort by (distance, index) pairs and ignore the largest + distance of each result. 
Compatible result lists should pass this.""" + assert D1.shape == I1.shape == D2.shape == I2.shape + n, k = D1.shape + ndiff = (D1 != D2).sum() + assert ndiff == 0, '%d differences in distance matrix %s' % ( + ndiff, D1.shape) + + def normalize_DI(D, I): + norm = I.max() + 1.0 + Dr = D.astype('float64') + I / norm + # ignore -1s and elements on last column + Dr[I1 == -1] = 1e20 + Dr[D == D[:, -1:]] = 1e20 + Dr.sort(axis=1) + return Dr + ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum() + assert ndiff == 0, '%d differences in normalized D matrix' % ndiff + + +class TestReplicasAndShards(unittest.TestCase): + + def test_replicas(self): + d = 32 + nq = 100 + nb = 200 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + Dref, Iref = index_ref.search(xq, 10) + + nrep = 5 + index = faiss.IndexBinaryReplicas() + for i in range(nrep): + sub_idx = faiss.IndexBinaryFlat(d) + sub_idx.add(xb) + index.addIndex(sub_idx) + + D, I = index.search(xq, 10) + + self.assertTrue((Dref == D).all()) + self.assertTrue((Iref == I).all()) + + index2 = faiss.IndexBinaryReplicas() + for i in range(nrep): + sub_idx = faiss.IndexBinaryFlat(d) + index2.addIndex(sub_idx) + + index2.add(xb) + D2, I2 = index2.search(xq, 10) + + self.assertTrue((Dref == D2).all()) + self.assertTrue((Iref == I2).all()) + + def test_shards(self): + d = 32 + nq = 100 + nb = 200 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + Dref, Iref = index_ref.search(xq, 10) + + nrep = 5 + index = faiss.IndexBinaryShards(d) + for i in range(nrep): + sub_idx = faiss.IndexBinaryFlat(d) + sub_idx.add(xb[i * nb // nrep : (i + 1) * nb // nrep]) + index.add_shard(sub_idx) + + D, I = index.search(xq, 10) + + compare_binary_result_lists(Dref, Iref, D, I) + + index2 = faiss.IndexBinaryShards(d) + for i in range(nrep): + sub_idx = faiss.IndexBinaryFlat(d) + index2.add_shard(sub_idx) + + index2.add(xb) + D2, I2 = index2.search(xq, 10) + + compare_binary_result_lists(Dref, Iref, D2, I2) + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py b/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py new file mode 100644 index 0000000000..1293381b17 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py @@ -0,0 +1,199 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import absolute_import, division, print_function +import numpy as np +import unittest +import faiss + + +def make_binary_dataset(d, nb, nt, nq): + assert d % 8 == 0 + rs = np.random.RandomState(123) + x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') + return x[:nt], x[nt:-nq], x[-nq:] + + +def binary_to_float(x): + n, d = x.shape + x8 = x.reshape(n * d, -1) + c8 = 2 * ((x8 >> np.arange(8)) & 1).astype('int8') - 1 + return c8.astype('float32').reshape(n, d * 8) + + +class TestIndexBinaryFromFloat(unittest.TestCase): + """Use a binary index backed by a float index""" + + def test_index_from_float(self): + d = 256 + nt = 0 + nb = 1500 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq) + + index_ref = faiss.IndexFlatL2(d) + index_ref.add(binary_to_float(xb)) + + index = faiss.IndexFlatL2(d) + index_bin = faiss.IndexBinaryFromFloat(index) + index_bin.add(xb) + + D_ref, I_ref = index_ref.search(binary_to_float(xq), 10) + D, I = index_bin.search(xq, 10) + + np.testing.assert_allclose((D_ref / 4.0).astype('int32'), D) + + def test_wrapped_quantizer(self): + d = 256 + nt = 150 + nb = 1500 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq) + + nlist = 16 + quantizer_ref = faiss.IndexBinaryFlat(d) + index_ref = faiss.IndexBinaryIVF(quantizer_ref, d, nlist) + index_ref.train(xt) + + index_ref.add(xb) + + unwrapped_quantizer = faiss.IndexFlatL2(d) + quantizer = faiss.IndexBinaryFromFloat(unwrapped_quantizer) + index = faiss.IndexBinaryIVF(quantizer, d, nlist) + + index.train(xt) + + index.add(xb) + + D_ref, I_ref = index_ref.search(xq, 10) + D, I = index.search(xq, 10) + + np.testing.assert_array_equal(D_ref, D) + + def test_wrapped_quantizer_IMI(self): + d = 256 + nt = 3500 + nb = 10000 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq) + + index_ref = faiss.IndexBinaryFlat(d) + + index_ref.add(xb) + + nlist_exp = 6 + nlist = 2 ** (2 * nlist_exp) + float_quantizer = faiss.MultiIndexQuantizer(d, 2, nlist_exp) + wrapped_quantizer = faiss.IndexBinaryFromFloat(float_quantizer) + wrapped_quantizer.train(xt) + + assert nlist == float_quantizer.ntotal + + index = faiss.IndexBinaryIVF(wrapped_quantizer, d, + float_quantizer.ntotal) + index.nprobe = 2048 + assert index.is_trained + + index.add(xb) + + D_ref, I_ref = index_ref.search(xq, 10) + D, I = index.search(xq, 10) + + recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \ + / float(D_ref.shape[0]) + + assert recall > 0.82, "recall = %g" % recall + + def test_wrapped_quantizer_HNSW(self): + faiss.omp_set_num_threads(1) + + def bin2float(v): + def byte2float(byte): + return np.array([-1.0 + 2.0 * (byte & (1 << b) != 0) + for b in range(0, 8)]) + + return np.hstack([byte2float(byte) for byte in v]).astype('float32') + + def floatvec2nparray(v): + return np.array([np.float32(v.at(i)) for i in range(0, v.size())]) \ + .reshape(-1, d) + + d = 256 + nt = 12800 + nb = 10000 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq) + + index_ref = faiss.IndexBinaryFlat(d) + + index_ref.add(xb) + + nlist = 256 + clus = faiss.Clustering(d, nlist) + clus_index = faiss.IndexFlatL2(d) + + xt_f = np.array([bin2float(v) for v in xt]) + clus.train(xt_f, clus_index) + + centroids = floatvec2nparray(clus.centroids) + hnsw_quantizer = faiss.IndexHNSWFlat(d, 32) + hnsw_quantizer.add(centroids) + hnsw_quantizer.is_trained = True + wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer) + + assert nlist == hnsw_quantizer.ntotal + assert nlist == wrapped_quantizer.ntotal + assert 
wrapped_quantizer.is_trained + + index = faiss.IndexBinaryIVF(wrapped_quantizer, d, + hnsw_quantizer.ntotal) + index.nprobe = 128 + + assert index.is_trained + + index.add(xb) + + D_ref, I_ref = index_ref.search(xq, 10) + D, I = index.search(xq, 10) + + recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \ + / float(D_ref.shape[0]) + + assert recall > 0.77, "recall = %g" % recall + + +class TestOverrideKmeansQuantizer(unittest.TestCase): + + def test_override(self): + d = 256 + nt = 3500 + nb = 10000 + nq = 500 + (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq) + + def train_and_get_centroids(override_kmeans_index): + index = faiss.index_binary_factory(d, "BIVF10") + index.verbose = True + + if override_kmeans_index is not None: + index.clustering_index = override_kmeans_index + + index.train(xt) + + centroids = faiss.downcast_IndexBinary(index.quantizer).xb + return faiss.vector_to_array(centroids).reshape(-1, d // 8) + + centroids_ref = train_and_get_centroids(None) + + # should do the exact same thing + centroids_new = train_and_get_centroids(faiss.IndexFlatL2(d)) + + assert np.all(centroids_ref == centroids_new) + + # will do less accurate assignment... Sanity check that the + # index is indeed used by kmeans + centroids_new = train_and_get_centroids(faiss.IndexLSH(d, 16)) + + assert not np.all(centroids_ref == centroids_new) diff --git a/core/src/index/thirdparty/faiss/tests/test_index_composite.py b/core/src/index/thirdparty/faiss/tests/test_index_composite.py new file mode 100644 index 0000000000..40b5daac8d --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_index_composite.py @@ -0,0 +1,572 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +""" more elaborate that test_index.py """ + +import numpy as np +import unittest +import faiss +import os +import shutil +import tempfile + +from common import get_dataset_2 + +class TestRemove(unittest.TestCase): + + def do_merge_then_remove(self, ondisk): + d = 10 + nb = 1000 + nq = 200 + nt = 200 + + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + quantizer = faiss.IndexFlatL2(d) + + index1 = faiss.IndexIVFFlat(quantizer, d, 20) + index1.train(xt) + + filename = None + if ondisk: + filename = tempfile.mkstemp()[1] + invlists = faiss.OnDiskInvertedLists( + index1.nlist, index1.code_size, + filename) + index1.replace_invlists(invlists) + + index1.add(xb[:int(nb / 2)]) + + index2 = faiss.IndexIVFFlat(quantizer, d, 20) + assert index2.is_trained + index2.add(xb[int(nb / 2):]) + + Dref, Iref = index1.search(xq, 10) + index1.merge_from(index2, int(nb / 2)) + + assert index1.ntotal == nb + + index1.remove_ids(faiss.IDSelectorRange(int(nb / 2), nb)) + + assert index1.ntotal == int(nb / 2) + Dnew, Inew = index1.search(xq, 10) + + assert np.all(Dnew == Dref) + assert np.all(Inew == Iref) + + if filename is not None: + os.unlink(filename) + + def test_remove_regular(self): + self.do_merge_then_remove(False) + + def test_remove_ondisk(self): + self.do_merge_then_remove(True) + + def test_remove(self): + # only tests the python interface + + index = faiss.IndexFlat(5) + xb = np.zeros((10, 5), dtype='float32') + xb[:, 0] = np.arange(10) + 1000 + index.add(xb) + index.remove_ids(np.arange(5) * 2) + xb2 = faiss.vector_float_to_array(index.xb).reshape(5, 5) + assert np.all(xb2[:, 0] == xb[np.arange(5) * 2 + 1, 0]) + + def test_remove_id_map(self): + sub_index = faiss.IndexFlat(5) + xb = np.zeros((10, 5), dtype='float32') + xb[:, 0] = np.arange(10) + 1000 + index = faiss.IndexIDMap2(sub_index) + index.add_with_ids(xb, np.arange(10) + 100) + assert index.reconstruct(104)[0] == 1004 + index.remove_ids(np.array([103])) + assert index.reconstruct(104)[0] == 1004 + try: + index.reconstruct(103) + except: + pass + else: + assert False, 'should have raised an exception' + + def test_remove_id_map_2(self): + # from https://github.com/facebookresearch/faiss/issues/255 + rs = np.random.RandomState(1234) + X = rs.randn(10, 10).astype(np.float32) + idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64) + remove_set = np.array([10, 30], dtype=np.int64) + index = faiss.index_factory(10, 'IDMap,Flat') + index.add_with_ids(X[:5, :], idx[:5]) + index.remove_ids(remove_set) + index.add_with_ids(X[5:, :], idx[5:]) + + print (index.search(X, 1)) + + for i in range(10): + _, searchres = index.search(X[i:i + 1, :], 1) + if idx[i] in remove_set: + assert searchres[0] != idx[i] + else: + assert searchres[0] == idx[i] + + def test_remove_id_map_binary(self): + sub_index = faiss.IndexBinaryFlat(40) + xb = np.zeros((10, 5), dtype='uint8') + xb[:, 0] = np.arange(10) + 100 + index = faiss.IndexBinaryIDMap2(sub_index) + index.add_with_ids(xb, np.arange(10) + 1000) + assert index.reconstruct(1004)[0] == 104 + index.remove_ids(np.array([1003])) + assert index.reconstruct(1004)[0] == 104 + try: + index.reconstruct(1003) + except: + pass + else: + assert False, 'should have raised an exception' + + # while we are there, let's test I/O as well... 
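+        # round-trip the IndexBinaryIDMap2 through a temporary file and repeat the reconstruct checks on the reloaded index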
+ _, tmpnam = tempfile.mkstemp() + try: + faiss.write_index_binary(index, tmpnam) + index = faiss.read_index_binary(tmpnam) + finally: + os.remove(tmpnam) + + assert index.reconstruct(1004)[0] == 104 + try: + index.reconstruct(1003) + except: + pass + else: + assert False, 'should have raised an exception' + + + +class TestRangeSearch(unittest.TestCase): + + def test_range_search_id_map(self): + sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product + xb = np.zeros((10, 5), dtype='float32') + xb[:, 0] = np.arange(10) + 1000 + index = faiss.IndexIDMap2(sub_index) + index.add_with_ids(xb, np.arange(10) + 100) + dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99 + res_subindex = sub_index.range_search(xb[[0], :], dist) + res_index = index.range_search(xb[[0], :], dist) + assert len(res_subindex[2]) == 2 + np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2]) + + +class TestUpdate(unittest.TestCase): + + def test_update(self): + d = 64 + nb = 1000 + nt = 1500 + nq = 100 + np.random.seed(123) + xb = np.random.random(size=(nb, d)).astype('float32') + xt = np.random.random(size=(nt, d)).astype('float32') + xq = np.random.random(size=(nq, d)).astype('float32') + + index = faiss.index_factory(d, "IVF64,Flat") + index.train(xt) + index.add(xb) + index.nprobe = 32 + D, I = index.search(xq, 5) + + index.make_direct_map() + recons_before = np.vstack([index.reconstruct(i) for i in range(nb)]) + + # revert order of the 200 first vectors + nu = 200 + index.update_vectors(np.arange(nu), xb[nu - 1::-1].copy()) + + recons_after = np.vstack([index.reconstruct(i) for i in range(nb)]) + + # make sure reconstructions remain the same + diff_recons = recons_before[:nu] - recons_after[nu - 1::-1] + assert np.abs(diff_recons).max() == 0 + + D2, I2 = index.search(xq, 5) + + assert np.all(D == D2) + + gt_map = np.arange(nb) + gt_map[:nu] = np.arange(nu, 0, -1) - 1 + eqs = I.ravel() == gt_map[I2.ravel()] + + assert np.all(eqs) + + +class TestPCAWhite(unittest.TestCase): + + def test_white(self): + + # generate data + d = 4 + nt = 1000 + nb = 200 + nq = 200 + + # normal distribition + x = faiss.randn((nt + nb + nq) * d, 1234).reshape(nt + nb + nq, d) + + index = faiss.index_factory(d, 'Flat') + + xt = x[:nt] + xb = x[nt:-nq] + xq = x[-nq:] + + # NN search on normal distribution + index.add(xb) + Do, Io = index.search(xq, 5) + + # make distribution very skewed + x *= [10, 4, 1, 0.5] + rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d)) + x = np.dot(x, rr).astype('float32') + + xt = x[:nt] + xb = x[nt:-nq] + xq = x[-nq:] + + # L2 search on skewed distribution + index = faiss.index_factory(d, 'Flat') + + index.add(xb) + Dl2, Il2 = index.search(xq, 5) + + # whiten + L2 search on L2 distribution + index = faiss.index_factory(d, 'PCAW%d,Flat' % d) + + index.train(xt) + index.add(xb) + Dw, Iw = index.search(xq, 5) + + # make sure correlation of whitened results with original + # results is much better than simple L2 distances + # should be 961 vs. 
264 + assert (faiss.eval_intersection(Io, Iw) > + 2 * faiss.eval_intersection(Io, Il2)) + + +class TestTransformChain(unittest.TestCase): + + def test_chain(self): + + # generate data + d = 4 + nt = 1000 + nb = 200 + nq = 200 + + # normal distribition + x = faiss.randn((nt + nb + nq) * d, 1234).reshape(nt + nb + nq, d) + + # make distribution very skewed + x *= [10, 4, 1, 0.5] + rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d)) + x = np.dot(x, rr).astype('float32') + + xt = x[:nt] + xb = x[nt:-nq] + xq = x[-nq:] + + index = faiss.index_factory(d, "L2norm,PCA2,L2norm,Flat") + + assert index.chain.size() == 3 + l2_1 = faiss.downcast_VectorTransform(index.chain.at(0)) + assert l2_1.norm == 2 + pca = faiss.downcast_VectorTransform(index.chain.at(1)) + assert not pca.is_trained + index.train(xt) + assert pca.is_trained + + index.add(xb) + D, I = index.search(xq, 5) + + # do the computation manually and check if we get the same result + def manual_trans(x): + x = x.copy() + faiss.normalize_L2(x) + x = pca.apply_py(x) + faiss.normalize_L2(x) + return x + + index2 = faiss.IndexFlatL2(2) + index2.add(manual_trans(xb)) + D2, I2 = index2.search(manual_trans(xq), 5) + + assert np.all(I == I2) + +class TestRareIO(unittest.TestCase): + + def compare_results(self, index1, index2, xq): + + Dref, Iref = index1.search(xq, 5) + Dnew, Inew = index2.search(xq, 5) + + assert np.all(Dref == Dnew) + assert np.all(Iref == Inew) + + def do_mmappedIO(self, sparse, in_pretransform=False): + d = 10 + nb = 1000 + nq = 200 + nt = 200 + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + quantizer = faiss.IndexFlatL2(d) + index1 = faiss.IndexIVFFlat(quantizer, d, 20) + if sparse: + # makes the inverted lists sparse because all elements get + # assigned to the same invlist + xt += (np.ones(10) * 1000).astype('float32') + + if in_pretransform: + # make sure it still works when wrapped in an IndexPreTransform + index1 = faiss.IndexPreTransform(index1) + + index1.train(xt) + index1.add(xb) + + _, fname = tempfile.mkstemp() + try: + + faiss.write_index(index1, fname) + + index2 = faiss.read_index(fname) + self.compare_results(index1, index2, xq) + + index3 = faiss.read_index(fname, faiss.IO_FLAG_MMAP) + self.compare_results(index1, index3, xq) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_mmappedIO_sparse(self): + self.do_mmappedIO(True) + + def test_mmappedIO_full(self): + self.do_mmappedIO(False) + + def test_mmappedIO_pretrans(self): + self.do_mmappedIO(False, True) + + +class TestIVFFlatDedup(unittest.TestCase): + + def normalize_res(self, D, I): + dmax = D[-1] + res = [(d, i) for d, i in zip(D, I) if d < dmax] + res.sort() + return res + + def test_dedup(self): + d = 10 + nb = 1000 + nq = 200 + nt = 500 + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + # introduce duplicates + xb[500:900:2] = xb[501:901:2] + xb[901::4] = xb[900::4] + xb[902::4] = xb[900::4] + xb[903::4] = xb[900::4] + + # also in the train set + xt[201::2] = xt[200::2] + + quantizer = faiss.IndexFlatL2(d) + index_new = faiss.IndexIVFFlatDedup(quantizer, d, 20) + + index_new.verbose = True + # should display + # IndexIVFFlatDedup::train: train on 350 points after dedup (was 500 points) + index_new.train(xt) + + index_ref = faiss.IndexIVFFlat(quantizer, d, 20) + assert index_ref.is_trained + + index_ref.nprobe = 5 + index_ref.add(xb) + index_new.nprobe = 5 + index_new.add(xb) + + Dref, Iref = index_ref.search(xq, 20) + Dnew, Inew = index_new.search(xq, 20) + + for i in range(nq): + ref = self.normalize_res(Dref[i], Iref[i]) + new = 
self.normalize_res(Dnew[i], Inew[i]) + assert ref == new + + # test I/O + _, tmpfile = tempfile.mkstemp() + try: + faiss.write_index(index_new, tmpfile) + index_st = faiss.read_index(tmpfile) + finally: + if os.path.exists(tmpfile): + os.unlink(tmpfile) + Dst, Ist = index_st.search(xq, 20) + + for i in range(nq): + new = self.normalize_res(Dnew[i], Inew[i]) + st = self.normalize_res(Dst[i], Ist[i]) + assert st == new + + # test remove + toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950))) + index_ref.remove_ids(toremove) + index_new.remove_ids(toremove) + + Dref, Iref = index_ref.search(xq, 20) + Dnew, Inew = index_new.search(xq, 20) + + for i in range(nq): + ref = self.normalize_res(Dref[i], Iref[i]) + new = self.normalize_res(Dnew[i], Inew[i]) + assert ref == new + + +class TestSerialize(unittest.TestCase): + + def test_serialize_to_vector(self): + d = 10 + nb = 1000 + nq = 200 + nt = 500 + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + index = faiss.IndexFlatL2(d) + index.add(xb) + + Dref, Iref = index.search(xq, 5) + + writer = faiss.VectorIOWriter() + faiss.write_index(index, writer) + + ar_data = faiss.vector_to_array(writer.data) + + # direct transfer of vector + reader = faiss.VectorIOReader() + reader.data.swap(writer.data) + + index2 = faiss.read_index(reader) + + Dnew, Inew = index2.search(xq, 5) + assert np.all(Dnew == Dref) and np.all(Inew == Iref) + + # from intermediate numpy array + reader = faiss.VectorIOReader() + faiss.copy_array_to_vector(ar_data, reader.data) + + index3 = faiss.read_index(reader) + + Dnew, Inew = index3.search(xq, 5) + assert np.all(Dnew == Dref) and np.all(Inew == Iref) + + +class TestRenameOndisk(unittest.TestCase): + + def test_rename(self): + d = 10 + nb = 500 + nq = 100 + nt = 100 + + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + quantizer = faiss.IndexFlatL2(d) + + index1 = faiss.IndexIVFFlat(quantizer, d, 20) + index1.train(xt) + + dirname = tempfile.mkdtemp() + + try: + + # make an index with ondisk invlists + invlists = faiss.OnDiskInvertedLists( + index1.nlist, index1.code_size, + dirname + '/aa.ondisk') + index1.replace_invlists(invlists) + index1.add(xb) + D1, I1 = index1.search(xq, 10) + faiss.write_index(index1, dirname + '/aa.ivf') + + # move the index elsewhere + os.mkdir(dirname + '/1') + for fname in 'aa.ondisk', 'aa.ivf': + os.rename(dirname + '/' + fname, + dirname + '/1/' + fname) + + # try to read it: fails! 
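+            # the serialized index stores the original path of the .ondisk file, so a plain read_index cannot find it after the move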
+ try: + index2 = faiss.read_index(dirname + '/1/aa.ivf') + except RuntimeError: + pass # normal + else: + assert False + + # read it with magic flag + index2 = faiss.read_index(dirname + '/1/aa.ivf', + faiss.IO_FLAG_ONDISK_SAME_DIR) + D2, I2 = index2.search(xq, 10) + assert np.all(I1 == I2) + + finally: + shutil.rmtree(dirname) + + +class TestInvlistMeta(unittest.TestCase): + + def test_slice_vstack(self): + d = 10 + nb = 1000 + nq = 100 + nt = 200 + + xt, xb, xq = get_dataset_2(d, nt, nb, nq) + + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFFlat(quantizer, d, 30) + + index.train(xt) + index.add(xb) + Dref, Iref = index.search(xq, 10) + + # faiss.wait() + + il0 = index.invlists + ils = [] + ilv = faiss.InvertedListsPtrVector() + for sl in 0, 1, 2: + il = faiss.SliceInvertedLists(il0, sl * 10, sl * 10 + 10) + ils.append(il) + ilv.push_back(il) + + il2 = faiss.VStackInvertedLists(ilv.size(), ilv.data()) + + index2 = faiss.IndexIVFFlat(quantizer, d, 30) + index2.replace_invlists(il2) + index2.ntotal = index.ntotal + + D, I = index2.search(xq, 10) + assert np.all(D == Dref) + assert np.all(I == Iref) + + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_ivflib.py b/core/src/index/thirdparty/faiss/tests/test_ivflib.py new file mode 100644 index 0000000000..f28ffc5318 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_ivflib.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +import unittest +import faiss + + +class TestIVFlib(unittest.TestCase): + + def test_methods_exported(self): + methods = ['check_compatible_for_merge', 'extract_index_ivf', + 'merge_into', 'search_centroid', + 'search_and_return_centroids', 'get_invlist_range', + 'set_invlist_range', 'search_with_parameters'] + + for method in methods: + assert callable(getattr(faiss, method, None)) diff --git a/core/src/index/thirdparty/faiss/tests/test_ivfpq_codec.cpp b/core/src/index/thirdparty/faiss/tests/test_ivfpq_codec.cpp new file mode 100644 index 0000000000..8d18ac0ad9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_ivfpq_codec.cpp @@ -0,0 +1,67 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include + +#include +#include +#include +#include + + +namespace { + +// dimension of the vectors to index +int d = 64; + +// size of the database we plan to index +size_t nb = 8000; + + +double eval_codec_error (long ncentroids, long m, const std::vector &v) +{ + faiss::IndexFlatL2 coarse_quantizer (d); + faiss::IndexIVFPQ index (&coarse_quantizer, d, + ncentroids, m, 8); + index.pq.cp.niter = 10; // speed up train + index.train (nb, v.data()); + + // encode and decode to compute reconstruction error + + std::vector keys (nb); + std::vector codes (nb * m); + index.encode_multiple (nb, keys.data(), v.data(), codes.data(), true); + + std::vector v2 (nb * d); + index.decode_multiple (nb, keys.data(), codes.data(), v2.data()); + + return faiss::fvec_L2sqr (v.data(), v2.data(), nb * d); +} + +} // namespace + + +TEST(IVFPQ, codec) { + + std::vector database (nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + + double err0 = eval_codec_error(16, 8, database); + + // should be more accurate as there are more coarse centroids + double err1 = eval_codec_error(128, 8, database); + EXPECT_GT(err0, err1); + + // should be more accurate as there are more PQ codes + double err2 = eval_codec_error(16, 16, database); + EXPECT_GT(err0, err2); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_ivfpq_indexing.cpp b/core/src/index/thirdparty/faiss/tests/test_ivfpq_indexing.cpp new file mode 100644 index 0000000000..9f4bbcd2ca --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_ivfpq_indexing.cpp @@ -0,0 +1,98 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include + +#include + +#include +#include +#include + +TEST(IVFPQ, accuracy) { + + // dimension of the vectors to index + int d = 64; + + // size of the database we plan to index + size_t nb = 1000; + + // make a set of nt training vectors in the unit cube + // (could be the database) + size_t nt = 1500; + + // make the index object and train it + faiss::IndexFlatL2 coarse_quantizer (d); + + // a reasonable number of cetroids to index nb vectors + int ncentroids = 25; + + faiss::IndexIVFPQ index (&coarse_quantizer, d, + ncentroids, 16, 8); + + // index that gives the ground-truth + faiss::IndexFlatL2 index_gt (d); + + srand48 (35); + + { // training + + std::vector trainvecs (nt * d); + for (size_t i = 0; i < nt * d; i++) { + trainvecs[i] = drand48(); + } + index.verbose = true; + index.train (nt, trainvecs.data()); + } + + { // populating the database + + std::vector database (nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + + index.add (nb, database.data()); + index_gt.add (nb, database.data()); + } + + int nq = 200; + int n_ok; + + { // searching the database + + std::vector queries (nq * d); + for (size_t i = 0; i < nq * d; i++) { + queries[i] = drand48(); + } + + std::vector gt_nns (nq); + std::vector gt_dis (nq); + + index_gt.search (nq, queries.data(), 1, + gt_dis.data(), gt_nns.data()); + + index.nprobe = 5; + int k = 5; + std::vector nns (k * nq); + std::vector dis (k * nq); + + index.search (nq, queries.data(), k, dis.data(), nns.data()); + + n_ok = 0; + for (int q = 0; q < nq; q++) { + + for (int i = 0; i < k; i++) + if (nns[q * k + i] == gt_nns[q]) + n_ok++; + } + EXPECT_GT(n_ok, nq * 0.4); + } + +} diff --git a/core/src/index/thirdparty/faiss/tests/test_lowlevel_ivf.cpp b/core/src/index/thirdparty/faiss/tests/test_lowlevel_ivf.cpp new file mode 100644 index 0000000000..7baf801b7b --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_lowlevel_ivf.cpp @@ -0,0 +1,566 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace faiss; + +namespace { + +typedef Index::idx_t idx_t; + + +// dimension of the vectors to index +int d = 32; + +// nb of training vectors +size_t nt = 5000; + +// size of the database points per window step +size_t nb = 1000; + +// nb of queries +size_t nq = 200; + +int k = 10; + + +std::vector make_data(size_t n) +{ + std::vector database (n * d); + for (size_t i = 0; i < n * d; i++) { + database[i] = drand48(); + } + return database; +} + +std::unique_ptr make_trained_index(const char *index_type, + MetricType metric_type) +{ + auto index = std::unique_ptr(index_factory( + d, index_type, metric_type)); + auto xt = make_data(nt); + index->train(nt, xt.data()); + ParameterSpace().set_index_parameter (index.get(), "nprobe", 4); + return index; +} + +std::vector search_index(Index *index, const float *xq) { + std::vector I(k * nq); + std::vector D(k * nq); + index->search (nq, xq, k, D.data(), I.data()); + return I; +} + + + + +/************************************************************* + * Test functions for a given index type + *************************************************************/ + + + +void test_lowlevel_access (const char *index_key, MetricType metric) { + std::unique_ptr index = make_trained_index(index_key, metric); + + auto xb = make_data (nb); + index->add(nb, xb.data()); + + /** handle the case if we have a preprocessor */ + + const IndexPreTransform *index_pt = + dynamic_cast (index.get()); + + int dt = index->d; + const float * xbt = xb.data(); + std::unique_ptr del_xbt; + + if (index_pt) { + dt = index_pt->index->d; + xbt = index_pt->apply_chain (nb, xb.data()); + if (xbt != xb.data()) { + del_xbt.reset((float*)xbt); + } + } + + IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get()); + + /** Test independent encoding + * + * Makes it possible to do additions on a custom inverted list + * implementation. From a set of vectors, computes the inverted + * list ids + the codes corresponding to each vector. + */ + + std::vector list_nos (nb); + std::vector codes (index_ivf->code_size * nb); + index_ivf->quantizer->assign(nb, xbt, list_nos.data()); + index_ivf->encode_vectors (nb, xbt, list_nos.data(), codes.data()); + + // compare with normal IVF addition + + const InvertedLists *il = index_ivf->invlists; + + for (int list_no = 0; list_no < index_ivf->nlist; list_no++) { + InvertedLists::ScopedCodes ivf_codes (il, list_no); + InvertedLists::ScopedIds ivf_ids (il, list_no); + size_t list_size = il->list_size (list_no); + for (int i = 0; i < list_size; i++) { + const uint8_t *ref_code = ivf_codes.get() + i * il->code_size; + const uint8_t *new_code = + codes.data() + ivf_ids[i] * il->code_size; + EXPECT_EQ (memcmp(ref_code, new_code, il->code_size), 0); + } + } + + /** Test independent search + * + * Manually scans through inverted lists, computing distances and + * ordering results organized in a heap. + */ + + // sample some example queries and get reference search results. + auto xq = make_data (nq); + auto ref_I = search_index (index.get(), xq.data()); + + // handle preprocessing + const float * xqt = xq.data(); + std::unique_ptr del_xqt; + + if (index_pt) { + xqt = index_pt->apply_chain (nq, xq.data()); + if (xqt != xq.data()) { + del_xqt.reset((float*)xqt); + } + } + + // quantize the queries to get the inverted list ids to visit. 
+ int nprobe = index_ivf->nprobe; + + std::vector q_lists (nq * nprobe); + std::vector q_dis (nq * nprobe); + + index_ivf->quantizer->search (nq, xqt, nprobe, + q_dis.data(), q_lists.data()); + + // object that does the scanning and distance computations. + std::unique_ptr scanner ( + index_ivf->get_InvertedListScanner()); + + for (int i = 0; i < nq; i++) { + std::vector I (k, -1); + float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL; + std::vector D (k, default_dis); + + scanner->set_query (xqt + i * dt); + + for (int j = 0; j < nprobe; j++) { + int list_no = q_lists[i * nprobe + j]; + if (list_no < 0) continue; + scanner->set_list (list_no, q_dis[i * nprobe + j]); + + // here we get the inverted lists from the InvertedLists + // object but they could come from anywhere + + scanner->scan_codes ( + il->list_size (list_no), + InvertedLists::ScopedCodes(il, list_no).get(), + InvertedLists::ScopedIds(il, list_no).get(), + D.data(), I.data(), k); + + if (j == 0) { + // all results so far come from list_no, so let's check if + // the distance function works + for (int jj = 0; jj < k; jj++) { + int vno = I[jj]; + if (vno < 0) break; // heap is not full yet + + // we have the codes from the addition test + float computed_D = scanner->distance_to_code ( + codes.data() + vno * il->code_size); + + EXPECT_EQ (computed_D, D[jj]); + } + } + } + + // re-order heap + if (metric == METRIC_L2) { + maxheap_reorder (k, D.data(), I.data()); + } else { + minheap_reorder (k, D.data(), I.data()); + } + + // check that we have the same results as the reference search + for (int j = 0; j < k; j++) { + EXPECT_EQ (I[j], ref_I[i * k + j]); + } + } + + +} + +} // anonymous namespace + + + +/************************************************************* + * Test entry points + *************************************************************/ + +TEST(TestLowLevelIVF, IVFFlatL2) { + test_lowlevel_access ("IVF32,Flat", METRIC_L2); +} + +TEST(TestLowLevelIVF, PCAIVFFlatL2) { + test_lowlevel_access ("PCAR16,IVF32,Flat", METRIC_L2); +} + +TEST(TestLowLevelIVF, IVFFlatIP) { + test_lowlevel_access ("IVF32,Flat", METRIC_INNER_PRODUCT); +} + +TEST(TestLowLevelIVF, IVFSQL2) { + test_lowlevel_access ("IVF32,SQ8", METRIC_L2); +} + +TEST(TestLowLevelIVF, IVFSQIP) { + test_lowlevel_access ("IVF32,SQ8", METRIC_INNER_PRODUCT); +} + + +TEST(TestLowLevelIVF, IVFPQL2) { + test_lowlevel_access ("IVF32,PQ4np", METRIC_L2); +} + +TEST(TestLowLevelIVF, IVFPQIP) { + test_lowlevel_access ("IVF32,PQ4np", METRIC_INNER_PRODUCT); +} + + +/************************************************************* + * Same for binary (a bit simpler) + *************************************************************/ + +namespace { + +int nbit = 256; + +// here d is used the number of ints -> d=32 means 128 bits + +std::vector make_data_binary(size_t n) +{ + + std::vector database (n * nbit / 8); + for (size_t i = 0; i < n * d; i++) { + database[i] = lrand48(); + } + return database; +} + +std::unique_ptr make_trained_index_binary(const char *index_type) +{ + auto index = std::unique_ptr(index_binary_factory( + nbit, index_type)); + auto xt = make_data_binary (nt); + index->train(nt, xt.data()); + return index; +} + + +void test_lowlevel_access_binary (const char *index_key) { + std::unique_ptr index = + make_trained_index_binary (index_key); + + IndexBinaryIVF * index_ivf = dynamic_cast + (index.get()); + assert (index_ivf); + + index_ivf->nprobe = 4; + + auto xb = make_data_binary (nb); + index->add(nb, xb.data()); + + std::vector list_nos (nb); + 
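    // assign each database vector to its inverted list via the coarse quantizer +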
index_ivf->quantizer->assign(nb, xb.data(), list_nos.data()); + + /* For binary there is no test for encoding because binary vectors + * are copied verbatim to the inverted lists */ + + const InvertedLists *il = index_ivf->invlists; + + /** Test independent search + * + * Manually scans through inverted lists, computing distances and + * ordering results organized in a heap. + */ + + // sample some example queries and get reference search results. + auto xq = make_data_binary (nq); + + std::vector I_ref(k * nq); + std::vector D_ref(k * nq); + index->search (nq, xq.data(), k, D_ref.data(), I_ref.data()); + + // quantize the queries to get the inverted list ids to visit. + int nprobe = index_ivf->nprobe; + + std::vector q_lists (nq * nprobe); + std::vector q_dis (nq * nprobe); + + // quantize queries + index_ivf->quantizer->search (nq, xq.data(), nprobe, + q_dis.data(), q_lists.data()); + + // object that does the scanning and distance computations. + std::unique_ptr scanner ( + index_ivf->get_InvertedListScanner()); + + for (int i = 0; i < nq; i++) { + std::vector I (k, -1); + uint32_t default_dis = 1 << 30; + std::vector D (k, default_dis); + + scanner->set_query (xq.data() + i * index_ivf->code_size); + + for (int j = 0; j < nprobe; j++) { + int list_no = q_lists[i * nprobe + j]; + if (list_no < 0) continue; + scanner->set_list (list_no, q_dis[i * nprobe + j]); + + // here we get the inverted lists from the InvertedLists + // object but they could come from anywhere + + scanner->scan_codes ( + il->list_size (list_no), + InvertedLists::ScopedCodes(il, list_no).get(), + InvertedLists::ScopedIds(il, list_no).get(), + D.data(), I.data(), k); + + if (j == 0) { + // all results so far come from list_no, so let's check if + // the distance function works + for (int jj = 0; jj < k; jj++) { + int vno = I[jj]; + if (vno < 0) break; // heap is not full yet + + // we have the codes from the addition test + float computed_D = scanner->distance_to_code ( + xb.data() + vno * il->code_size); + + EXPECT_EQ (computed_D, D[jj]); + } + } + } + + printf("new before reroder: ["); + for (int j = 0; j < k; j++) + printf("%ld,%d ", I[j], D[j]); + printf("]\n"); + + // re-order heap + heap_reorder > (k, D.data(), I.data()); + + printf("ref: ["); + for (int j = 0; j < k; j++) + printf("%ld,%d ", I_ref[j], D_ref[j]); + printf("]\nnew: ["); + for (int j = 0; j < k; j++) + printf("%ld,%d ", I[j], D[j]); + printf("]\n"); + + // check that we have the same results as the reference search + for (int j = 0; j < k; j++) { + // here the order is not guaranteed to be the same + // so we scan through ref results + // EXPECT_EQ (I[j], I_ref[i * k + j]); + EXPECT_LE (D[j], D_ref[i * k + k - 1]); + if (D[j] < D_ref[i * k + k - 1]) { + int j2 = 0; + while (j2 < k) { + if (I[j] == I_ref[i * k + j2]) break; + j2++; + } + EXPECT_LT(j2, k); // it was found + if (j2 < k) { + EXPECT_EQ(D[j], D_ref[i * k + j2]); + } + } + + } + + } + + +} + +} // anonymous namespace + + +TEST(TestLowLevelIVF, IVFBinary) { + test_lowlevel_access_binary ("BIVF32"); +} + + +namespace { + +void test_threaded_search (const char *index_key, MetricType metric) { + std::unique_ptr index = make_trained_index(index_key, metric); + + auto xb = make_data (nb); + index->add(nb, xb.data()); + + /** handle the case if we have a preprocessor */ + + const IndexPreTransform *index_pt = + dynamic_cast (index.get()); + + int dt = index->d; + const float * xbt = xb.data(); + std::unique_ptr del_xbt; + + if (index_pt) { + dt = index_pt->index->d; + xbt = 
index_pt->apply_chain (nb, xb.data()); + if (xbt != xb.data()) { + del_xbt.reset((float*)xbt); + } + } + + IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get()); + + /** Test independent search + * + * Manually scans through inverted lists, computing distances and + * ordering results organized in a heap. + */ + + // sample some example queries and get reference search results. + auto xq = make_data (nq); + auto ref_I = search_index (index.get(), xq.data()); + + // handle preprocessing + const float * xqt = xq.data(); + std::unique_ptr del_xqt; + + if (index_pt) { + xqt = index_pt->apply_chain (nq, xq.data()); + if (xqt != xq.data()) { + del_xqt.reset((float*)xqt); + } + } + + // quantize the queries to get the inverted list ids to visit. + int nprobe = index_ivf->nprobe; + + std::vector q_lists (nq * nprobe); + std::vector q_dis (nq * nprobe); + + index_ivf->quantizer->search (nq, xqt, nprobe, + q_dis.data(), q_lists.data()); + + // now run search in this many threads + int nproc = 3; + + + for (int i = 0; i < nq; i++) { + + // one result table per thread + std::vector I (k * nproc, -1); + float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL; + std::vector D (k * nproc, default_dis); + + auto search_function = [index_ivf, &I, &D, dt, i, nproc, + xqt, nprobe, &q_dis, &q_lists] + (int rank) { + const InvertedLists *il = index_ivf->invlists; + + // object that does the scanning and distance computations. + std::unique_ptr scanner ( + index_ivf->get_InvertedListScanner()); + + idx_t *local_I = I.data() + rank * k; + float *local_D = D.data() + rank * k; + + scanner->set_query (xqt + i * dt); + + for (int j = rank; j < nprobe; j += nproc) { + int list_no = q_lists[i * nprobe + j]; + if (list_no < 0) continue; + scanner->set_list (list_no, q_dis[i * nprobe + j]); + + scanner->scan_codes ( + il->list_size (list_no), + InvertedLists::ScopedCodes(il, list_no).get(), + InvertedLists::ScopedIds(il, list_no).get(), + local_D, local_I, k); + } + }; + + // start the threads. Threads are numbered rank=0..nproc-1 (a la MPI) + // thread rank takes care of inverted lists + // rank, rank+nproc, rank+2*nproc,... + std::vector threads; + for (int rank = 0; rank < nproc; rank++) { + threads.emplace_back(search_function, rank); + } + + // join threads, merge heaps + for (int rank = 0; rank < nproc; rank++) { + threads[rank].join(); + if (rank == 0) continue; // nothing to merge + // merge into first result + if (metric == METRIC_L2) { + maxheap_addn (k, D.data(), I.data(), + D.data() + rank * k, + I.data() + rank * k, k); + } else { + minheap_addn (k, D.data(), I.data(), + D.data() + rank * k, + I.data() + rank * k, k); + } + } + + // re-order heap + if (metric == METRIC_L2) { + maxheap_reorder (k, D.data(), I.data()); + } else { + minheap_reorder (k, D.data(), I.data()); + } + + // check that we have the same results as the reference search + for (int j = 0; j < k; j++) { + EXPECT_EQ (I[j], ref_I[i * k + j]); + } + } + + +} + +} // anonymous namepace + + +TEST(TestLowLevelIVF, ThreadedSearch) { + test_threaded_search ("IVF32,Flat", METRIC_L2); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_merge.cpp b/core/src/index/thirdparty/faiss/tests/test_merge.cpp new file mode 100644 index 0000000000..b32e7e68e4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_merge.cpp @@ -0,0 +1,258 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace { + + +struct Tempfilename { + + static pthread_mutex_t mutex; + + std::string filename; + + Tempfilename (const char *prefix = nullptr) { + pthread_mutex_lock (&mutex); + char *cfname = tempnam (nullptr, prefix); + filename = cfname; + free(cfname); + pthread_mutex_unlock (&mutex); + } + + ~Tempfilename () { + if (access (filename.c_str(), F_OK)) { + unlink (filename.c_str()); + } + } + + const char *c_str() { + return filename.c_str(); + } + +}; + +pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; + + +typedef faiss::Index::idx_t idx_t; + +// parameters to use for the test +int d = 64; +size_t nb = 1000; +size_t nq = 100; +int nindex = 4; +int k = 10; +int nlist = 40; + +struct CommonData { + + std::vector database; + std::vector queries; + std::vector ids; + faiss::IndexFlatL2 quantizer; + + CommonData(): database (nb * d), queries (nq * d), ids(nb), quantizer (d) { + + for (size_t i = 0; i < nb * d; i++) { + database[i] = drand48(); + } + for (size_t i = 0; i < nq * d; i++) { + queries[i] = drand48(); + } + for (int i = 0; i < nb; i++) { + ids[i] = 123 + 456 * i; + } + { // just to train the quantizer + faiss::IndexIVFFlat iflat (&quantizer, d, nlist); + iflat.train(nb, database.data()); + } + } +}; + +CommonData cd; + +/// perform a search on shards, then merge and search again and +/// compare results. +int compare_merged (faiss::IndexShards *index_shards, bool shift_ids, + bool standard_merge = true) +{ + + std::vector refI(k * nq); + std::vector refD(k * nq); + + index_shards->search(nq, cd.queries.data(), k, refD.data(), refI.data()); + Tempfilename filename; + + std::vector newI(k * nq); + std::vector newD(k * nq); + + if (standard_merge) { + + for (int i = 1; i < nindex; i++) { + faiss::ivflib::merge_into( + index_shards->at(0), index_shards->at(i), + shift_ids); + } + + index_shards->sync_with_shard_indexes(); + } else { + std::vector lists; + faiss::IndexIVF *index0 = nullptr; + size_t ntotal = 0; + for (int i = 0; i < nindex; i++) { + auto index_ivf = dynamic_cast(index_shards->at(i)); + assert (index_ivf); + if (i == 0) { + index0 = index_ivf; + } + lists.push_back (index_ivf->invlists); + ntotal += index_ivf->ntotal; + } + + auto il = new faiss::OnDiskInvertedLists( + index0->nlist, index0->code_size, + filename.c_str()); + + il->merge_from(lists.data(), lists.size()); + + index0->replace_invlists(il, true); + index0->ntotal = ntotal; + } + // search only on first index + index_shards->at(0)->search(nq, cd.queries.data(), + k, newD.data(), newI.data()); + + size_t ndiff = 0; + for (size_t i = 0; i < k * nq; i++) { + if (refI[i] != newI[i]) { + ndiff ++; + } + } + return ndiff; +} + +} // namespace + + +// test on IVFFlat with implicit numbering +TEST(MERGE, merge_flat_no_ids) { + faiss::IndexShards index_shards(d); + index_shards.own_fields = true; + for (int i = 0; i < nindex; i++) { + index_shards.add_shard ( + new faiss::IndexIVFFlat (&cd.quantizer, d, nlist)); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add(nb, cd.database.data()); + size_t prev_ntotal = index_shards.ntotal; + int ndiff = compare_merged(&index_shards, true); + EXPECT_EQ (prev_ntotal, index_shards.ntotal); + EXPECT_EQ(0, ndiff); +} + + +// test on IVFFlat, explicit ids +TEST(MERGE, merge_flat) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_fields = true; + + for (int i = 0; i < nindex; i++) { + 
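        // each shard is an IVFFlat built on the shared, pre-trained coarse quantizer +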
index_shards.add_shard ( + new faiss::IndexIVFFlat (&cd.quantizer, d, nlist)); + } + + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data()); + int ndiff = compare_merged(&index_shards, false); + EXPECT_GE(0, ndiff); +} + +// test on IVFFlat and a VectorTransform +TEST(MERGE, merge_flat_vt) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_fields = true; + + // here we have to retrain because of the vectorTransform + faiss::RandomRotationMatrix rot(d, d); + rot.init(1234); + faiss::IndexFlatL2 quantizer (d); + + { // just to train the quantizer + faiss::IndexIVFFlat iflat (&quantizer, d, nlist); + faiss::IndexPreTransform ipt (&rot, &iflat); + ipt.train(nb, cd.database.data()); + } + + for (int i = 0; i < nindex; i++) { + faiss::IndexPreTransform * ipt = new faiss::IndexPreTransform ( + new faiss::RandomRotationMatrix (rot), + new faiss::IndexIVFFlat (&quantizer, d, nlist) + ); + ipt->own_fields = true; + index_shards.add_shard (ipt); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data()); + size_t prev_ntotal = index_shards.ntotal; + int ndiff = compare_merged(&index_shards, false); + EXPECT_EQ (prev_ntotal, index_shards.ntotal); + EXPECT_GE(0, ndiff); +} + + +// put the merged invfile on disk +TEST(MERGE, merge_flat_ondisk) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_fields = true; + Tempfilename filename; + + for (int i = 0; i < nindex; i++) { + auto ivf = new faiss::IndexIVFFlat (&cd.quantizer, d, nlist); + if (i == 0) { + auto il = new faiss::OnDiskInvertedLists ( + ivf->nlist, ivf->code_size, + filename.c_str()); + ivf->replace_invlists(il, true); + } + index_shards.add_shard (ivf); + } + + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data()); + int ndiff = compare_merged(&index_shards, false); + + EXPECT_EQ(ndiff, 0); +} + +// now use ondisk specific merge +TEST(MERGE, merge_flat_ondisk_2) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_fields = true; + + for (int i = 0; i < nindex; i++) { + index_shards.add_shard ( + new faiss::IndexIVFFlat (&cd.quantizer, d, nlist)); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data()); + int ndiff = compare_merged(&index_shards, false, false); + EXPECT_GE(0, ndiff); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_meta_index.py b/core/src/index/thirdparty/faiss/tests/test_meta_index.py new file mode 100644 index 0000000000..d072516e00 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_meta_index.py @@ -0,0 +1,264 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +# translation of test_meta_index.lua + +import numpy as np +import faiss +import unittest + +from common import Randu10k + +ru = Randu10k() + +xb = ru.xb +xt = ru.xt +xq = ru.xq +nb, d = xb.shape +nq, d = xq.shape + + +class IDRemap(unittest.TestCase): + + def test_id_remap_idmap(self): + # reference: index without remapping + + index = faiss.IndexPQ(d, 8, 8) + k = 10 + index.train(xt) + index.add(xb) + _Dref, Iref = index.search(xq, k) + + # try a remapping + ids = np.arange(nb)[::-1].copy() + + sub_index = faiss.IndexPQ(d, 8, 8) + index2 = faiss.IndexIDMap(sub_index) + + index2.train(xt) + index2.add_with_ids(xb, ids) + + _D, I = index2.search(xq, k) + + assert np.all(I == nb - 1 - Iref) + + def test_id_remap_ivf(self): + # coarse quantizer in common + coarse_quantizer = faiss.IndexFlatIP(d) + ncentroids = 25 + + # reference: index without remapping + + index = faiss.IndexIVFPQ(coarse_quantizer, d, + ncentroids, 8, 8) + index.nprobe = 5 + k = 10 + index.train(xt) + index.add(xb) + _Dref, Iref = index.search(xq, k) + + # try a remapping + ids = np.arange(nb)[::-1].copy() + + index2 = faiss.IndexIVFPQ(coarse_quantizer, d, + ncentroids, 8, 8) + index2.nprobe = 5 + + index2.train(xt) + index2.add_with_ids(xb, ids) + + _D, I = index2.search(xq, k) + assert np.all(I == nb - 1 - Iref) + + +class Shards(unittest.TestCase): + + def test_shards(self): + k = 32 + ref_index = faiss.IndexFlatL2(d) + + print('ref search') + ref_index.add(xb) + _Dref, Iref = ref_index.search(xq, k) + print(Iref[:5, :6]) + + shard_index = faiss.IndexShards(d) + shard_index_2 = faiss.IndexShards(d, True, False) + + ni = 3 + for i in range(ni): + i0 = int(i * nb / ni) + i1 = int((i + 1) * nb / ni) + index = faiss.IndexFlatL2(d) + index.add(xb[i0:i1]) + shard_index.add_shard(index) + + index_2 = faiss.IndexFlatL2(d) + irm = faiss.IndexIDMap(index_2) + shard_index_2.add_shard(irm) + + # test parallel add + shard_index_2.verbose = True + shard_index_2.add(xb) + + for test_no in range(3): + with_threads = test_no == 1 + + print('shard search test_no = %d' % test_no) + if with_threads: + remember_nt = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + shard_index.threaded = True + else: + shard_index.threaded = False + + if test_no != 2: + _D, I = shard_index.search(xq, k) + else: + _D, I = shard_index_2.search(xq, k) + + print(I[:5, :6]) + + if with_threads: + faiss.omp_set_num_threads(remember_nt) + + ndiff = (I != Iref).sum() + + print('%d / %d differences' % (ndiff, nq * k)) + assert(ndiff < nq * k / 1000.) 
+ + +class Merge(unittest.TestCase): + + def make_index_for_merge(self, quant, index_type, master_index): + ncent = 40 + if index_type == 1: + index = faiss.IndexIVFFlat(quant, d, ncent, faiss.METRIC_L2) + if master_index: + index.is_trained = True + elif index_type == 2: + index = faiss.IndexIVFPQ(quant, d, ncent, 4, 8) + if master_index: + index.pq = master_index.pq + index.is_trained = True + elif index_type == 3: + index = faiss.IndexIVFPQR(quant, d, ncent, 4, 8, 8, 8) + if master_index: + index.pq = master_index.pq + index.refine_pq = master_index.refine_pq + index.is_trained = True + elif index_type == 4: + # quant used as the actual index + index = faiss.IndexIDMap(quant) + return index + + def do_test_merge(self, index_type): + k = 16 + quant = faiss.IndexFlatL2(d) + ref_index = self.make_index_for_merge(quant, index_type, False) + + # trains the quantizer + ref_index.train(xt) + + print('ref search') + ref_index.add(xb) + _Dref, Iref = ref_index.search(xq, k) + print(Iref[:5, :6]) + + indexes = [] + ni = 3 + for i in range(ni): + i0 = int(i * nb / ni) + i1 = int((i + 1) * nb / ni) + index = self.make_index_for_merge(quant, index_type, ref_index) + index.is_trained = True + index.add(xb[i0:i1]) + indexes.append(index) + + index = indexes[0] + + for i in range(1, ni): + print('merge ntotal=%d other.ntotal=%d ' % ( + index.ntotal, indexes[i].ntotal)) + index.merge_from(indexes[i], index.ntotal) + + _D, I = index.search(xq, k) + print(I[:5, :6]) + + ndiff = (I != Iref).sum() + print('%d / %d differences' % (ndiff, nq * k)) + assert(ndiff < nq * k / 1000.) + + def test_merge(self): + self.do_test_merge(1) + self.do_test_merge(2) + self.do_test_merge(3) + + def do_test_remove(self, index_type): + k = 16 + quant = faiss.IndexFlatL2(d) + index = self.make_index_for_merge(quant, index_type, None) + + # trains the quantizer + index.train(xt) + + if index_type < 4: + index.add(xb) + else: + gen = np.random.RandomState(1234) + id_list = gen.permutation(nb * 7)[:nb] + index.add_with_ids(xb, id_list) + + + print('ref search ntotal=%d' % index.ntotal) + Dref, Iref = index.search(xq, k) + + toremove = np.zeros(nq * k, dtype=int) + nr = 0 + for i in range(nq): + for j in range(k): + # remove all even results (it's ok if there are duplicates + # in the list of ids) + if Iref[i, j] % 2 == 0: + nr = nr + 1 + toremove[nr] = Iref[i, j] + + print('nr=', nr) + + idsel = faiss.IDSelectorBatch( + nr, faiss.swig_ptr(toremove)) + + for i in range(nr): + assert(idsel.is_member(int(toremove[i]))) + + nremoved = index.remove_ids(idsel) + + print('nremoved=%d ntotal=%d' % (nremoved, index.ntotal)) + + D, I = index.search(xq, k) + + # make sure results are in the same order with even ones removed + for i in range(nq): + j2 = 0 + for j in range(k): + if Iref[i, j] % 2 != 0: + assert I[i, j2] == Iref[i, j] + assert abs(D[i, j2] - Dref[i, j]) < 1e-5 + j2 += 1 + + def test_remove(self): + self.do_test_remove(1) + self.do_test_remove(2) + self.do_test_remove(4) + + + + + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_omp_threads.cpp b/core/src/index/thirdparty/faiss/tests/test_omp_threads.cpp new file mode 100644 index 0000000000..216a89dde1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_omp_threads.cpp @@ -0,0 +1,14 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +TEST(Threading, openmp) { + EXPECT_TRUE(faiss::check_openmp()); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py b/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py new file mode 100644 index 0000000000..1aa5da0ba4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py @@ -0,0 +1,14 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import absolute_import, division, print_function, unicode_literals +import faiss +import unittest + + +class TestOpenMP(unittest.TestCase): + + def test_openmp(self): + assert faiss.check_openmp() diff --git a/core/src/index/thirdparty/faiss/tests/test_ondisk_ivf.cpp b/core/src/index/thirdparty/faiss/tests/test_ondisk_ivf.cpp new file mode 100644 index 0000000000..c7f717fafe --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_ondisk_ivf.cpp @@ -0,0 +1,220 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + + +namespace { + +struct Tempfilename { + + static pthread_mutex_t mutex; + + std::string filename; + + Tempfilename (const char *prefix = nullptr) { + pthread_mutex_lock (&mutex); + char *cfname = tempnam (nullptr, prefix); + filename = cfname; + free(cfname); + pthread_mutex_unlock (&mutex); + } + + ~Tempfilename () { + if (access (filename.c_str(), F_OK)) { + unlink (filename.c_str()); + } + } + + const char *c_str() { + return filename.c_str(); + } + +}; + +pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; + +} // namespace + + +TEST(ONDISK, make_invlists) { + int nlist = 100; + int code_size = 32; + int nadd = 1000000; + std::unordered_map listnos; + + Tempfilename filename; + + faiss::OnDiskInvertedLists ivf ( + nlist, code_size, + filename.c_str()); + + { + std::vector code(32); + for (int i = 0; i < nadd; i++) { + double d = drand48(); + int list_no = int(nlist * d * d); // skewed distribution + int * ar = (int*)code.data(); + ar[0] = i; + ar[1] = list_no; + ivf.add_entry (list_no, i, code.data()); + listnos[i] = list_no; + } + } + + int ntot = 0; + for (int i = 0; i < nlist; i++) { + int size = ivf.list_size(i); + const faiss::Index::idx_t *ids = ivf.get_ids (i); + const uint8_t *codes = ivf.get_codes (i); + for (int j = 0; j < size; j++) { + faiss::Index::idx_t id = ids[j]; + const int * ar = (const int*)&codes[code_size * j]; + EXPECT_EQ (ar[0], id); + EXPECT_EQ (ar[1], i); + EXPECT_EQ (listnos[id], i); + ntot ++; + } + } + EXPECT_EQ (ntot, nadd); +}; + + +TEST(ONDISK, test_add) { + int d = 8; + int nlist = 30, nq = 200, nb = 1500, k = 10; + faiss::IndexFlatL2 quantizer(d); + { + std::vector x(d * nlist); + faiss::float_rand(x.data(), d * nlist, 12345); + quantizer.add(nlist, x.data()); + } + std::vector xb(d * nb); + faiss::float_rand(xb.data(), d * nb, 23456); + + faiss::IndexIVFFlat index(&quantizer, d, nlist); + index.add(nb, xb.data()); + + std::vector xq(d * nb); + faiss::float_rand(xq.data(), d * nq, 34567); + + std::vector ref_D (nq * k); + std::vector ref_I (nq * k); + + index.search (nq, xq.data(), k, + ref_D.data(), ref_I.data()); + + Tempfilename filename, filename2; + + // test add + search + { + 
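        // build a second IVFFlat whose inverted lists live on disk and check that its results match the in-memory index +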
faiss::IndexIVFFlat index2(&quantizer, d, nlist); + + faiss::OnDiskInvertedLists ivf ( + index.nlist, index.code_size, + filename.c_str()); + + index2.replace_invlists(&ivf); + + index2.add(nb, xb.data()); + + std::vector new_D (nq * k); + std::vector new_I (nq * k); + + index2.search (nq, xq.data(), k, + new_D.data(), new_I.data()); + + EXPECT_EQ (ref_D, new_D); + EXPECT_EQ (ref_I, new_I); + + write_index(&index2, filename2.c_str()); + + } + + // test io + { + faiss::Index *index3 = faiss::read_index(filename2.c_str()); + + std::vector new_D (nq * k); + std::vector new_I (nq * k); + + index3->search (nq, xq.data(), k, + new_D.data(), new_I.data()); + + EXPECT_EQ (ref_D, new_D); + EXPECT_EQ (ref_I, new_I); + + delete index3; + } + +}; + + + +// WARN this thest will run multithreaded only in opt mode +TEST(ONDISK, make_invlists_threaded) { + int nlist = 100; + int code_size = 32; + int nadd = 1000000; + + Tempfilename filename; + + faiss::OnDiskInvertedLists ivf ( + nlist, code_size, + filename.c_str()); + + std::vector list_nos (nadd); + + for (int i = 0; i < nadd; i++) { + double d = drand48(); + list_nos[i] = int(nlist * d * d); // skewed distribution + } + +#pragma omp parallel + { + std::vector code(32); +#pragma omp for + for (int i = 0; i < nadd; i++) { + int list_no = list_nos[i]; + int * ar = (int*)code.data(); + ar[0] = i; + ar[1] = list_no; + ivf.add_entry (list_no, i, code.data()); + } + } + + int ntot = 0; + for (int i = 0; i < nlist; i++) { + int size = ivf.list_size(i); + const faiss::Index::idx_t *ids = ivf.get_ids (i); + const uint8_t *codes = ivf.get_codes (i); + for (int j = 0; j < size; j++) { + faiss::Index::idx_t id = ids[j]; + const int * ar = (const int*)&codes[code_size * j]; + EXPECT_EQ (ar[0], id); + EXPECT_EQ (ar[1], i); + EXPECT_EQ (list_nos[id], i); + ntot ++; + } + } + EXPECT_EQ (ntot, nadd); + +}; diff --git a/core/src/index/thirdparty/faiss/tests/test_oom_exception.py b/core/src/index/thirdparty/faiss/tests/test_oom_exception.py new file mode 100644 index 0000000000..72dfdc7e47 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_oom_exception.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! /usr/bin/env python2 + +import sys +import faiss +import unittest +import resource + +class TestOOMException(unittest.TestCase): + + def test_outrageous_alloc(self): + # Disable test on OSX. + if sys.platform == "darwin": + return + + # https://github.com/facebookresearch/faiss/issues/758 + soft_as, hard_as = resource.getrlimit(resource.RLIMIT_AS) + # make sure that allocing more than 10G will fail + resource.setrlimit(resource.RLIMIT_AS, (10 * 1024 * 1024, hard_as)) + try: + x = faiss.IntVector() + try: + x.resize(10**11) # 400 G of RAM + except MemoryError: + pass # good, that's what we expect + else: + assert False, "should raise exception" + finally: + resource.setrlimit(resource.RLIMIT_AS, (soft_as, hard_as)) + + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_pairs_decoding.cpp b/core/src/index/thirdparty/faiss/tests/test_pairs_decoding.cpp new file mode 100644 index 0000000000..7857d0fb50 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_pairs_decoding.cpp @@ -0,0 +1,189 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
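The ONDISK tests above store inverted lists in a memory-mapped file through OnDiskInvertedLists and attach them with replace_invlists before adding data. A rough Python equivalent, with an illustrative file path and sizes:

    import numpy as np
    import faiss

    d = 32
    xb = np.random.rand(2000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, 64)
    index.train(xb)

    # keep the inverted lists in a memory-mapped file instead of RAM
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         '/tmp/demo.ivfdata')
    index.replace_invlists(invlists)

    index.add(xb)
    D, I = index.search(xb[:5], 4)
    print(I)

Note that the invlists object must stay alive as long as the index uses it; in a short script like this that is automatic.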
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + + +namespace { + +typedef faiss::Index::idx_t idx_t; + +/************************************************************* + * Test utils + *************************************************************/ + + +// dimension of the vectors to index +int d = 64; + +// size of the database we plan to index +size_t nb = 8000; + +// nb of queries +size_t nq = 200; + +std::vector make_data(size_t n) +{ + std::vector database (n * d); + for (size_t i = 0; i < n * d; i++) { + database[i] = drand48(); + } + return database; +} + +std::unique_ptr make_index(const char *index_type, + const std::vector & x) { + + auto index = std::unique_ptr ( + faiss::index_factory(d, index_type)); + index->train(nb, x.data()); + index->add(nb, x.data()); + return index; +} + +/************************************************************* + * Test functions for a given index type + *************************************************************/ + +bool test_search_centroid(const char *index_key) { + std::vector xb = make_data(nb); // database vectors + auto index = make_index(index_key, xb); + + /* First test: find the centroids associated to the database + vectors and make sure that each vector does indeed appear in + the inverted list corresponding to its centroid */ + + std::vector centroid_ids (nb); + faiss::ivflib::search_centroid( + index.get(), xb.data(), nb, centroid_ids.data()); + + const faiss::IndexIVF * ivf = faiss::ivflib::extract_index_ivf + (index.get()); + + for(int i = 0; i < nb; i++) { + bool found = false; + int list_no = centroid_ids[i]; + int list_size = ivf->invlists->list_size (list_no); + auto * list = ivf->invlists->get_ids (list_no); + + for(int j = 0; j < list_size; j++) { + if (list[j] == i) { + found = true; + break; + } + } + if(!found) return false; + } + return true; +} + +int test_search_and_return_centroids(const char *index_key) { + std::vector xb = make_data(nb); // database vectors + auto index = make_index(index_key, xb); + + std::vector centroid_ids (nb); + faiss::ivflib::search_centroid(index.get(), xb.data(), + nb, centroid_ids.data()); + + faiss::IndexIVF * ivf = + faiss::ivflib::extract_index_ivf (index.get()); + ivf->nprobe = 4; + + std::vector xq = make_data(nq); // database vectors + + int k = 5; + + // compute a reference search result + + std::vector refI (nq * k); + std::vector refD (nq * k); + index->search (nq, xq.data(), k, refD.data(), refI.data()); + + // compute search result + + std::vector newI (nq * k); + std::vector newD (nq * k); + + std::vector query_centroid_ids (nq); + std::vector result_centroid_ids (nq * k); + + faiss::ivflib::search_and_return_centroids(index.get(), + nq, xq.data(), k, + newD.data(), newI.data(), + query_centroid_ids.data(), + result_centroid_ids.data()); + + // first verify that we have the same result as the standard search + + if (newI != refI) { + return 1; + } + + // then check if the result ids are indeed in the inverted list + // they are supposed to be in + + for(int i = 0; i < nq * k; i++) { + int list_no = result_centroid_ids[i]; + int result_no = newI[i]; + + if (result_no < 0) continue; + + bool found = false; + + int list_size = ivf->invlists->list_size (list_no); + auto * list = ivf->invlists->get_ids (list_no); + + for(int j = 0; j < list_size; j++) { + if (list[j] == result_no) { + found = true; 
+ break; + } + } + if(!found) return 2; + } + return 0; +} + +} // namespace + + +/************************************************************* + * Test entry points + *************************************************************/ + +TEST(test_search_centroid, IVFFlat) { + bool ok = test_search_centroid("IVF32,Flat"); + EXPECT_TRUE(ok); +} + +TEST(test_search_centroid, PCAIVFFlat) { + bool ok = test_search_centroid("PCA16,IVF32,Flat"); + EXPECT_TRUE(ok); +} + +TEST(test_search_and_return_centroids, IVFFlat) { + int err = test_search_and_return_centroids("IVF32,Flat"); + EXPECT_NE(err, 1); + EXPECT_NE(err, 2); +} + +TEST(test_search_and_return_centroids, PCAIVFFlat) { + int err = test_search_and_return_centroids("PCA16,IVF32,Flat"); + EXPECT_NE(err, 1); + EXPECT_NE(err, 2); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_params_override.cpp b/core/src/index/thirdparty/faiss/tests/test_params_override.cpp new file mode 100644 index 0000000000..d6df2a4efe --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_params_override.cpp @@ -0,0 +1,231 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +using namespace faiss; + +namespace { + +typedef Index::idx_t idx_t; + + +// dimension of the vectors to index +int d = 32; + +// size of the database we plan to index +size_t nb = 1000; + +// nb of queries +size_t nq = 200; + + + +std::vector make_data(size_t n) +{ + std::vector database (n * d); + for (size_t i = 0; i < n * d; i++) { + database[i] = drand48(); + } + return database; +} + +std::unique_ptr make_index(const char *index_type, + MetricType metric, + const std::vector & x) +{ + std::unique_ptr index(index_factory(d, index_type, metric)); + index->train(nb, x.data()); + index->add(nb, x.data()); + return index; +} + +std::vector search_index(Index *index, const float *xq) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + index->search (nq, xq, k, D.data(), I.data()); + return I; +} + +std::vector search_index_with_params( + Index *index, const float *xq, IVFSearchParameters *params) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + ivflib::search_with_parameters (index, nq, xq, k, + D.data(), I.data(), params); + return I; +} + + + + +/************************************************************* + * Test functions for a given index type + *************************************************************/ + +int test_params_override (const char *index_key, MetricType metric) { + std::vector xb = make_data(nb); // database vectors + auto index = make_index(index_key, metric, xb); + //index->train(nb, xb.data()); + // index->add(nb, xb.data()); + std::vector xq = make_data(nq); + ParameterSpace ps; + ps.set_index_parameter(index.get(), "nprobe", 2); + auto res2ref = search_index(index.get(), xq.data()); + ps.set_index_parameter(index.get(), "nprobe", 9); + auto res9ref = search_index(index.get(), xq.data()); + ps.set_index_parameter(index.get(), "nprobe", 1); + + IVFSearchParameters params; + params.max_codes = 0; + params.nprobe = 2; + auto res2new = search_index_with_params(index.get(), xq.data(), ¶ms); + params.nprobe = 9; + auto res9new = search_index_with_params(index.get(), xq.data(), ¶ms); + + if (res2ref != res2new) + return 2; + + if (res9ref != res9new) + return 9; + + return 0; 
+} + + +} // namespace + + +/************************************************************* + * Test entry points + *************************************************************/ + +TEST(TPO, IVFFlat) { + int err1 = test_params_override ("IVF32,Flat", METRIC_L2); + EXPECT_EQ(err1, 0); + int err2 = test_params_override ("IVF32,Flat", METRIC_INNER_PRODUCT); + EXPECT_EQ(err2, 0); +} + +TEST(TPO, IVFPQ) { + int err1 = test_params_override ("IVF32,PQ8np", METRIC_L2); + EXPECT_EQ(err1, 0); + int err2 = test_params_override ("IVF32,PQ8np", METRIC_INNER_PRODUCT); + EXPECT_EQ(err2, 0); +} + +TEST(TPO, IVFSQ) { + int err1 = test_params_override ("IVF32,SQ8", METRIC_L2); + EXPECT_EQ(err1, 0); + int err2 = test_params_override ("IVF32,SQ8", METRIC_INNER_PRODUCT); + EXPECT_EQ(err2, 0); +} + +TEST(TPO, IVFFlatPP) { + int err1 = test_params_override ("PCA16,IVF32,SQ8", METRIC_L2); + EXPECT_EQ(err1, 0); + int err2 = test_params_override ("PCA16,IVF32,SQ8", METRIC_INNER_PRODUCT); + EXPECT_EQ(err2, 0); +} + + + +/************************************************************* + * Same for binary indexes + *************************************************************/ + + +std::vector make_data_binary(size_t n) { + std::vector database (n * d / 8); + for (size_t i = 0; i < n * d / 8; i++) { + database[i] = lrand48(); + } + return database; +} + +std::unique_ptr make_index(const char *index_type, + const std::vector & x) +{ + + auto index = std::unique_ptr + (dynamic_cast(index_binary_factory (d, index_type))); + index->train(nb, x.data()); + index->add(nb, x.data()); + return index; +} + +std::vector search_index(IndexBinaryIVF *index, const uint8_t *xq) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + index->search (nq, xq, k, D.data(), I.data()); + return I; +} + +std::vector search_index_with_params( + IndexBinaryIVF *index, const uint8_t *xq, IVFSearchParameters *params) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + + std::vector Iq(params->nprobe * nq); + std::vector Dq(params->nprobe * nq); + + index->quantizer->search(nq, xq, params->nprobe, + Dq.data(), Iq.data()); + index->search_preassigned(nq, xq, k, Iq.data(), Dq.data(), + D.data(), I.data(), + false, params); + return I; +} + +int test_params_override_binary (const char *index_key) { + std::vector xb = make_data_binary(nb); // database vectors + auto index = make_index (index_key, xb); + index->train(nb, xb.data()); + index->add(nb, xb.data()); + std::vector xq = make_data_binary(nq); + index->nprobe = 2; + auto res2ref = search_index(index.get(), xq.data()); + index->nprobe = 9; + auto res9ref = search_index(index.get(), xq.data()); + index->nprobe = 1; + + IVFSearchParameters params; + params.max_codes = 0; + params.nprobe = 2; + auto res2new = search_index_with_params(index.get(), xq.data(), ¶ms); + params.nprobe = 9; + auto res9new = search_index_with_params(index.get(), xq.data(), ¶ms); + + if (res2ref != res2new) + return 2; + + if (res9ref != res9new) + return 9; + + return 0; +} + +TEST(TPOB, IVF) { + int err1 = test_params_override_binary ("BIVF32"); + EXPECT_EQ(err1, 0); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp b/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp new file mode 100644 index 0000000000..6d11a69b6c --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp @@ -0,0 +1,98 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
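test_params_override.cpp checks that per-query IVFSearchParameters give the same results as changing nprobe on the index itself. From Python the per-index knob can be set either directly or through ParameterSpace; a small sketch with made-up data:

    import numpy as np
    import faiss

    d = 32
    xb = np.random.rand(1000, d).astype('float32')
    xq = np.random.rand(5, d).astype('float32')

    index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 32)
    index.train(xb)
    index.add(xb)

    index.nprobe = 2                                  # probe 2 of the 32 lists
    D2, I2 = index.search(xq, 10)

    faiss.ParameterSpace().set_index_parameter(index, 'nprobe', 9)
    D9, I9 = index.search(xq, 10)                     # usually closer to exact search

Passing the quantizer inline is safe here because the Python wrapper keeps a reference to it, which is exactly what test_referenced_objects.py below verifies.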
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include + +#include + +#include + + +namespace { + +const std::vector random_vector(size_t s) { + std::vector v(s, 0); + for (size_t i = 0; i < s; ++i) { + v[i] = rand(); + } + + return v; +} + +} // namespace + + +TEST(PQEncoderGeneric, encode) { + const int nsubcodes = 97; + const int minbits = 1; + const int maxbits = 24; + const std::vector values = random_vector(nsubcodes); + + for(int nbits = minbits; nbits <= maxbits; ++nbits) { + std::cerr << "nbits = " << nbits << std::endl; + + const uint64_t mask = (1ull << nbits) - 1; + std::unique_ptr codes( + new uint8_t[(nsubcodes * maxbits + 7) / 8] + ); + + // NOTE(hoss): Necessary scope to ensure trailing bits are flushed to mem. + { + faiss::ProductQuantizer::PQEncoderGeneric encoder(codes.get(), nbits); + for (const auto& v : values) { + encoder.encode(v & mask); + } + } + + faiss::ProductQuantizer::PQDecoderGeneric decoder(codes.get(), nbits); + for (int i = 0; i < nsubcodes; ++i) { + uint64_t v = decoder.decode(); + EXPECT_EQ(values[i] & mask, v); + } + } +} + + +TEST(PQEncoder8, encode) { + const int nsubcodes = 100; + const std::vector values = random_vector(nsubcodes); + const uint64_t mask = 0xFF; + std::unique_ptr codes(new uint8_t[nsubcodes]); + + faiss::ProductQuantizer::PQEncoder8 encoder(codes.get(), 8); + for (const auto& v : values) { + encoder.encode(v & mask); + } + + faiss::ProductQuantizer::PQDecoder8 decoder(codes.get(), 8); + for (int i = 0; i < nsubcodes; ++i) { + uint64_t v = decoder.decode(); + EXPECT_EQ(values[i] & mask, v); + } +} + + +TEST(PQEncoder16, encode) { + const int nsubcodes = 100; + const std::vector values = random_vector(nsubcodes); + const uint64_t mask = 0xFFFF; + std::unique_ptr codes(new uint8_t[2 * nsubcodes]); + + faiss::ProductQuantizer::PQEncoder16 encoder(codes.get(), 16); + for (const auto& v : values) { + encoder.encode(v & mask); + } + + faiss::ProductQuantizer::PQDecoder16 decoder(codes.get(), 16); + for (int i = 0; i < nsubcodes; ++i) { + uint64_t v = decoder.decode(); + EXPECT_EQ(values[i] & mask, v); + } +} diff --git a/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py b/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py new file mode 100644 index 0000000000..64cac3a1ef --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +"""make sure that the referenced objects are kept""" + +import numpy as np +import unittest +import faiss +import sys +import gc + +d = 10 +xt = np.random.rand(100, d).astype('float32') +xb = np.random.rand(20, d).astype('float32') + + +class TestReferenced(unittest.TestCase): + + def test_IndexIVF(self): + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFFlat(quantizer, d, 10) + index.train(xt) + index.add(xb) + del quantizer + gc.collect() + index.add(xb) + + def test_count_refs(self): + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFFlat(quantizer, d, 10) + refc1 = sys.getrefcount(quantizer) + del index + gc.collect() + refc2 = sys.getrefcount(quantizer) + assert refc2 == refc1 - 1 + + def test_IndexIVF_2(self): + index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 10) + index.train(xt) + index.add(xb) + + def test_IndexPreTransform(self): + ltrans = faiss.NormalizationTransform(d) + sub_index = faiss.IndexFlatL2(d) + index = faiss.IndexPreTransform(ltrans, sub_index) + index.add(xb) + del ltrans + gc.collect() + index.add(xb) + del sub_index + gc.collect() + index.add(xb) + + def test_IndexPreTransform_2(self): + sub_index = faiss.IndexFlatL2(d) + index = faiss.IndexPreTransform(sub_index) + ltrans = faiss.NormalizationTransform(d) + index.prepend_transform(ltrans) + index.add(xb) + del ltrans + gc.collect() + index.add(xb) + del sub_index + gc.collect() + index.add(xb) + + def test_IDMap(self): + sub_index = faiss.IndexFlatL2(d) + index = faiss.IndexIDMap(sub_index) + index.add_with_ids(xb, np.arange(len(xb))) + del sub_index + gc.collect() + index.add_with_ids(xb, np.arange(len(xb))) + + def test_shards(self): + index = faiss.IndexShards(d) + for i in range(3): + sub_index = faiss.IndexFlatL2(d) + sub_index.add(xb) + index.add_shard(sub_index) + gc.collect() + index.search(xb, 10) + + +dbin = 32 +xtbin = np.random.randint(256, size=(100, int(dbin / 8))).astype('uint8') +xbbin = np.random.randint(256, size=(20, int(dbin / 8))).astype('uint8') + + +class TestReferencedBinary(unittest.TestCase): + + def test_binary_ivf(self): + index = faiss.IndexBinaryIVF(faiss.IndexBinaryFlat(dbin), dbin, 10) + gc.collect() + index.train(xtbin) + + def test_wrap(self): + index = faiss.IndexBinaryFromFloat(faiss.IndexFlatL2(dbin)) + gc.collect() + index.add(xbbin) + +if __name__ == '__main__': + unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_sliding_ivf.cpp b/core/src/index/thirdparty/faiss/tests/test_sliding_ivf.cpp new file mode 100644 index 0000000000..90ab516c83 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_sliding_ivf.cpp @@ -0,0 +1,240 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +using namespace faiss; + +typedef Index::idx_t idx_t; + + +// dimension of the vectors to index +int d = 32; + +// nb of training vectors +size_t nt = 5000; + +// size of the database points per window step +size_t nb = 1000; + +// nb of queries +size_t nq = 200; + + +int total_size = 40; +int window_size = 10; + + + + + +std::vector make_data(size_t n) +{ + std::vector database (n * d); + for (size_t i = 0; i < n * d; i++) { + database[i] = drand48(); + } + return database; +} + +std::unique_ptr make_trained_index(const char *index_type) +{ + auto index = std::unique_ptr(index_factory(d, index_type)); + auto xt = make_data(nt * d); + index->train(nt, xt.data()); + ParameterSpace().set_index_parameter (index.get(), "nprobe", 4); + return index; +} + +std::vector search_index(Index *index, const float *xq) { + int k = 10; + std::vector I(k * nq); + std::vector D(k * nq); + index->search (nq, xq, k, D.data(), I.data()); + return I; +} + + + + + +/************************************************************* + * Test functions for a given index type + *************************************************************/ + + +// make a few slices of indexes that can be merged +void make_index_slices (const Index* trained_index, + std::vector > & sub_indexes) { + + for (int i = 0; i < total_size; i++) { + sub_indexes.emplace_back (clone_index (trained_index)); + + printf ("preparing sub-index # %d\n", i); + + Index * index = sub_indexes.back().get(); + + auto xb = make_data(nb * d); + std::vector ids (nb); + for (int j = 0; j < nb; j++) { + ids[j] = lrand48(); + } + index->add_with_ids (nb, xb.data(), ids.data()); + } + +} + +// build merged index explicitly at sliding window position i +Index *make_merged_index( + const Index* trained_index, + const std::vector > & sub_indexes, + int i) { + + Index * merged_index = clone_index (trained_index); + for (int j = i - window_size + 1; j <= i; j++) { + if (j < 0 || j >= total_size) continue; + std::unique_ptr sub_index ( + clone_index (sub_indexes[j].get())); + IndexIVF *ivf0 = ivflib::extract_index_ivf (merged_index); + IndexIVF *ivf1 = ivflib::extract_index_ivf (sub_index.get()); + ivf0->merge_from (*ivf1, 0); + merged_index->ntotal = ivf0->ntotal; + } + return merged_index; +} + +int test_sliding_window (const char *index_key) { + + std::unique_ptr trained_index = make_trained_index(index_key); + + // make the index slices + std::vector > sub_indexes; + + make_index_slices (trained_index.get(), sub_indexes); + + // now slide over the windows + std::unique_ptr index (clone_index (trained_index.get())); + ivflib::SlidingIndexWindow window (index.get()); + + auto xq = make_data (nq * d); + + for (int i = 0; i < total_size + window_size; i++) { + + printf ("doing step %d / %d\n", i, total_size + window_size); + + // update the index + window.step (i < total_size ? 
sub_indexes[i].get() : nullptr, + i >= window_size); + printf (" current n_slice = %d\n", window.n_slice); + + auto new_res = search_index (index.get(), xq.data()); + + std::unique_ptr merged_index ( + make_merged_index (trained_index.get(), sub_indexes, i)); + + auto ref_res = search_index (merged_index.get(), xq.data ()); + + EXPECT_EQ (ref_res.size(), new_res.size()); + + EXPECT_EQ (ref_res, new_res); + } + return 0; +} + + +int test_sliding_invlists (const char *index_key) { + + std::unique_ptr trained_index = make_trained_index(index_key); + + // make the index slices + std::vector > sub_indexes; + + make_index_slices (trained_index.get(), sub_indexes); + + // now slide over the windows + std::unique_ptr index (clone_index (trained_index.get())); + IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get()); + + auto xq = make_data (nq * d); + + for (int i = 0; i < total_size + window_size; i++) { + + printf ("doing step %d / %d\n", i, total_size + window_size); + + // update the index + std::vector ils; + for (int j = i - window_size + 1; j <= i; j++) { + if (j < 0 || j >= total_size) continue; + ils.push_back (ivflib::extract_index_ivf ( + sub_indexes[j].get())->invlists); + } + if (ils.size() == 0) continue; + + ConcatenatedInvertedLists *ci = + new ConcatenatedInvertedLists (ils.size(), ils.data()); + + // will be deleted by the index + index_ivf->replace_invlists (ci, true); + + printf (" nb invlists = %ld\n", ils.size()); + + auto new_res = search_index (index.get(), xq.data()); + + std::unique_ptr merged_index ( + make_merged_index (trained_index.get(), sub_indexes, i)); + + auto ref_res = search_index (merged_index.get(), xq.data ()); + + EXPECT_EQ (ref_res.size(), new_res.size()); + + size_t ndiff = 0; + for (size_t j = 0; j < ref_res.size(); j++) { + if (ref_res[j] != new_res[j]) + ndiff++; + } + printf(" nb differences: %ld / %ld\n", + ndiff, ref_res.size()); + EXPECT_EQ (ref_res, new_res); + } + return 0; +} + + + + + +/************************************************************* + * Test entry points + *************************************************************/ + +TEST(SlidingWindow, IVFFlat) { + test_sliding_window ("IVF32,Flat"); +} + +TEST(SlidingWindow, PCAIVFFlat) { + test_sliding_window ("PCA24,IVF32,Flat"); +} + +TEST(SlidingInvlists, IVFFlat) { + test_sliding_invlists ("IVF32,Flat"); +} + +TEST(SlidingInvlists, PCAIVFFlat) { + test_sliding_invlists ("PCA24,IVF32,Flat"); +} diff --git a/core/src/index/thirdparty/faiss/tests/test_standalone_codec.py b/core/src/index/thirdparty/faiss/tests/test_standalone_codec.py new file mode 100644 index 0000000000..95dc58c998 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_standalone_codec.py @@ -0,0 +1,314 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#! 
/usr/bin/env python2 + +""" test byte codecs """ + +from __future__ import print_function +import numpy as np +import unittest +import faiss +import tempfile +import os + +from common import get_dataset_2 + + +class TestEncodeDecode(unittest.TestCase): + + def do_encode_twice(self, factory_key): + d = 96 + nb = 1000 + nq = 0 + nt = 2000 + + xt, x, _ = get_dataset_2(d, nt, nb, nq) + + assert x.size > 0 + + codec = faiss.index_factory(d, factory_key) + + codec.train(xt) + + codes = codec.sa_encode(x) + x2 = codec.sa_decode(codes) + + codes2 = codec.sa_encode(x2) + + if 'IVF' not in factory_key: + self.assertTrue(np.all(codes == codes2)) + else: + # some rows are not reconstructed exactly because they + # flip into another quantization cell + nrowdiff = (codes != codes2).any(axis=1).sum() + self.assertTrue(nrowdiff < 10) + + x3 = codec.sa_decode(codes2) + if 'IVF' not in factory_key: + self.assertTrue(np.allclose(x2, x3)) + else: + diffs = np.abs(x2 - x3).sum(axis=1) + avg = np.abs(x2).sum(axis=1).mean() + diffs.sort() + assert diffs[-10] < avg * 1e-5 + + def test_SQ8(self): + self.do_encode_twice('SQ8') + + def test_IVFSQ8(self): + self.do_encode_twice('IVF256,SQ8') + + def test_PCAIVFSQ8(self): + self.do_encode_twice('PCAR32,IVF256,SQ8') + + def test_PQ6x8(self): + self.do_encode_twice('PQ6np') + + def test_PQ6x6(self): + self.do_encode_twice('PQ6x6np') + + def test_IVFPQ6x8np(self): + self.do_encode_twice('IVF512,PQ6np') + + def test_LSH(self): + self.do_encode_twice('LSHrt') + + +class TestIndexEquiv(unittest.TestCase): + + def do_test(self, key1, key2): + d = 96 + nb = 1000 + nq = 0 + nt = 2000 + + xt, x, _ = get_dataset_2(d, nt, nb, nq) + + codec_ref = faiss.index_factory(d, key1) + codec_ref.train(xt) + + code_ref = codec_ref.sa_encode(x) + x_recons_ref = codec_ref.sa_decode(code_ref) + + codec_new = faiss.index_factory(d, key2) + codec_new.pq = codec_ref.pq + + # replace quantizer, avoiding mem leak + oldq = codec_new.q1.quantizer + oldq.this.own() + codec_new.q1.own_fields = False + codec_new.q1.quantizer = codec_ref.quantizer + codec_new.is_trained = True + + code_new = codec_new.sa_encode(x) + x_recons_new = codec_new.sa_decode(code_new) + + self.assertTrue(np.all(code_new == code_ref)) + self.assertTrue(np.all(x_recons_new == x_recons_ref)) + + codec_new_2 = faiss.deserialize_index( + faiss.serialize_index(codec_new)) + + code_new = codec_new_2.sa_encode(x) + x_recons_new = codec_new_2.sa_decode(code_new) + + self.assertTrue(np.all(code_new == code_ref)) + self.assertTrue(np.all(x_recons_new == x_recons_ref)) + + def test_IVFPQ(self): + self.do_test("IVF512,PQ6np", "Residual512,PQ6") + + def test_IMI(self): + self.do_test("IMI2x5,PQ6np", "Residual2x5,PQ6") + + +class TestAccuracy(unittest.TestCase): + """ comparative accuracy of a few types of indexes """ + + def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)): + d = 96 + nb = 1000 + nq = 0 + nt = 2000 + + xt, x, _ = get_dataset_2(d, nt, nb, nq) + + errs = [] + + for factory_string in lowac, highac: + + codec = faiss.index_factory(d, factory_string) + print('sa codec: code size %d' % codec.sa_code_size()) + codec.train(xt) + + codes = codec.sa_encode(x) + x2 = codec.sa_decode(codes) + + err = ((x - x2) ** 2).sum() + errs.append(err) + + print(errs) + self.assertGreater(errs[0], errs[1]) + + self.assertGreater(max_errs[0], errs[0]) + self.assertGreater(max_errs[1], errs[1]) + + # just a small IndexLattice I/O test + if 'Lattice' in highac: + codec2 = faiss.deserialize_index( + faiss.serialize_index(codec)) + codes = 
codec.sa_encode(x) + x3 = codec.sa_decode(codes) + self.assertTrue(np.all(x2 == x3)) + + def test_SQ(self): + self.compare_accuracy('SQ4', 'SQ8') + + def test_SQ2(self): + self.compare_accuracy('SQ6', 'SQ8') + + def test_SQ3(self): + self.compare_accuracy('SQ8', 'SQfp16') + + def test_PQ(self): + self.compare_accuracy('PQ6x8np', 'PQ8x8np') + + def test_PQ2(self): + self.compare_accuracy('PQ8x6np', 'PQ8x8np') + + def test_IVFvsPQ(self): + self.compare_accuracy('PQ8np', 'IVF256,PQ8np') + + def test_Lattice(self): + # measured low/high: 20946.244, 5277.483 + self.compare_accuracy('ZnLattice3x10_4', + 'ZnLattice3x20_4', + (22000, 5400)) + + def test_Lattice2(self): + # here the difference is actually tiny + # measured errs: [16403.072, 15967.735] + self.compare_accuracy('ZnLattice3x12_1', + 'ZnLattice3x12_7', + (18000, 16000)) + + +swig_ptr = faiss.swig_ptr + + +class LatticeTest(unittest.TestCase): + """ Low-level lattice tests """ + + def test_repeats(self): + rs = np.random.RandomState(123) + dim = 32 + for i in range(1000): + vec = np.floor((rs.rand(dim) ** 7) * 3).astype('float32') + vecs = vec.copy() + vecs.sort() + repeats = faiss.Repeats(dim, swig_ptr(vecs)) + rr = [repeats.repeats.at(i) for i in range(repeats.repeats.size())] + # print([(r.val, r.n) for r in rr]) + code = repeats.encode(swig_ptr(vec)) + #print(vec, code) + vec2 = np.zeros(dim, dtype='float32') + repeats.decode(code, swig_ptr(vec2)) + # print(vec2) + assert np.all(vec == vec2) + + def test_ZnSphereCodec_encode_centroid(self): + dim = 8 + r2 = 5 + ref_codec = faiss.ZnSphereCodec(dim, r2) + codec = faiss.ZnSphereCodecRec(dim, r2) + # print(ref_codec.nv, codec.nv) + assert ref_codec.nv == codec.nv + s = set() + for i in range(ref_codec.nv): + c = np.zeros(dim, dtype='float32') + ref_codec.decode(i, swig_ptr(c)) + code = codec.encode_centroid(swig_ptr(c)) + assert 0 <= code < codec.nv + s.add(code) + assert len(s) == codec.nv + + def test_ZnSphereCodecRec(self): + dim = 16 + r2 = 6 + codec = faiss.ZnSphereCodecRec(dim, r2) + # print("nv=", codec.nv) + for i in range(codec.nv): + c = np.zeros(dim, dtype='float32') + codec.decode(i, swig_ptr(c)) + code = codec.encode_centroid(swig_ptr(c)) + assert code == i + + def run_ZnSphereCodecAlt(self, dim, r2): + # dim = 32 + # r2 = 14 + codec = faiss.ZnSphereCodecAlt(dim, r2) + rs = np.random.RandomState(123) + n = 100 + codes = rs.randint(codec.nv, size=n).astype('uint64') + x = np.empty((n, dim), dtype='float32') + codec.decode_multi(n, swig_ptr(codes), swig_ptr(x)) + codes2 = np.empty(n, dtype='uint64') + codec.encode_multi(n, swig_ptr(x), swig_ptr(codes2)) + + assert np.all(codes == codes2) + + def test_ZnSphereCodecAlt32(self): + self.run_ZnSphereCodecAlt(32, 14) + + def test_ZnSphereCodecAlt24(self): + self.run_ZnSphereCodecAlt(24, 14) + + +class TestBitstring(unittest.TestCase): + """ Low-level bit string tests """ + + def test_rw(self): + rs = np.random.RandomState(1234) + nbyte = 1000 + sz = 0 + + bs = np.ones(nbyte, dtype='uint8') + bw = faiss.BitstringWriter(swig_ptr(bs), nbyte) + + if False: + ctrl = [(7, 0x35), (13, 0x1d74)] + for nbit, x in ctrl: + bw.write(x, nbit) + else: + ctrl = [] + while True: + nbit = int(1 + 62 * rs.rand() ** 4) + if sz + nbit > nbyte * 8: + break + x = rs.randint(1 << nbit) + bw.write(x, nbit) + ctrl.append((nbit, x)) + sz += nbit + + bignum = 0 + sz = 0 + for nbit, x in ctrl: + bignum |= x << sz + sz += nbit + + for i in range(nbyte): + self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i]) + + for i in range(nbyte): + print(bin(bs[i] + 
256)[3:], end=' ') + print() + + br = faiss.BitstringReader(swig_ptr(bs), nbyte) + + for nbit, xref in ctrl: + xnew = br.read(nbit) + print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) + self.assertTrue(xnew == xref) diff --git a/core/src/index/thirdparty/faiss/tests/test_threaded_index.cpp b/core/src/index/thirdparty/faiss/tests/test_threaded_index.cpp new file mode 100644 index 0000000000..7cad760c09 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_threaded_index.cpp @@ -0,0 +1,253 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +struct TestException : public std::exception { }; + +struct MockIndex : public faiss::Index { + explicit MockIndex(idx_t d) : + faiss::Index(d) { + resetMock(); + } + + void resetMock() { + flag = false; + nCalled = 0; + xCalled = nullptr; + kCalled = 0; + distancesCalled = nullptr; + labelsCalled = nullptr; + } + + void add(idx_t n, const float* x) override { + nCalled = n; + xCalled = x; + } + + void search(idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels) const override { + nCalled = n; + xCalled = x; + kCalled = k; + distancesCalled = distances; + labelsCalled = labels; + } + + void reset() override { } + + bool flag; + + mutable idx_t nCalled; + mutable const float* xCalled; + mutable idx_t kCalled; + mutable float* distancesCalled; + mutable idx_t* labelsCalled; +}; + +template +struct MockThreadedIndex : public faiss::ThreadedIndex { + using idx_t = faiss::Index::idx_t; + + explicit MockThreadedIndex(bool threaded) + : faiss::ThreadedIndex(threaded) { + } + + void add(idx_t, const float*) override { } + void search(idx_t, const float*, idx_t, float*, idx_t*) const override {} + void reset() override {} +}; + +} + +TEST(ThreadedIndex, SingleException) { + std::vector> idxs; + + for (int i = 0; i < 3; ++i) { + idxs.emplace_back(new MockIndex(1)); + } + + auto fn = + [](int i, MockIndex* index) { + if (i == 1) { + throw TestException(); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(i * 250)); + + index->flag = true; + } + }; + + // Try with threading and without + for (bool threaded : {true, false}) { + // clear flags + for (auto& idx : idxs) { + idx->resetMock(); + } + + MockThreadedIndex ti(threaded); + for (auto& idx : idxs) { + ti.addIndex(idx.get()); + } + + // The second index should throw + EXPECT_THROW(ti.runOnIndex(fn), TestException); + + // Index 0 and 2 should have processed + EXPECT_TRUE(idxs[0]->flag); + EXPECT_TRUE(idxs[2]->flag); + } +} + +TEST(ThreadedIndex, MultipleException) { + std::vector> idxs; + + for (int i = 0; i < 3; ++i) { + idxs.emplace_back(new MockIndex(1)); + } + + auto fn = + [](int i, MockIndex* index) { + if (i < 2) { + throw TestException(); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(i * 250)); + + index->flag = true; + } + }; + + // Try with threading and without + for (bool threaded : {true, false}) { + // clear flags + for (auto& idx : idxs) { + idx->resetMock(); + } + + MockThreadedIndex ti(threaded); + for (auto& idx : idxs) { + ti.addIndex(idx.get()); + } + + // Multiple indices threw an exception that was aggregated into a + // FaissException + EXPECT_THROW(ti.runOnIndex(fn), faiss::FaissException); + + // Index 2 should have processed + EXPECT_TRUE(idxs[2]->flag); + } +} + 
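The standalone codec tests (test_standalone_codec.py above) use indexes as pure vector codecs through sa_encode / sa_decode. A minimal round-trip sketch, assuming an SQ8 codec and illustrative data:

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(2000, d).astype('float32')
    x = np.random.rand(10, d).astype('float32')

    codec = faiss.index_factory(d, 'SQ8')   # 8-bit scalar quantizer, 1 byte per dimension
    codec.train(xt)

    codes = codec.sa_encode(x)               # uint8 array of shape (10, codec.sa_code_size())
    x2 = codec.sa_decode(codes)              # lossy reconstruction
    print(codes.shape, np.abs(x - x2).max())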
+TEST(ThreadedIndex, TestReplica) { + int numReplicas = 5; + int n = 10 * numReplicas; + int d = 3; + int k = 6; + + // Try with threading and without + for (bool threaded : {true, false}) { + std::vector> idxs; + faiss::IndexReplicas replica(d); + + for (int i = 0; i < numReplicas; ++i) { + idxs.emplace_back(new MockIndex(d)); + replica.addIndex(idxs.back().get()); + } + + std::vector x(n * d); + std::vector distances(n * k); + std::vector labels(n * k); + + replica.add(n, x.data()); + + for (int i = 0; i < idxs.size(); ++i) { + EXPECT_EQ(idxs[i]->nCalled, n); + EXPECT_EQ(idxs[i]->xCalled, x.data()); + } + + for (auto& idx : idxs) { + idx->resetMock(); + } + + replica.search(n, x.data(), k, distances.data(), labels.data()); + + for (int i = 0; i < idxs.size(); ++i) { + auto perReplica = n / idxs.size(); + + EXPECT_EQ(idxs[i]->nCalled, perReplica); + EXPECT_EQ(idxs[i]->xCalled, x.data() + i * perReplica * d); + EXPECT_EQ(idxs[i]->kCalled, k); + EXPECT_EQ(idxs[i]->distancesCalled, + distances.data() + (i * perReplica) * k); + EXPECT_EQ(idxs[i]->labelsCalled, + labels.data() + (i * perReplica) * k); + } + } +} + +TEST(ThreadedIndex, TestShards) { + int numShards = 7; + int d = 3; + int n = 10 * numShards; + int k = 6; + + // Try with threading and without + for (bool threaded : {true, false}) { + std::vector> idxs; + faiss::IndexShards shards(d, threaded); + + for (int i = 0; i < numShards; ++i) { + idxs.emplace_back(new MockIndex(d)); + shards.addIndex(idxs.back().get()); + } + + std::vector x(n * d); + std::vector distances(n * k); + std::vector labels(n * k); + + shards.add(n, x.data()); + + for (int i = 0; i < idxs.size(); ++i) { + auto perShard = n / idxs.size(); + + EXPECT_EQ(idxs[i]->nCalled, perShard); + EXPECT_EQ(idxs[i]->xCalled, x.data() + i * perShard * d); + } + + for (auto& idx : idxs) { + idx->resetMock(); + } + + shards.search(n, x.data(), k, distances.data(), labels.data()); + + for (int i = 0; i < idxs.size(); ++i) { + auto perShard = n / idxs.size(); + + EXPECT_EQ(idxs[i]->nCalled, n); + EXPECT_EQ(idxs[i]->xCalled, x.data()); + EXPECT_EQ(idxs[i]->kCalled, k); + // There is a temporary buffer used for shards + EXPECT_EQ(idxs[i]->distancesCalled, + idxs[0]->distancesCalled + i * k * n); + EXPECT_EQ(idxs[i]->labelsCalled, + idxs[0]->labelsCalled + i * k * n); + } + } +} diff --git a/core/src/index/thirdparty/faiss/tests/test_transfer_invlists.cpp b/core/src/index/thirdparty/faiss/tests/test_transfer_invlists.cpp new file mode 100644 index 0000000000..8766d88e6f --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_transfer_invlists.cpp @@ -0,0 +1,159 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
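TestReplica and TestShards below exercise how IndexReplicas fans queries out over copies and IndexShards splits the database over sub-indexes. The same wrappers are usable from Python; a small sketch with pre-filled flat shards:

    import numpy as np
    import faiss

    d = 32
    xb = np.random.rand(900, d).astype('float32')
    xq = np.random.rand(5, d).astype('float32')

    shards = faiss.IndexShards(d)
    for part in np.array_split(xb, 3):       # one pre-filled sub-index per slice
        sub = faiss.IndexFlatL2(d)
        sub.add(part)
        shards.add_shard(sub)

    D, I = shards.search(xq, 4)              # every shard is searched, results are merged
    print(I)                                 # ids here are local to each shard (0..299)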
+ */ + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace { + +// parameters to use for the test +int d = 64; +size_t nb = 1000; +size_t nq = 100; +size_t nt = 500; +int k = 10; +int nlist = 40; + +using namespace faiss; + +typedef faiss::Index::idx_t idx_t; + + +std::vector get_data (size_t nb, int seed) { + std::vector x (nb * d); + float_randn (x.data(), nb * d, seed); + return x; +} + + +void test_index_type(const char *factory_string) { + + // transfer inverted lists in nslice slices + int nslice = 3; + + /**************************************************************** + * trained reference index + ****************************************************************/ + + std::unique_ptr trained (index_factory (d, factory_string)); + + { + auto xt = get_data (nt, 123); + trained->train (nt, xt.data()); + } + + // sample nq query vectors to check if results are the same + auto xq = get_data (nq, 818); + + + /**************************************************************** + * source index + ***************************************************************/ + std::unique_ptr src_index (clone_index (trained.get())); + + { // add some data to source index + auto xb = get_data (nb, 245); + src_index->add (nb, xb.data()); + } + + ParameterSpace().set_index_parameter (src_index.get(), "nprobe", 4); + + // remember reference search result on source index + std::vector Iref (nq * k); + std::vector Dref (nq * k); + src_index->search (nq, xq.data(), k, Dref.data(), Iref.data()); + + /**************************************************************** + * destination index -- should be replaced by source index + ***************************************************************/ + + std::unique_ptr dst_index (clone_index (trained.get())); + + { // initial state: filled in with some garbage + int nb2 = nb + 10; + auto xb = get_data (nb2, 366); + dst_index->add (nb2, xb.data()); + } + + std::vector Inew (nq * k); + std::vector Dnew (nq * k); + + ParameterSpace().set_index_parameter (dst_index.get(), "nprobe", 4); + + // transfer from source to destination in nslice slices + for (int sl = 0; sl < nslice; sl++) { + + // so far, the indexes are different + dst_index->search (nq, xq.data(), k, Dnew.data(), Inew.data()); + EXPECT_TRUE (Iref != Inew); + EXPECT_TRUE (Dref != Dnew); + + // range of inverted list indices to transfer + long i0 = sl * nlist / nslice; + long i1 = (sl + 1) * nlist / nslice; + + std::vector data_to_transfer; + { + std::unique_ptr il + (ivflib::get_invlist_range (src_index.get(), i0, i1)); + // serialize inverted lists + VectorIOWriter wr; + write_InvertedLists (il.get(), &wr); + data_to_transfer.swap (wr.data); + } + + // transfer data here from source machine to dest machine + + { + VectorIOReader reader; + reader.data.swap (data_to_transfer); + + // deserialize inverted lists + std::unique_ptr il + (dynamic_cast + (read_InvertedLists (&reader))); + + // swap inverted lists. Block searches here! 
+ { + ivflib::set_invlist_range (dst_index.get(), i0, i1, il.get()); + } + } + + } + EXPECT_EQ (dst_index->ntotal, src_index->ntotal); + + // now, the indexes are the same + dst_index->search (nq, xq.data(), k, Dnew.data(), Inew.data()); + EXPECT_TRUE (Iref == Inew); + EXPECT_TRUE (Dref == Dnew); + +} + +} // namespace + + +TEST(TRANS, IVFFlat) { + test_index_type ("IVF40,Flat"); +} + +TEST(TRANS, IVFFlatPreproc) { + test_index_type ("PCAR32,IVF40,Flat"); +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/1-Flat.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/1-Flat.cpp new file mode 100644 index 0000000000..f8632bb6c8 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/1-Flat.cpp @@ -0,0 +1,98 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + float *xb = new float[d * nb]; + float *xq = new float[d * nq]; + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) + xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + } + + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) + xq[d * i + j] = drand48(); + xq[d * i] += i / 1000.; + } + + faiss::IndexFlatL2 index(d); // call constructor + printf("is_trained = %s\n", index.is_trained ? "true" : "false"); + index.add(nb, xb); // add vectors to the index + printf("ntotal = %ld\n", index.ntotal); + + int k = 4; + + { // sanity check: search 5 first vectors of xb + long *I = new long[k * 5]; + float *D = new float[k * 5]; + + index.search(5, xb, k, D, I); + + // print results + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("D=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%7g ", D[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + index.search(nq, xq, k, D, I); + + // print results + printf("I (5 first results)=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (5 last results)=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + + + delete [] xb; + delete [] xq; + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp new file mode 100644 index 0000000000..ce13f1d1ae --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp @@ -0,0 +1,81 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
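tutorial/cpp/1-Flat.cpp above is the classic brute-force example. The Python equivalent is only a few lines; the sizes are illustrative:

    import numpy as np
    import faiss

    d, nb, nq, k = 64, 10000, 100, 4
    xb = np.random.rand(nb, d).astype('float32')
    xq = np.random.rand(nq, d).astype('float32')

    index = faiss.IndexFlatL2(d)     # exact L2 search, no training step
    index.add(xb)
    D, I = index.search(xq, k)       # D: squared L2 distances, I: ids of the k nearest
    print(I[:5])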
+ */ + +#include +#include +#include + +#include +#include + + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + float *xb = new float[d * nb]; + float *xq = new float[d * nq]; + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) + xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + } + + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) + xq[d * i + j] = drand48(); + xq[d * i] += i / 1000.; + } + + + int nlist = 100; + int k = 4; + + faiss::IndexFlatL2 quantizer(d); // the other index + faiss::IndexIVFFlat index(&quantizer, d, nlist, faiss::METRIC_L2); + // here we specify METRIC_L2, by default it performs inner-product search + assert(!index.is_trained); + index.train(nb, xb); + assert(index.is_trained); + index.add(nb, xb); + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + index.nprobe = 10; + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + + + delete [] xb; + delete [] xq; + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp new file mode 100644 index 0000000000..5a76367f48 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp @@ -0,0 +1,93 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include + + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + float *xb = new float[d * nb]; + float *xq = new float[d * nq]; + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) + xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + } + + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) + xq[d * i + j] = drand48(); + xq[d * i] += i / 1000.; + } + + + int nlist = 100; + int k = 4; + int m = 8; // bytes per vector + faiss::IndexFlatL2 quantizer(d); // the other index + faiss::IndexIVFPQ index(&quantizer, d, nlist, m, 8); + // here we specify METRIC_L2, by default it performs inner-product search + index.train(nb, xb); + index.add(nb, xb); + + { // sanity check + long *I = new long[k * 5]; + float *D = new float[k * 5]; + + index.search(5, xb, k, D, I); + + printf("I=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("D=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%7g ", D[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + index.nprobe = 10; + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + + + delete [] xb; + delete [] xq; + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/4-GPU.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/4-GPU.cpp new file mode 100644 index 0000000000..49c5c8a06e --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/4-GPU.cpp @@ -0,0 +1,119 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include + + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + float *xb = new float[d * nb]; + float *xq = new float[d * nq]; + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) + xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + } + + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) + xq[d * i + j] = drand48(); + xq[d * i] += i / 1000.; + } + + faiss::gpu::StandardGpuResources res; + + // Using a flat index + + faiss::gpu::GpuIndexFlatL2 index_flat(&res, d); + + printf("is_trained = %s\n", index_flat.is_trained ? 
"true" : "false"); + index_flat.add(nb, xb); // add vectors to the index + printf("ntotal = %ld\n", index_flat.ntotal); + + int k = 4; + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + index_flat.search(nq, xq, k, D, I); + + // print results + printf("I (5 first results)=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (5 last results)=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + // Using an IVF index + + int nlist = 100; + faiss::gpu::GpuIndexIVFFlat index_ivf(&res, d, nlist, faiss::METRIC_L2); + // here we specify METRIC_L2, by default it performs inner-product search + + assert(!index_ivf.is_trained); + index_ivf.train(nb, xb); + assert(index_ivf.is_trained); + index_ivf.add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", index_ivf.is_trained ? "true" : "false"); + printf("ntotal = %ld\n", index_ivf.ntotal); + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + index_ivf.search(nq, xq, k, D, I); + + // print results + printf("I (5 first results)=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (5 last results)=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + + delete [] xb; + delete [] xq; + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/5-GPU.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/5-GPU.cpp new file mode 100644 index 0000000000..212fb53f1c --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/5-GPU.cpp @@ -0,0 +1,234 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include + +#include "faiss/IndexIVF.h" +#include "faiss/IndexFlat.h" +#include "faiss/index_io.h" +#include "faiss/gpu/GpuIndexFlat.h" +#include "faiss/gpu/StandardGpuResources.h" +#include "faiss/gpu/GpuAutoTune.h" +#include "faiss/gpu/GpuCloner.h" +#include "faiss/gpu/GpuClonerOptions.h" +#include "faiss/gpu/GpuIndexIVF.h" + +#include "faiss/impl/FaissAssert.h" +#include "faiss/impl/AuxIndexStructures.h" + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" + +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexScalarQuantizer.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/utils/distances.h" +#include "faiss/index_factory.h" + +using namespace faiss; + +#define PRINT_RESULT 0 + +void print_result(const char* unit, long number, long k, long nq, long *I) { + printf("%s: I (2 first results)=\n", unit); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("%s: I (2 last results)=\n", unit); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + + +int main() { + const char* filename = "index500k.index"; + +#if PRINT_RESULT + int number = 8; +#endif + + int d = 512; // dimension + int nq = 10; // nb of queries + int nprobe = 1; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + faiss::distance_compute_blas_threshold = 800; + + faiss::gpu::StandardGpuResources res; + + int k = 8; + std::shared_ptr gpu_index_ivf_ptr; + + const char* index_description = "IVF16384,SQ8"; +// const char* index_description = "IVF3276,SQ8"; + + faiss::Index *cpu_index = nullptr; + faiss::IndexIVF* cpu_ivf_index = nullptr; + if((access(filename,F_OK))==-1) { + // create database + long nb = 500000; // database size +// printf("-----------------------\n"); + long size = d * nb; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < nb; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + gpu_index_ivf_ptr = std::shared_ptr(device_index); + + assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? 
"true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + } else { + cpu_index = faiss::read_index(filename); + } + + cpu_ivf_index = dynamic_cast(cpu_index); + if(cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + auto init_gpu =[&](int device_id, faiss::gpu::GpuClonerOptions* option) { + option->allInGpu = true; + faiss::Index* tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); + delete tmp_index; + }; + + auto gpu_executor = [&](int device_id, faiss::gpu::GpuClonerOptions* option) { + auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); + delete tmp_index; + double t0 = getmillisecs (); + { + // cpu to gpu + option->allInGpu = true; + + tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + } + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + if(option->allInGpu) { + faiss::gpu::GpuIndexIVF* gpu_index_ivf = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf->setNumProbes(nprobe); + for(long i = 0; i < 1; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + } else { + faiss::IndexIVFScalarQuantizer* index_ivf = + dynamic_cast(gpu_index_ivf_ptr.get()); + index_ivf->nprobe = nprobe; + for(long i = 0; i < 1; ++ i) { + double t2 = getmillisecs(); + index_ivf->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("- GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); + + }; + printf("----------------------------------\n"); + auto cpu_executor = [&]() { // search xq + printf("CPU: \n"); + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("CPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + }; + + for(long i = 0; i < 1; ++ i) { + cpu_executor(); + } + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + +// init_gpu(0, &option0); +// init_gpu(1, &option1); + +// double tx = getmillisecs(); + std::thread t1(gpu_executor, 0, &option0); + std::thread t2(gpu_executor, 1, &option1); + t1.join(); + t2.join(); +// double ty = getmillisecs(); +// printf("Total GPU execution time: %0.2f\n", ty - tx); + + delete [] xq; + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/6-GPU.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/6-GPU.cpp new file mode 100644 index 0000000000..f992884cba --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/6-GPU.cpp @@ -0,0 +1,255 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include + +#include "faiss/IndexIVF.h" +#include "faiss/IndexFlat.h" +#include "faiss/index_io.h" +#include "faiss/gpu/GpuIndexFlat.h" +#include "faiss/gpu/StandardGpuResources.h" +#include "faiss/gpu/GpuAutoTune.h" +#include "faiss/gpu/GpuCloner.h" +#include "faiss/gpu/GpuClonerOptions.h" +#include "faiss/gpu/GpuIndexIVF.h" +#include "faiss/gpu/GpuIndexIVFSQHybrid.h" + +#include "faiss/impl/FaissAssert.h" +#include "faiss/impl/AuxIndexStructures.h" + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" + +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexSQHybrid.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/utils/distances.h" +#include "faiss/index_factory.h" + +using namespace faiss; + +#define PRINT_RESULT 0 + +void print_result(const char* unit, long number, long k, long nq, long *I) { + printf("%s: I (2 first results)=\n", unit); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("%s: I (2 last results)=\n", unit); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + + +int main() { + const char* filename = "index500k-h.index"; + +#if PRINT_RESULT + int number = 8; +#endif + + int d = 512; // dimension + int nq = 10; // nb of queries + int nprobe = 1; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + faiss::distance_compute_blas_threshold = 800; + + faiss::gpu::StandardGpuResources res; + + int k = 8; + std::shared_ptr gpu_index_ivf_ptr; + + const char* index_description = "IVF16384,SQ8Hybrid"; +// const char* index_description = "IVF3276,SQ8"; + + faiss::Index *cpu_index = nullptr; + faiss::IndexIVF* cpu_ivf_index = nullptr; + if((access(filename,F_OK))==-1) { + // create database + long nb = 500000; // database size +// printf("-----------------------\n"); + long size = d * nb; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < nb; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + gpu_index_ivf_ptr = std::shared_ptr(device_index); + + assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? 
"true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + } else { + cpu_index = faiss::read_index(filename); + } + + cpu_ivf_index = dynamic_cast(cpu_index); + if(cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + auto gpu_executor = [&](int device_id, faiss::gpu::GpuClonerOptions* option, faiss::IndexComposition* index_composition) { + auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + delete tmp_index; + double t0 = getmillisecs (); + { + // cpu to gpu + tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + } + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for(long i = 0; i < 1; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); + + }; + printf("----------------------------------\n"); + auto cpu_executor = [&](faiss::IndexComposition* index_composition) { // search xq + printf("CPU: \n"); + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + + faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); + if(is_gpu_flat_index == nullptr) { + delete ivf_index->quantizer; + ivf_index->quantizer = index_composition->quantizer; + } + + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("CPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + }; + + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + + faiss::IndexComposition index_composition0; + index_composition0.index = cpu_index; + index_composition0.quantizer = nullptr; + index_composition0.mode = 0; // only quantizer + + // Copy quantizer to GPU 0 + auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + + faiss::IndexComposition index_composition1; + index_composition1.index = cpu_index; + index_composition1.quantizer = nullptr; + index_composition1.mode = 0; // only quantizer + + // Copy quantizer to GPU 1 + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + + std::thread t_cpu1(cpu_executor, &index_composition0); + t_cpu1.join(); + std::thread t_cpu2(cpu_executor, &index_composition1); + t_cpu2.join(); + + index_composition0.mode = 2; // only data + index_composition1.mode = 2; // only data + + index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + +// double tx = getmillisecs(); + std::thread 
t1(gpu_executor, 0, &option0, &index_composition0); + std::thread t2(gpu_executor, 1, &option1, &index_composition1); + t1.join(); + t2.join(); + +// std::thread t3(gpu_executor, 0, &option0, &index_composition0); +// std::thread t4(gpu_executor, 1, &option1, &index_composition1); +// t3.join(); +// t4.join(); +// double ty = getmillisecs(); +// printf("Total GPU execution time: %0.2f\n", ty - tx); + cpu_executor(&index_composition0); + cpu_executor(&index_composition1); + + delete [] xq; + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/6-RUN.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/6-RUN.cpp new file mode 100644 index 0000000000..2c09fef266 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/6-RUN.cpp @@ -0,0 +1,247 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include "faiss/IndexIVF.h" +#include "faiss/IndexFlat.h" +#include "faiss/index_io.h" +#include "faiss/gpu/GpuIndexFlat.h" +#include "faiss/gpu/StandardGpuResources.h" +#include "faiss/gpu/GpuAutoTune.h" +#include "faiss/gpu/GpuCloner.h" +#include "faiss/gpu/GpuClonerOptions.h" +#include "faiss/gpu/GpuIndexIVF.h" +#include "faiss/gpu/GpuIndexIVFSQHybrid.h" + +#include "faiss/impl/FaissAssert.h" +#include "faiss/impl/AuxIndexStructures.h" + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" + +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexSQHybrid.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/utils/distances.h" +#include "faiss/index_factory.h" + +using namespace faiss; + +#define PRINT_RESULT 0 +std::shared_ptr gpu_index_ivf_ptr; +const int d = 512; // dimension +const int nq = 1000; // nb of queries +const int nprobe = 1; +int k = 8; + +void +print_result(const char* unit, long number, long k, long nq, long* I) { + printf("%s: I (2 first results)=\n", unit); + for (int i = 0; i < number; i++) { + for (int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("%s: I (2 last results)=\n", unit); + for (int i = nq - number; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + +void +cpu_executor(faiss::Index* cpu_index, float*& xq) { // search xq + printf("CPU: \n"); + long* I = new long[k * nq]; + float* D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("CPU", number, k, nq, I); +#endif + delete[] I; + delete[] D; +}; + +void +hybrid_executor(faiss::Index* cpu_index, + faiss::IndexComposition* index_composition, + float*& xq) { // search xq + printf("HYBRID: \n"); + long* I = new long[k * nq]; + float* D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + + faiss::gpu::GpuIndexFlat* is_gpu_flat_index = 
dynamic_cast(ivf_index->quantizer); + if (is_gpu_flat_index == nullptr) { + delete ivf_index->quantizer; + ivf_index->quantizer = index_composition->quantizer; + } + + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("HYBRID execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("HYBRID", number, k, nq, I); +#endif + delete[] I; + delete[] D; +}; + +void +gpu_executor(faiss::gpu::StandardGpuResources& res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + float*& xq) { + auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + delete tmp_index; + double t0 = getmillisecs(); + { + // cpu to gpu + tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + } + double t1 = getmillisecs(); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + { + long* I = new long[k * nq]; + float* D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid + * gpu_index_ivf_hybrid = dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for (long i = 0; i < 1; ++i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete[] I; + delete[] D; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); + +}; + +int +main() { + const char* filename = "index500k-h.index"; + faiss::gpu::StandardGpuResources res; + +#if PRINT_RESULT + int number = 8; +#endif + + float* xq = new float[d * nq]; + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + faiss::distance_compute_blas_threshold = 800; + + faiss::Index* cpu_index = nullptr; + faiss::IndexIVF* cpu_ivf_index = nullptr; + if ((access(filename, F_OK)) == -1) { + printf("index file not found."); + exit(-1); + } else { + cpu_index = faiss::read_index(filename); + } + + cpu_ivf_index = dynamic_cast(cpu_index); + if (cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + printf("============================\n"); + cpu_executor(cpu_index, xq); + cpu_executor(cpu_index, xq); + printf("============================\n"); + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + + faiss::IndexComposition index_composition0; + index_composition0.index = cpu_index; + index_composition0.quantizer = nullptr; + index_composition0.mode = 0; // only quantizer + + // Copy quantizer to GPU 0 + auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + + faiss::IndexComposition index_composition1; + index_composition1.index = cpu_index; + index_composition1.quantizer = nullptr; + index_composition1.mode = 0; // only quantizer + + // Copy quantizer to GPU 1 + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + + hybrid_executor(cpu_index, &index_composition0, xq); + hybrid_executor(cpu_index, &index_composition1, xq); + + printf("============================\n"); + + index_composition0.mode = 2; // only data + index_composition1.mode = 2; // only data + + index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, 
&option1); + delete index1; + + gpu_executor(res, 0, &option0, &index_composition0, xq); + gpu_executor(res, 1, &option1, &index_composition1, xq); + + printf("============================\n"); + + hybrid_executor(cpu_index, &index_composition0, xq); + hybrid_executor(cpu_index, &index_composition1, xq); + + delete[] xq; + gpu_index_ivf_ptr = nullptr; + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/7-GPU.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/7-GPU.cpp new file mode 100644 index 0000000000..4ab91f27db --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/7-GPU.cpp @@ -0,0 +1,347 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include "faiss/IndexIVF.h" +#include "faiss/IndexFlat.h" +#include "faiss/index_io.h" +#include "faiss/gpu/GpuIndexFlat.h" +#include "faiss/gpu/StandardGpuResources.h" +#include "faiss/gpu/GpuAutoTune.h" +#include "faiss/gpu/GpuClonerOptions.h" +#include "faiss/gpu/GpuCloner.h" +#include "faiss/gpu/GpuIndexIVF.h" +#include "faiss/gpu/GpuIndexIVFSQHybrid.h" + + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" + +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexSQHybrid.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/utils/distances.h" +#include "faiss/clone_index.h" +#include "faiss/index_factory.h" + +using namespace faiss; + +#define PRINT_RESULT 0 + +void print_result(const char* unit, long number, long k, long nq, long *I) { + printf("%s: I (2 first results)=\n", unit); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("%s: I (2 last results)=\n", unit); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + +void +GpuLoad(faiss::gpu::StandardGpuResources* res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + std::shared_ptr& gpu_index_ivf_ptr + ) { + + double t0 = getmillisecs (); + + auto tmp_index = faiss::gpu::index_cpu_to_gpu(res, device_id, index_composition, option); + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); +} + +void +GpuExecutor( + std::shared_ptr& gpu_index_ivf_ptr, + faiss::gpu::StandardGpuResources& res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq) { + double t0 = getmillisecs (); + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for(long i = 0; i < 4; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT 
+ print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + gpu_index_ivf_ptr = nullptr; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); +} + + +void +GpuExecutor( + faiss::gpu::StandardGpuResources& res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq) { + auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + delete tmp_index; + double t0 = getmillisecs (); + // cpu to gpu + tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + auto gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for(long i = 0; i < 4; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + gpu_index_ivf_ptr = nullptr; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); +} + +void +CpuExecutor( + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq, + faiss::Index *cpu_index) { + printf("CPU: \n"); + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + + faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); + if(is_gpu_flat_index == nullptr) { + delete ivf_index->quantizer; + ivf_index->quantizer = index_composition->quantizer; + } + + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("CPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; +} + +int main() { + const char* filename = "index500k-h.index"; + +#if PRINT_RESULT + int number = 8; +#endif + + int d = 512; // dimension + int nq = 1000; // nb of queries + int nprobe = 8; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + faiss::distance_compute_blas_threshold = 800; + + faiss::gpu::StandardGpuResources res; + + int k = 1000; + std::shared_ptr gpu_index_ivf_ptr; + + const char* index_description = "IVF16384,SQ8Hybrid"; +// const char* index_description = "IVF3276,SQ8"; + + faiss::Index *cpu_index = nullptr; + faiss::IndexIVF* cpu_ivf_index = nullptr; + if((access(filename,F_OK))==-1) { + // create database + long nb = 500000; // database size +// printf("-----------------------\n"); + long size = d * nb; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < nb; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + gpu_index_ivf_ptr = std::shared_ptr(device_index); + + 
assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? "true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + } else { + cpu_index = faiss::read_index(filename); + } + + cpu_ivf_index = dynamic_cast(cpu_index); + if(cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + + option0.allInGpu = true; + option1.allInGpu = true; + + faiss::IndexComposition index_composition0; + index_composition0.index = cpu_index; + index_composition0.quantizer = nullptr; + index_composition0.mode = 1; // only quantizer + + // Copy quantizer to GPU 0 + auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + + faiss::IndexComposition index_composition1; + index_composition1.index = cpu_index; + index_composition1.quantizer = nullptr; + index_composition1.mode = 1; // only quantizer + + // Copy quantizer to GPU 1 + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + +// std::thread t_cpu1(cpu_executor, &index_composition0); +// t_cpu1.join(); +// std::thread t_cpu2(cpu_executor, &index_composition1); +// t_cpu2.join(); + +// index_composition0.mode = 2; // only data +// index_composition1.mode = 2; // only data +// +// index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); +// delete index1; +// index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); +// delete index1; + +// double tx = getmillisecs(); +// std::thread t1(gpu_executor, 0, &option0, &index_composition0); +// std::thread t2(gpu_executor, 1, &option1, &index_composition1); +// t1.join(); +// t2.join(); +// for(long i = 0; i < 10; ++ i) { +// std::shared_ptr gpu_index_ptr00; +// std::shared_ptr gpu_index_ptr01; +// +// std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00)); +//// std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1)); +// std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr01)); +// +// t00.join(); +// +// GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); +// +// t01.join(); +//// t2.join(); +// GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); +//// GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq); +// } + +// std::thread t3(gpu_executor, 0, &option0, &index_composition0); +// std::thread t4(gpu_executor, 1, &option1, &index_composition1); +// t3.join(); +// t4.join(); +// double ty = getmillisecs(); +// printf("Total GPU execution time: %0.2f\n", ty - tx); + + CpuExecutor(&index_composition0, nq, nprobe, k, xq, cpu_index); + CpuExecutor(&index_composition1, nq, nprobe, k, xq, cpu_index); + + ///// + delete [] xq; + return 0; +} + diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/8-GPU.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/8-GPU.cpp new file mode 100644 index 0000000000..11f49a09cc --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/8-GPU.cpp @@ -0,0 +1,479 @@ +/** + * Copyright (c) Facebook, Inc. 
and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include "faiss/IndexIVF.h" +#include "faiss/IndexFlat.h" +#include "faiss/index_io.h" +#include "faiss/gpu/GpuIndexFlat.h" +#include "faiss/gpu/StandardGpuResources.h" +#include "faiss/gpu/GpuAutoTune.h" +#include "faiss/gpu/GpuClonerOptions.h" +#include "faiss/gpu/GpuCloner.h" +#include "faiss/gpu/GpuIndexIVF.h" +#include "faiss/gpu/GpuIndexIVFSQHybrid.h" + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" + +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexSQHybrid.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/utils/distances.h" +#include "faiss/clone_index.h" +#include "faiss/index_factory.h" + +using namespace faiss; + +#define PRINT_RESULT 0 + +void print_result(const char* unit, long number, long k, long nq, long *I) { + printf("%s: I (2 first results)=\n", unit); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("%s: I (2 last results)=\n", unit); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + +void +GpuLoad(faiss::gpu::StandardGpuResources* res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + std::shared_ptr& gpu_index_ivf_ptr + ) { + + double t0 = getmillisecs (); + + auto tmp_index = faiss::gpu::index_cpu_to_gpu(res, device_id, index_composition, option); + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); +} + +void +GpuExecutor( + std::shared_ptr& gpu_index_ivf_ptr, + faiss::gpu::StandardGpuResources& res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq) { + double t0 = getmillisecs (); + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for(long i = 0; i < 4; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + gpu_index_ivf_ptr = nullptr; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); +} + + +void +GpuExecutor( + faiss::gpu::StandardGpuResources& res, + int device_id, + faiss::gpu::GpuClonerOptions* option, + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq) { + auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + delete tmp_index; + double t0 = getmillisecs (); + // cpu to gpu + tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); + 
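+    // The clone-and-delete above appears to be a warm-up pass, so that t0..t1 only
+    // measures the steady-state CPU-to-GPU copy; the raw pointer from this second
+    // clone is wrapped in a shared_ptr below so it is released automatically.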
auto gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + { + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf_hybrid->setNumProbes(nprobe); + for(long i = 0; i < 4; ++ i) { + double t2 = getmillisecs(); + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); + } + + // print results +#if PRINT_RESULT + print_result("GPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; + gpu_index_ivf_ptr = nullptr; + } + double t4 = getmillisecs(); + + printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); +} + +void +CpuExecutor( + faiss::IndexComposition* index_composition, + int nq, + int nprobe, + int k, + float* xq, + faiss::Index *cpu_index) { + printf("CPU: \n"); + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + double t4 = getmillisecs(); + faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + + faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); + if(is_gpu_flat_index == nullptr) { + delete ivf_index->quantizer; + ivf_index->quantizer = index_composition->quantizer; + } + + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if PRINT_RESULT + print_result("CPU", number, k, nq, I); +#endif + delete [] I; + delete [] D; +} + +void create_index(const char* filename, const char* index_description, long db_size, long d) { + faiss::gpu::StandardGpuResources res; + if((access(filename,F_OK))==-1) { + // create database + long size = d * db_size; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < db_size; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_INNER_PRODUCT); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + std::shared_ptr gpu_index_ivf_ptr = std::shared_ptr(device_index); + + assert(!device_index->is_trained); + device_index->train(db_size, xb); + assert(device_index->is_trained); + device_index->add(db_size, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? 
"true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + faiss::Index *cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + } +} + +void execute_index(const char* filename, int d, int nq, int nprobe, int k, float* xq) { + faiss::gpu::StandardGpuResources res; + faiss::Index* cpu_index = faiss::read_index(filename); + faiss::IndexIVF* cpu_ivf_index = dynamic_cast(cpu_index); + + if(cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + + option0.allInGpu = true; + option1.allInGpu = true; + + faiss::IndexComposition index_composition0; + index_composition0.index = cpu_index; + index_composition0.quantizer = nullptr; + index_composition0.mode = 1; // only quantizer + + // Copy quantizer to GPU 0 + auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + + faiss::IndexComposition index_composition1; + index_composition1.index = cpu_index; + index_composition1.quantizer = nullptr; + index_composition1.mode = 1; // only quantizer + + // Copy quantizer to GPU 1 + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + + // std::thread t_cpu1(cpu_executor, &index_composition0); + // t_cpu1.join(); + // std::thread t_cpu2(cpu_executor, &index_composition1); + // t_cpu2.join(); + + index_composition0.mode = 2; // only data + index_composition1.mode = 2; // only data + + // index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + // delete index1; + // index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + // delete index1; + + // double tx = getmillisecs(); + // std::thread t1(gpu_executor, 0, &option0, &index_composition0); + // std::thread t2(gpu_executor, 1, &option1, &index_composition1); + // t1.join(); + // t2.join(); + for(long i = 0; i < 1; ++ i) { + std::shared_ptr gpu_index_ptr00; + std::shared_ptr gpu_index_ptr01; + + std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00)); + // std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1)); + std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr01)); + + t00.join(); + + GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); + + t01.join(); + // t2.join(); + GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); + // GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq); + } + + delete index_composition0.quantizer; + delete index_composition1.quantizer; + delete cpu_index; +} + +int main() { + const char* filename = "index500k-h.index"; + int d = 512; // dimension + int nq = 1000; // nb of queries + int nprobe = 16; + int k = 1000; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + + long db_size = 500000; + const char* index_description = "IVF16384,SQ8Hybrid"; + create_index(filename, index_description, db_size, d); + for(long i = 0; i < 1000; ++ i) { + execute_index(filename, d, nq, nprobe, k, xq); + } + delete[] xq; + xq = nullptr; + return 0; +} + +/* +int main() { + const char* filename = "index500k-h.index"; + +#if PRINT_RESULT + int number = 8; +#endif + + int d = 512; // dimension + int nq = 1000; 
// nb of queries + int nprobe = 16; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); + } + } + faiss::distance_compute_blas_threshold = 800; + + faiss::gpu::StandardGpuResources res; + + int k = 1000; + std::shared_ptr gpu_index_ivf_ptr; + + const char* index_description = "IVF16384,SQ8Hybrid"; +// const char* index_description = "IVF3276,SQ8"; + + faiss::Index *cpu_index = nullptr; + faiss::IndexIVF* cpu_ivf_index = nullptr; + if((access(filename,F_OK))==-1) { + // create database + long nb = 500000; // database size +// printf("-----------------------\n"); + long size = d * nb; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < nb; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_INNER_PRODUCT); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + gpu_index_ivf_ptr = std::shared_ptr(device_index); + + assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? "true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + } else { + cpu_index = faiss::read_index(filename); + } + + cpu_ivf_index = dynamic_cast(cpu_index); + if(cpu_ivf_index != nullptr) { + cpu_ivf_index->to_readonly(); + } + + faiss::gpu::GpuClonerOptions option0; + faiss::gpu::GpuClonerOptions option1; + + option0.allInGpu = true; + option1.allInGpu = true; + + faiss::IndexComposition index_composition0; + index_composition0.index = cpu_index; + index_composition0.quantizer = nullptr; + index_composition0.mode = 1; // only quantizer + + // Copy quantizer to GPU 0 + auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + + faiss::IndexComposition index_composition1; + index_composition1.index = cpu_index; + index_composition1.quantizer = nullptr; + index_composition1.mode = 1; // only quantizer + + // Copy quantizer to GPU 1 + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + +// std::thread t_cpu1(cpu_executor, &index_composition0); +// t_cpu1.join(); +// std::thread t_cpu2(cpu_executor, &index_composition1); +// t_cpu2.join(); + + index_composition0.mode = 2; // only data + index_composition1.mode = 2; // only data + + index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); + delete index1; + index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); + delete index1; + +// double tx = getmillisecs(); +// std::thread t1(gpu_executor, 0, &option0, &index_composition0); +// std::thread t2(gpu_executor, 1, &option1, &index_composition1); +// t1.join(); +// t2.join(); + for(long i = 0; i < 10; ++ i) { + std::shared_ptr gpu_index_ptr00; + std::shared_ptr gpu_index_ptr01; + + std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00)); +// std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1)); + std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, 
std::ref(gpu_index_ptr01)); + + t00.join(); + + GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); + + t01.join(); +// t2.join(); + GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); +// GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq); + } + +// std::thread t3(gpu_executor, 0, &option0, &index_composition0); +// std::thread t4(gpu_executor, 1, &option1, &index_composition1); +// t3.join(); +// t4.join(); +// double ty = getmillisecs(); +// printf("Total GPU execution time: %0.2f\n", ty - tx); +// CpuExecutor(&index_composition0, nq, nprobe, k, xq, cpu_index); +// CpuExecutor(&index_composition1, nq, nprobe, k, xq, cpu_index); + + ///// + delete [] xq; + return 0; +} +*/ diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile b/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile new file mode 100644 index 0000000000..81e041c4cc --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +-include ../../makefile.inc + +CPU_TARGETS = 1-Flat 2-IVFFlat 3-IVFPQ +GPU_TARGETS = 6-RUN 7-GPU 8-GPU + +default: cpu + +all: cpu gpu + +cpu: $(CPU_TARGETS) + +gpu: $(GPU_TARGETS) + +%: %.cpp ../../libfaiss.a + $(CXX) $(CXXFLAGS) $(CPPFLAGS) -o $@ $^ $(LDFLAGS) -I../../include $(LIBS) + +clean: + rm -f $(CPU_TARGETS) $(GPU_TARGETS) + +.PHONY: all cpu default gpu clean diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/faiss_test.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/faiss_test.cpp new file mode 100644 index 0000000000..6377f133c2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/faiss_test.cpp @@ -0,0 +1,378 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + + +#include "faiss/FaissAssert.h" +#include "faiss/AuxIndexStructures.h" + +#include "faiss/IndexFlat.h" +#include "faiss/VectorTransform.h" +#include "faiss/IndexLSH.h" +#include "faiss/IndexPQ.h" +#include "faiss/IndexIVF.h" +#include "faiss/IndexIVFPQ.h" +#include "faiss/IndexIVFFlat.h" +#include "faiss/IndexIVFSpectralHash.h" +#include "faiss/MetaIndexes.h" +#include "faiss/IndexScalarQuantizer.h" +#include "faiss/IndexHNSW.h" +#include "faiss/OnDiskInvertedLists.h" +#include "faiss/IndexBinaryFlat.h" +#include "faiss/IndexBinaryFromFloat.h" +#include "faiss/IndexBinaryHNSW.h" +#include "faiss/IndexBinaryIVF.h" +#include "faiss/gpu/GpuIndexIVFSQ.h" +#include "faiss/utils.h" + + +using namespace faiss; + +void +generate_file(const char *filename, + long nb, + long dimension, + std::string index_desc, + faiss::gpu::StandardGpuResources &res) { + long size = dimension * nb; + float *xb = new float[size]; + printf("size: %lf(GB)\n", (size * sizeof(float)) / (3 * 1024.0 * 1024 * 1024)); + for (long i = 0; i < nb; i++) { + for (long j = 0; j < dimension; j++) { + float rand = drand48(); + xb[dimension * i + j] = rand; + } + } + + faiss::Index *ori_index = faiss::index_factory(dimension, index_desc.c_str(), faiss::METRIC_L2); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); + + faiss::Index *cpu_index = faiss::gpu::index_gpu_to_cpu((device_index)); + 
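+    // write_index serializes host-side indexes, so the trained GPU index is copied
+    // back to the CPU before being written to `filename`.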
faiss::write_index(cpu_index, filename); + printf("index: %s is stored successfully.\n", filename); + delete[] xb; + + return; +} + +faiss::Index * +get_index(const char *filename) { + return faiss::read_index(filename); +} + +void +execute_on_gpu(faiss::Index *index, float *xq, long nq, long k, long nprobe, + faiss::gpu::StandardGpuResources &res, long* I, float* D) { + + double t0 = getmillisecs(); + + faiss::gpu::CpuToGpuClonerOptions option; + option.readonly = true; + faiss::Index *tmp_index = faiss::gpu::cpu_to_gpu(&res, 0, index, &option); + std::shared_ptr gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + double t1 = getmillisecs(); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + + + double t2 = getmillisecs(); + faiss::gpu::GpuIndexIVF *gpu_index_ivf = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf->setNumProbes(nprobe); + + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("GPU execution time: %0.2f\n", t3 - t2); +} + +void execute_on_cpu(faiss::Index *index, float* xq, long nq, long k, long nprobe, long* I, float* D) { + faiss::IndexIVF* ivf_index = + dynamic_cast(index); + ivf_index->nprobe = nprobe; + index->search(nq, xq, k, D, I); +} + +float *construct_queries(long nq, long dimension) { + float *xq = new float[dimension * nq]; + for (int i = 0; i < nq; i++) { + for (int j = 0; j < dimension; j++) { + xq[dimension * i + j] = drand48(); + } + } + return xq; +} + +void print_result(long number, long nq, long k, long *I, float *D) { + printf("I (%ld first results)=\n", number); + for (int i = 0; i < number; i++) { + for (int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (%ld last results)=\n", number); + for (int i = nq - number; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +} + +void faiss_setting() { + faiss::distance_compute_blas_threshold = 800; +} + +int main() { + const char *filename = "index5.index"; + +#if 0 + long dimension = 512; + long nb = 6000000; + long nq = 1000; + long topk = 16; + long print_number = 8; + long nprobe = 32; + + std::string index_desc = "IVF16384,SQ8"; + faiss::gpu::StandardGpuResources res; + if ((access(filename, F_OK)) == -1) { + printf("file doesn't exist, create one\n"); + generate_file(filename, nb, dimension, index_desc, res); + } + + // Construct queries + float *xq = construct_queries(nq, dimension); + + // Read index + faiss::Index *index = get_index(filename); + + // Execute on GPU + long *I = new long[topk * nq]; + float *D = new float[topk * nq]; + execute_on_gpu(index, xq, nq, topk, nprobe, res, I, D); + + // Print results + print_result(print_number, nq, topk, I, D); + delete[] I; I = nullptr; + delete[] D; D = nullptr; + + // Execute on CPU + I = new long[topk * nq]; + D = new float[topk * nq]; + execute_on_cpu(index, xq, nq, topk, nprobe, I, D); + + // Print results + print_result(print_number, nq, topk, I, D); + delete[] I; + delete[] D; + + return 0; +#else + int number = 8; + int d = 512; // dimension + int nq = 1000; // nb of queries + int nprobe = 16; + float *xq = new float[d * nq]; + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) { + xq[d * i + j] = drand48(); +// printf("%lf ", xq[d * i + j]); + } +// xq[d * i] += i / 1000.; +// printf("\n"); + } + faiss::distance_compute_blas_threshold = 800; + + faiss::gpu::StandardGpuResources res; + + int k = 16; + std::shared_ptr gpu_index_ivf_ptr; + + const char* index_description = "IVF16384,SQ8"; + // const char* 
index_description = "IVF3276,Flat"; +// Index *index_factory (int d, const char *description, +// MetricType metric = METRIC_L2); + + faiss::Index *cpu_index = nullptr; + if((access(filename,F_OK))==-1) { + long nb = 6000000; + long dimension = d; + printf("file doesn't exist, create one\n"); + generate_file(filename, nb, dimension, index_description, res); + /* + // create database + // database size +// printf("-----------------------\n"); + long size = d * nb; + float *xb = new float[size]; + memset(xb, 0, size * sizeof(float)); + printf("size: %ld\n", (size * sizeof(float)) ); + for(long i = 0; i < nb; i++) { + for(long j = 0; j < d; j++) { + float rand = drand48(); + xb[d * i + j] = rand; +// printf("%lf ", xb[d * i + j]); + } +// xb[d * i] += i / 1000.; +// printf("\n"); + } + + // Using an IVF index + // here we specify METRIC_L2, by default it performs inner-product search + + faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); + auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); + + gpu_index_ivf_ptr = std::shared_ptr(device_index); + + assert(!device_index->is_trained); + device_index->train(nb, xb); + assert(device_index->is_trained); + device_index->add(nb, xb); // add vectors to the index + + printf("is_trained = %s\n", device_index->is_trained ? "true" : "false"); + printf("ntotal = %ld\n", device_index->ntotal); + + cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); + faiss::write_index(cpu_index, filename); + printf("index.index is stored successfully.\n"); + delete [] xb; + */ + } else { + cpu_index = get_index(filename); + } + + { + // cpu to gpu + double t0 = getmillisecs (); + faiss::gpu::CpuToGpuClonerOptions option; + option.readonly = true; + faiss::Index* tmp_index = faiss::gpu::cpu_to_gpu(&res, 0, cpu_index, &option); + + gpu_index_ivf_ptr = std::shared_ptr(tmp_index); + + // Gpu index dump + + auto gpu_index_ivf_sq_ptr = dynamic_cast(tmp_index); +// gpu_index_ivf_sq_ptr->dump(); + double t1 = getmillisecs (); + printf("CPU to GPU loading time: %0.2f\n", t1 - t0); + // // Cpu index dump + // auto cpu_index_ivf_sq_ptr = dynamic_cast(cpu_index); + // cpu_index_ivf_sq_ptr->dump(); + } + + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + double t2 = getmillisecs(); + faiss::gpu::GpuIndexIVF* gpu_index_ivf = + dynamic_cast(gpu_index_ivf_ptr.get()); + gpu_index_ivf->setNumProbes(nprobe); + + gpu_index_ivf_ptr->search(nq, xq, k, D, I); + double t3 = getmillisecs(); + printf("GPU execution time: %0.2f\n", t3 - t2); + + // print results + printf("GPU: \n"); +#if 0 + printf("GPU: I (2 first results)=\n"); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("GPU: %5ld(%f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + + printf("GPU: I (2 last results)=\n"); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("GPU: %5ld(%f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } +#else + printf("I (2 first results)=\n"); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (2 last results)=\n"); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +#endif + delete [] I; + delete [] D; + } + printf("----------------------------------\n"); + { // search xq + printf("CPU: \n"); + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + double t4 = getmillisecs(); + 
faiss::IndexIVF* ivf_index = + dynamic_cast(cpu_index); + ivf_index->nprobe = nprobe; + cpu_index->search(nq, xq, k, D, I); + double t5 = getmillisecs(); + printf("CPU execution time: %0.2f\n", t5 - t4); +#if 0 + // print results + printf("CPU: I (2 first results)=\n"); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("CPU: %5ld(%f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + + printf("CPU: I (2 last results)=\n"); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("CPU: %5ld(%f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } +#else + // print results + printf("I (2 first results)=\n"); + for(int i = 0; i < number; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (2 last results)=\n"); + for(int i = nq - number; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } +#endif + delete [] I; + delete [] D; + } + + + delete [] xq; + return 0; +#endif +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/tutorial/python/1-Flat.py b/core/src/index/thirdparty/faiss/tutorial/python/1-Flat.py new file mode 100644 index 0000000000..584c7bc703 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/python/1-Flat.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +import faiss # make faiss available +index = faiss.IndexFlatL2(d) # build the index +print(index.is_trained) +index.add(xb) # add vectors to the index +print(index.ntotal) + +k = 4 # we want to see 4 nearest neighbors +D, I = index.search(xb[:5], k) # sanity check +print(I) +print(D) +D, I = index.search(xq, k) # actual search +print(I[:5]) # neighbors of the 5 first queries +print(I[-5:]) # neighbors of the 5 last queries diff --git a/core/src/index/thirdparty/faiss/tutorial/python/2-IVFFlat.py b/core/src/index/thirdparty/faiss/tutorial/python/2-IVFFlat.py new file mode 100644 index 0000000000..a4ac0c4d1f --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/python/2-IVFFlat.py @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. 
+ +import faiss + +nlist = 100 +k = 4 +quantizer = faiss.IndexFlatL2(d) # the other index +index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) +# here we specify METRIC_L2, by default it performs inner-product search + +assert not index.is_trained +index.train(xb) +assert index.is_trained + +index.add(xb) # add may be a bit slower as well +D, I = index.search(xq, k) # actual search +print(I[-5:]) # neighbors of the 5 last queries +index.nprobe = 10 # default nprobe is 1, try a few more +D, I = index.search(xq, k) +print(I[-5:]) # neighbors of the 5 last queries diff --git a/core/src/index/thirdparty/faiss/tutorial/python/3-IVFPQ.py b/core/src/index/thirdparty/faiss/tutorial/python/3-IVFPQ.py new file mode 100644 index 0000000000..e502239ca4 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/python/3-IVFPQ.py @@ -0,0 +1,32 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +import faiss + +nlist = 100 +m = 8 +k = 4 +quantizer = faiss.IndexFlatL2(d) # this remains the same +index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8) + # 8 specifies that each sub-vector is encoded as 8 bits +index.train(xb) +index.add(xb) +D, I = index.search(xb[:5], k) # sanity check +print(I) +print(D) +index.nprobe = 10 # make comparable with experiment above +D, I = index.search(xq, k) # search +print(I[-5:]) diff --git a/core/src/index/thirdparty/faiss/tutorial/python/4-GPU.py b/core/src/index/thirdparty/faiss/tutorial/python/4-GPU.py new file mode 100644 index 0000000000..6f5e37e535 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/python/4-GPU.py @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. 
+ +import faiss # make faiss available + +res = faiss.StandardGpuResources() # use a single GPU + +## Using a flat index + +index_flat = faiss.IndexFlatL2(d) # build a flat (CPU) index + +# make it a flat GPU index +gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat) + +gpu_index_flat.add(xb) # add vectors to the index +print(gpu_index_flat.ntotal) + +k = 4 # we want to see 4 nearest neighbors +D, I = gpu_index_flat.search(xq, k) # actual search +print(I[:5]) # neighbors of the 5 first queries +print(I[-5:]) # neighbors of the 5 last queries + + +## Using an IVF index + +nlist = 100 +quantizer = faiss.IndexFlatL2(d) # the other index +index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) +# here we specify METRIC_L2, by default it performs inner-product search + +# make it an IVF GPU index +gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf) + +assert not gpu_index_ivf.is_trained +gpu_index_ivf.train(xb) # add vectors to the index +assert gpu_index_ivf.is_trained + +gpu_index_ivf.add(xb) # add vectors to the index +print(gpu_index_ivf.ntotal) + +k = 4 # we want to see 4 nearest neighbors +D, I = gpu_index_ivf.search(xq, k) # actual search +print(I[:5]) # neighbors of the 5 first queries +print(I[-5:]) # neighbors of the 5 last queries diff --git a/core/src/index/thirdparty/faiss/tutorial/python/5-Multiple-GPUs.py b/core/src/index/thirdparty/faiss/tutorial/python/5-Multiple-GPUs.py new file mode 100644 index 0000000000..c458587ce9 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/python/5-Multiple-GPUs.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +import faiss # make faiss available + +ngpus = faiss.get_num_gpus() + +print("number of GPUs:", ngpus) + +cpu_index = faiss.IndexFlatL2(d) + +gpu_index = faiss.index_cpu_to_all_gpus( # build the index + cpu_index +) + +gpu_index.add(xb) # add vectors to the index +print(gpu_index.ntotal) + +k = 4 # we want to see 4 nearest neighbors +D, I = gpu_index.search(xq, k) # actual search +print(I[:5]) # neighbors of the 5 first queries +print(I[-5:]) # neighbors of the 5 last queries diff --git a/core/src/index/thirdparty/faiss/utils/Heap.cpp b/core/src/index/thirdparty/faiss/utils/Heap.cpp new file mode 100644 index 0000000000..4a5de5ad36 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/Heap.cpp @@ -0,0 +1,122 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +/* Function for soft heap */ + +#include + + +namespace faiss { + + +template +void HeapArray::heapify () +{ +#pragma omp parallel for + for (size_t j = 0; j < nh; j++) + heap_heapify (k, val + j * k, ids + j * k); +} + +template +void HeapArray::reorder () +{ +#pragma omp parallel for + for (size_t j = 0; j < nh; j++) + heap_reorder (k, val + j * k, ids + j * k); +} + +template +void HeapArray::addn (size_t nj, const T *vin, TI j0, + size_t i0, int64_t ni) +{ + if (ni == -1) ni = nh; + assert (i0 >= 0 && i0 + ni <= nh); +#pragma omp parallel for + for (size_t i = i0; i < i0 + ni; i++) { + T * __restrict simi = get_val(i); + TI * __restrict idxi = get_ids (i); + const T *ip_line = vin + (i - i0) * nj; + + for (size_t j = 0; j < nj; j++) { + T ip = ip_line [j]; + if (C::cmp(simi[0], ip)) { + heap_pop (k, simi, idxi); + heap_push (k, simi, idxi, ip, j + j0); + } + } + } +} + +template +void HeapArray::addn_with_ids ( + size_t nj, const T *vin, const TI *id_in, + int64_t id_stride, size_t i0, int64_t ni) +{ + if (id_in == nullptr) { + addn (nj, vin, 0, i0, ni); + return; + } + if (ni == -1) ni = nh; + assert (i0 >= 0 && i0 + ni <= nh); +#pragma omp parallel for + for (size_t i = i0; i < i0 + ni; i++) { + T * __restrict simi = get_val(i); + TI * __restrict idxi = get_ids (i); + const T *ip_line = vin + (i - i0) * nj; + const TI *id_line = id_in + (i - i0) * id_stride; + + for (size_t j = 0; j < nj; j++) { + T ip = ip_line [j]; + if (C::cmp(simi[0], ip)) { + heap_pop (k, simi, idxi); + heap_push (k, simi, idxi, ip, id_line [j]); + } + } + } +} + +template +void HeapArray::per_line_extrema ( + T * out_val, + TI * out_ids) const +{ +#pragma omp parallel for + for (size_t j = 0; j < nh; j++) { + int64_t imin = -1; + typename C::T xval = C::Crev::neutral (); + const typename C::T * x_ = val + j * k; + for (size_t i = 0; i < k; i++) + if (C::cmp (x_[i], xval)) { + xval = x_[i]; + imin = i; + } + if (out_val) + out_val[j] = xval; + + if (out_ids) { + if (ids && imin != -1) + out_ids[j] = ids [j * k + imin]; + else + out_ids[j] = imin; + } + } +} + + + + +// explicit instanciations + +template struct HeapArray >; +template struct HeapArray >; +template struct HeapArray >; +template struct HeapArray >; + + +} // END namespace fasis diff --git a/core/src/index/thirdparty/faiss/utils/Heap.h b/core/src/index/thirdparty/faiss/utils/Heap.h new file mode 100644 index 0000000000..e691c36c7f --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/Heap.h @@ -0,0 +1,495 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * C++ support for heaps. The set of functions is tailored for + * efficient similarity search. + * + * There is no specific object for a heap, and the functions that + * operate on a signle heap are inlined, because heaps are often + * small. 
More complex functions are implemented in Heaps.cpp + * + */ + + +#ifndef FAISS_Heap_h +#define FAISS_Heap_h + +#include +#include +#include + +#include +#include +#include + +#include + + +namespace faiss { + +/******************************************************************* + * C object: uniform handling of min and max heap + *******************************************************************/ + +/** The C object gives the type T of the values in the heap, the type + * of the keys, TI and the comparison that is done: > for the minheap + * and < for the maxheap. The neutral value will always be dropped in + * favor of any other value in the heap. + */ + +template +struct CMax; + +// traits of minheaps = heaps where the minimum value is stored on top +// useful to find the *max* values of an array +template +struct CMin { + typedef T_ T; + typedef TI_ TI; + typedef CMax Crev; + inline static bool cmp (T a, T b) { + return a < b; + } + // value that will be popped first -> must be smaller than all others + // for int types this is not strictly the smallest val (-max - 1) + inline static T neutral () { + return -std::numeric_limits::max(); + } +}; + + +template +struct CMax { + typedef T_ T; + typedef TI_ TI; + typedef CMin Crev; + inline static bool cmp (T a, T b) { + return a > b; + } + inline static T neutral () { + return std::numeric_limits::max(); + } +}; + + +/******************************************************************* + * Basic heap ops: push and pop + *******************************************************************/ + +/** Pops the top element from the heap defined by bh_val[0..k-1] and + * bh_ids[0..k-1]. on output the element at k-1 is undefined. + */ +template inline +void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids) +{ + bh_val--; /* Use 1-based indexing for easier node->child translation */ + bh_ids--; + typename C::T val = bh_val[k]; + size_t i = 1, i1, i2; + while (1) { + i1 = i << 1; + i2 = i1 + 1; + if (i1 > k) + break; + if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) { + if (C::cmp(val, bh_val[i1])) + break; + bh_val[i] = bh_val[i1]; + bh_ids[i] = bh_ids[i1]; + i = i1; + } + else { + if (C::cmp(val, bh_val[i2])) + break; + bh_val[i] = bh_val[i2]; + bh_ids[i] = bh_ids[i2]; + i = i2; + } + } + bh_val[i] = bh_val[k]; + bh_ids[i] = bh_ids[k]; +} + + + +/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and + * bh_ids[0..k-2]. on output the element at k-1 is defined. 
+ */ +template inline +void heap_push (size_t k, + typename C::T * bh_val, typename C::TI * bh_ids, + typename C::T val, typename C::TI ids) +{ + bh_val--; /* Use 1-based indexing for easier node->child translation */ + bh_ids--; + size_t i = k, i_father; + while (i > 1) { + i_father = i >> 1; + if (!C::cmp (val, bh_val[i_father])) /* the heap structure is ok */ + break; + bh_val[i] = bh_val[i_father]; + bh_ids[i] = bh_ids[i_father]; + i = i_father; + } + bh_val[i] = val; + bh_ids[i] = ids; +} + + + +/* Partial instanciation for heaps with TI = int64_t */ + +template inline +void minheap_pop (size_t k, T * bh_val, int64_t * bh_ids) +{ + heap_pop > (k, bh_val, bh_ids); +} + + +template inline +void minheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids) +{ + heap_push > (k, bh_val, bh_ids, val, ids); +} + + +template inline +void maxheap_pop (size_t k, T * bh_val, int64_t * bh_ids) +{ + heap_pop > (k, bh_val, bh_ids); +} + + +template inline +void maxheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids) +{ + heap_push > (k, bh_val, bh_ids, val, ids); +} + + + +/******************************************************************* + * Heap initialization + *******************************************************************/ + +/* Initialization phase for the heap (with unconditionnal pushes). + * Store k0 elements in a heap containing up to k values. Note that + * (bh_val, bh_ids) can be the same as (x, ids) */ +template inline +void heap_heapify ( + size_t k, + typename C::T * bh_val, + typename C::TI * bh_ids, + const typename C::T * x = nullptr, + const typename C::TI * ids = nullptr, + size_t k0 = 0) +{ + if (k0 > 0) assert (x); + + if (ids) { + for (size_t i = 0; i < k0; i++) + heap_push (i+1, bh_val, bh_ids, x[i], ids[i]); + } else { + for (size_t i = 0; i < k0; i++) + heap_push (i+1, bh_val, bh_ids, x[i], i); + } + + for (size_t i = k0; i < k; i++) { + bh_val[i] = C::neutral(); + bh_ids[i] = -1; + } + +} + +template inline +void minheap_heapify ( + size_t k, T * bh_val, + int64_t * bh_ids, + const T * x = nullptr, + const int64_t * ids = nullptr, + size_t k0 = 0) +{ + heap_heapify< CMin > (k, bh_val, bh_ids, x, ids, k0); +} + + +template inline +void maxheap_heapify ( + size_t k, + T * bh_val, + int64_t * bh_ids, + const T * x = nullptr, + const int64_t * ids = nullptr, + size_t k0 = 0) +{ + heap_heapify< CMax > (k, bh_val, bh_ids, x, ids, k0); +} + + + +/******************************************************************* + * Add n elements to the heap + *******************************************************************/ + + +/* Add some elements to the heap */ +template inline +void heap_addn (size_t k, + typename C::T * bh_val, typename C::TI * bh_ids, + const typename C::T * x, + const typename C::TI * ids, + size_t n) +{ + size_t i; + if (ids) + for (i = 0; i < n; i++) { + if (C::cmp (bh_val[0], x[i])) { + heap_pop (k, bh_val, bh_ids); + heap_push (k, bh_val, bh_ids, x[i], ids[i]); + } + } + else + for (i = 0; i < n; i++) { + if (C::cmp (bh_val[0], x[i])) { + heap_pop (k, bh_val, bh_ids); + heap_push (k, bh_val, bh_ids, x[i], i); + } + } +} + + +/* Partial instanciation for heaps with TI = int64_t */ + +template inline +void minheap_addn (size_t k, T * bh_val, int64_t * bh_ids, + const T * x, const int64_t * ids, size_t n) +{ + heap_addn > (k, bh_val, bh_ids, x, ids, n); +} + +template inline +void maxheap_addn (size_t k, T * bh_val, int64_t * bh_ids, + const T * x, const int64_t * ids, size_t n) +{ + heap_addn > (k, bh_val, bh_ids, x, ids, n); +} 
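
Usage sketch (not part of this patch): the maxheap_* helpers declared above, together with maxheap_reorder from the finalization section that follows, are the typical way to keep the k smallest values of a stream (e.g. L2 distances), since the CMax convention keeps the largest retained value on top so it is evicted first. The include path <faiss/utils/Heap.h> and the small driver program below are assumptions for illustration, not taken from this diff.

// minimal, self-contained sketch; assumes the vendored headers are on the include path
#include <cstdio>
#include <cstdint>
#include <vector>
#include <faiss/utils/Heap.h>

int main () {
    const size_t k = 5, n = 100;
    std::vector<float> dist (n);
    for (size_t i = 0; i < n; i++)
        dist[i] = float ((i * 37) % 100);      // dummy "distances"

    std::vector<float>   val (k);              // heap values, size k
    std::vector<int64_t> ids (k);              // matching ids, size k

    // max-heap on top -> keeps the k smallest values seen so far
    faiss::maxheap_heapify (k, val.data(), ids.data());

    // push n candidates; ids == nullptr means sequential ids 0..n-1
    faiss::maxheap_addn (k, val.data(), ids.data(), dist.data(), nullptr, n);

    // sort the heap contents from best (smallest) to worst
    faiss::maxheap_reorder (k, val.data(), ids.data());

    for (size_t i = 0; i < k; i++)
        printf ("rank %zu: id %lld  dist %.1f\n",
                i, (long long) ids[i], val[i]);
    return 0;
}

The same pattern with the minheap_* helpers keeps the k largest values instead, which is how the inner-product search paths use these heaps.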
+ + + + + + +/******************************************************************* + * Heap finalization (reorder elements) + *******************************************************************/ + + +/* This function maps a binary heap into an sorted structure. + It returns the number */ +template inline +size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids) +{ + size_t i, ii; + + for (i = 0, ii = 0; i < k; i++) { + /* top element should be put at the end of the list */ + typename C::T val = bh_val[0]; + typename C::TI id = bh_ids[0]; + + /* boundary case: we will over-ride this value if not a true element */ + heap_pop (k-i, bh_val, bh_ids); + bh_val[k-ii-1] = val; + bh_ids[k-ii-1] = id; + if (id != -1) ii++; + } + /* Count the number of elements which are effectively returned */ + size_t nel = ii; + + memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val)); + memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids)); + + for (; ii < k; ii++) { + bh_val[ii] = C::neutral(); + bh_ids[ii] = -1; + } + return nel; +} + +template inline +size_t minheap_reorder (size_t k, T * bh_val, int64_t * bh_ids) +{ + return heap_reorder< CMin > (k, bh_val, bh_ids); +} + +template inline +size_t maxheap_reorder (size_t k, T * bh_val, int64_t * bh_ids) +{ + return heap_reorder< CMax > (k, bh_val, bh_ids); +} + + + + + +/******************************************************************* + * Operations on heap arrays + *******************************************************************/ + +/** a template structure for a set of [min|max]-heaps it is tailored + * so that the actual data of the heaps can just live in compact + * arrays. + */ +template +struct HeapArray { + typedef typename C::TI TI; + typedef typename C::T T; + + size_t nh; ///< number of heaps + size_t k; ///< allocated size per heap + TI * ids; ///< identifiers (size nh * k) + T * val; ///< values (distances or similarities), size nh * k + + /// Return the list of values for a heap + T * get_val (size_t key) { return val + key * k; } + + /// Correspponding identifiers + TI * get_ids (size_t key) { return ids + key * k; } + + /// prepare all the heaps before adding + void heapify (); + + /** add nj elements to heaps i0:i0+ni, with sequential ids + * + * @param nj nb of elements to add to each heap + * @param vin elements to add, size ni * nj + * @param j0 add this to the ids that are added + * @param i0 first heap to update + * @param ni nb of elements to update (-1 = use nh) + */ + void addn (size_t nj, const T *vin, TI j0 = 0, + size_t i0 = 0, int64_t ni = -1); + + /** same as addn + * + * @param id_in ids of the elements to add, size ni * nj + * @param id_stride stride for id_in + */ + void addn_with_ids ( + size_t nj, const T *vin, const TI *id_in = nullptr, + int64_t id_stride = 0, size_t i0 = 0, int64_t ni = -1); + + /// reorder all the heaps + void reorder (); + + /** this is not really a heap function. 
It just finds the per-line + * extrema of each line of array D + * @param vals_out extreme value of each line (size nh, or NULL) + * @param idx_out index of extreme value (size nh or NULL) + */ + void per_line_extrema (T *vals_out, TI *idx_out) const; + +}; + + +/* Define useful heaps */ +typedef HeapArray > float_minheap_array_t; +typedef HeapArray > int_minheap_array_t; + +typedef HeapArray > float_maxheap_array_t; +typedef HeapArray > int_maxheap_array_t; + +// The heap templates are instanciated explicitly in Heap.cpp + + + + + + + + + + + + + + + + + + + +/********************************************************************* + * Indirect heaps: instead of having + * + * node i = (bh_ids[i], bh_val[i]), + * + * in indirect heaps, + * + * node i = (bh_ids[i], bh_val[bh_ids[i]]), + * + *********************************************************************/ + + +template +inline +void indirect_heap_pop ( + size_t k, + const typename C::T * bh_val, + typename C::TI * bh_ids) +{ + bh_ids--; /* Use 1-based indexing for easier node->child translation */ + typename C::T val = bh_val[bh_ids[k]]; + size_t i = 1; + while (1) { + size_t i1 = i << 1; + size_t i2 = i1 + 1; + if (i1 > k) + break; + typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2]; + if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) { + if (C::cmp(val, bh_val[id1])) + break; + bh_ids[i] = id1; + i = i1; + } else { + if (C::cmp(val, bh_val[id2])) + break; + bh_ids[i] = id2; + i = i2; + } + } + bh_ids[i] = bh_ids[k]; +} + + + +template +inline +void indirect_heap_push (size_t k, + const typename C::T * bh_val, typename C::TI * bh_ids, + typename C::TI id) +{ + bh_ids--; /* Use 1-based indexing for easier node->child translation */ + typename C::T val = bh_val[id]; + size_t i = k; + while (i > 1) { + size_t i_father = i >> 1; + if (!C::cmp (val, bh_val[bh_ids[i_father]])) + break; + bh_ids[i] = bh_ids[i_father]; + i = i_father; + } + bh_ids[i] = id; +} + + +} // namespace faiss + +#endif /* FAISS_Heap_h */ diff --git a/core/src/index/thirdparty/faiss/utils/WorkerThread.cpp b/core/src/index/thirdparty/faiss/utils/WorkerThread.cpp new file mode 100644 index 0000000000..83b5c97e47 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/WorkerThread.cpp @@ -0,0 +1,126 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include + +namespace faiss { + +namespace { + +// Captures any exceptions thrown by the lambda and returns them via the promise +void runCallback(std::function& fn, + std::promise& promise) { + try { + fn(); + promise.set_value(true); + } catch (...) 
{ + promise.set_exception(std::current_exception()); + } +} + +} // namespace + +WorkerThread::WorkerThread() : + wantStop_(false) { + startThread(); + + // Make sure that the thread has started before continuing + add([](){}).get(); +} + +WorkerThread::~WorkerThread() { + stop(); + waitForThreadExit(); +} + +void +WorkerThread::startThread() { + thread_ = std::thread([this](){ threadMain(); }); +} + +void +WorkerThread::stop() { + std::lock_guard guard(mutex_); + + wantStop_ = true; + monitor_.notify_one(); +} + +std::future +WorkerThread::add(std::function f) { + std::lock_guard guard(mutex_); + + if (wantStop_) { + // The timer thread has been stopped, or we want to stop; we can't + // schedule anything else + std::promise p; + auto fut = p.get_future(); + + // did not execute + p.set_value(false); + return fut; + } + + auto pr = std::promise(); + auto fut = pr.get_future(); + + queue_.emplace_back(std::make_pair(std::move(f), std::move(pr))); + + // Wake up our thread + monitor_.notify_one(); + return fut; +} + +void +WorkerThread::threadMain() { + threadLoop(); + + // Call all pending tasks + FAISS_ASSERT(wantStop_); + + // flush all pending operations + for (auto& f : queue_) { + runCallback(f.first, f.second); + } +} + +void +WorkerThread::threadLoop() { + while (true) { + std::pair, std::promise> data; + + { + std::unique_lock lock(mutex_); + + while (!wantStop_ && queue_.empty()) { + monitor_.wait(lock); + } + + if (wantStop_) { + return; + } + + data = std::move(queue_.front()); + queue_.pop_front(); + } + + runCallback(data.first, data.second); + } +} + +void +WorkerThread::waitForThreadExit() { + try { + thread_.join(); + } catch (...) { + } +} + +} // namespace diff --git a/core/src/index/thirdparty/faiss/utils/WorkerThread.h b/core/src/index/thirdparty/faiss/utils/WorkerThread.h new file mode 100644 index 0000000000..7ab21e9f90 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/WorkerThread.h @@ -0,0 +1,61 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace faiss { + +class WorkerThread { + public: + WorkerThread(); + + /// Stops and waits for the worker thread to exit, flushing all + /// pending lambdas + ~WorkerThread(); + + /// Request that the worker thread stop itself + void stop(); + + /// Blocking waits in the current thread for the worker thread to + /// stop + void waitForThreadExit(); + + /// Adds a lambda to run on the worker thread; returns a future that + /// can be used to block on its completion. + /// Future status is `true` if the lambda was run in the worker + /// thread; `false` if it was not run, because the worker thread is + /// exiting or has exited. 
+ std::future add(std::function f); + + private: + void startThread(); + void threadMain(); + void threadLoop(); + + /// Thread that all queued lambdas are run on + std::thread thread_; + + /// Mutex for the queue and exit status + std::mutex mutex_; + + /// Monitor for the exit status and the queue + std::condition_variable monitor_; + + /// Whether or not we want the thread to exit + bool wantStop_; + + /// Queue of pending lambdas to call + std::deque, std::promise>> queue_; +}; + +} // namespace diff --git a/core/src/index/thirdparty/faiss/utils/distances.cpp b/core/src/index/thirdparty/faiss/utils/distances.cpp new file mode 100644 index 0000000000..dcbac8824c --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/distances.cpp @@ -0,0 +1,765 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include + +#include +#include + + + +#ifndef FINTEGER +#define FINTEGER long +#endif + + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, FINTEGER * + ldb, float *beta, float *c, FINTEGER *ldc); + +/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */ + +int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda, + float *tau, float *work, FINTEGER *lwork, FINTEGER *info); + +int sgemv_(const char *trans, FINTEGER *m, FINTEGER *n, float *alpha, + const float *a, FINTEGER *lda, const float *x, FINTEGER *incx, + float *beta, float *y, FINTEGER *incy); + +} + + +namespace faiss { + + + +/*************************************************************************** + * Matrix/vector ops + ***************************************************************************/ + + + +/* Compute the inner product between a vector x and + a set of ny vectors y. + These functions are not intended to replace BLAS matrix-matrix, as they + would be significantly less efficient in this case. 
*/ +void fvec_inner_products_ny (float * ip, + const float * x, + const float * y, + size_t d, size_t ny) +{ + // Not sure which one is fastest +#if 0 + { + FINTEGER di = d; + FINTEGER nyi = ny; + float one = 1.0, zero = 0.0; + FINTEGER onei = 1; + sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei); + } +#endif + for (size_t i = 0; i < ny; i++) { + ip[i] = fvec_inner_product (x, y, d); + y += d; + } +} + + + + + +/* Compute the L2 norm of a set of nx vectors */ +void fvec_norms_L2 (float * __restrict nr, + const float * __restrict x, + size_t d, size_t nx) +{ + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d)); + } +} + +void fvec_norms_L2sqr (float * __restrict nr, + const float * __restrict x, + size_t d, size_t nx) +{ +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) + nr[i] = fvec_norm_L2sqr (x + i * d, d); +} + + + +void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x) +{ +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + float * __restrict xi = x + i * d; + + float nr = fvec_norm_L2sqr (xi, d); + + if (nr > 0) { + size_t j; + const float inv_nr = 1.0 / sqrtf (nr); + for (j = 0; j < d; j++) + xi[j] *= inv_nr; + } + } +} + + + + + + + + + + + + +/*************************************************************************** + * KNN functions + ***************************************************************************/ + + + +/* Find the nearest neighbors for nx queries in a set of ny vectors */ +static void knn_inner_product_sse (const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res) +{ + size_t k = res->k; + size_t check_period = InterruptCallback::get_period_hint (ny * d); + + check_period *= omp_get_max_threads(); + + for (size_t i0 = 0; i0 < nx; i0 += check_period) { + size_t i1 = std::min(i0 + check_period, nx); + +#pragma omp parallel for + for (size_t i = i0; i < i1; i++) { + const float * x_i = x + i * d; + const float * y_j = y; + + float * __restrict simi = res->get_val(i); + int64_t * __restrict idxi = res->get_ids (i); + + minheap_heapify (k, simi, idxi); + + for (size_t j = 0; j < ny; j++) { + float ip = fvec_inner_product (x_i, y_j, d); + + if (ip > simi[0]) { + minheap_pop (k, simi, idxi); + minheap_push (k, simi, idxi, ip, j); + } + y_j += d; + } + minheap_reorder (k, simi, idxi); + } + InterruptCallback::check (); + } + +} + +static void knn_L2sqr_sse ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res) +{ + size_t k = res->k; + + size_t check_period = InterruptCallback::get_period_hint (ny * d); + check_period *= omp_get_max_threads(); + + for (size_t i0 = 0; i0 < nx; i0 += check_period) { + size_t i1 = std::min(i0 + check_period, nx); + +#pragma omp parallel for + for (size_t i = i0; i < i1; i++) { + const float * x_i = x + i * d; + const float * y_j = y; + size_t j; + float * simi = res->get_val(i); + int64_t * idxi = res->get_ids (i); + + maxheap_heapify (k, simi, idxi); + for (j = 0; j < ny; j++) { + float disij = fvec_L2sqr (x_i, y_j, d); + + if (disij < simi[0]) { + maxheap_pop (k, simi, idxi); + maxheap_push (k, simi, idxi, disij, j); + } + y_j += d; + } + maxheap_reorder (k, simi, idxi); + } + InterruptCallback::check (); + } + +} + + +/** Find the nearest neighbors for nx queries in a set of ny vectors */ +static void knn_inner_product_blas ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res) +{ + 
res->heapify (); + + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) return; + + /* block sizes */ + const size_t bs_x = 4096, bs_y = 1024; + // const size_t bs_x = 16, bs_y = 16; + std::unique_ptr ip_block(new float[bs_x * bs_y]); + + for (size_t i0 = 0; i0 < nx; i0 += bs_x) { + size_t i1 = i0 + bs_x; + if(i1 > nx) i1 = nx; + + for (size_t j0 = 0; j0 < ny; j0 += bs_y) { + size_t j1 = j0 + bs_y; + if (j1 > ny) j1 = ny; + /* compute the actual dot products */ + { + float one = 1, zero = 0; + FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; + sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one, + y + j0 * d, &di, + x + i0 * d, &di, &zero, + ip_block.get(), &nyi); + } + + /* collect maxima */ + res->addn (j1 - j0, ip_block.get(), j0, i0, i1 - i0); + } + InterruptCallback::check (); + } + res->reorder (); +} + +// distance correction is an operator that can be applied to transform +// the distances +template +static void knn_L2sqr_blas (const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res, + const DistanceCorrection &corr) +{ + res->heapify (); + + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) return; + + size_t k = res->k; + + /* block sizes */ + const size_t bs_x = 4096, bs_y = 1024; + // const size_t bs_x = 16, bs_y = 16; + float *ip_block = new float[bs_x * bs_y]; + float *x_norms = new float[nx]; + float *y_norms = new float[ny]; + ScopeDeleter del1(ip_block), del3(x_norms), del2(y_norms); + + fvec_norms_L2sqr (x_norms, x, d, nx); + fvec_norms_L2sqr (y_norms, y, d, ny); + + + for (size_t i0 = 0; i0 < nx; i0 += bs_x) { + size_t i1 = i0 + bs_x; + if(i1 > nx) i1 = nx; + + for (size_t j0 = 0; j0 < ny; j0 += bs_y) { + size_t j1 = j0 + bs_y; + if (j1 > ny) j1 = ny; + /* compute the actual dot products */ + { + float one = 1, zero = 0; + FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; + sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one, + y + j0 * d, &di, + x + i0 * d, &di, &zero, + ip_block, &nyi); + } + + /* collect minima */ +#pragma omp parallel for + for (size_t i = i0; i < i1; i++) { + float * __restrict simi = res->get_val(i); + int64_t * __restrict idxi = res->get_ids (i); + const float *ip_line = ip_block + (i - i0) * (j1 - j0); + + for (size_t j = j0; j < j1; j++) { + float ip = *ip_line++; + float dis = x_norms[i] + y_norms[j] - 2 * ip; + + // negative values can occur for identical vectors + // due to roundoff errors + if (dis < 0) dis = 0; + + dis = corr (dis, i, j); + + if (dis < simi[0]) { + maxheap_pop (k, simi, idxi); + maxheap_push (k, simi, idxi, dis, j); + } + } + } + } + InterruptCallback::check (); + } + res->reorder (); + +} + + + + + + + + + +/******************************************************* + * KNN driver functions + *******************************************************/ + +int distance_compute_blas_threshold = 20; + +void knn_inner_product (const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res) +{ + if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + knn_inner_product_sse (x, y, d, nx, ny, res); + } else { + knn_inner_product_blas (x, y, d, nx, ny, res); + } +} + + + +struct NopDistanceCorrection { + float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const { + return dis; + } +}; + +void knn_L2sqr (const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res) +{ + if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + knn_L2sqr_sse (x, y, d, nx, ny, res); + } else 
{ + NopDistanceCorrection nop; + knn_L2sqr_blas (x, y, d, nx, ny, res, nop); + } +} + +struct BaseShiftDistanceCorrection { + const float *base_shift; + float operator()(float dis, size_t /*qno*/, size_t bno) const { + return dis - base_shift[bno]; + } +}; + +void knn_L2sqr_base_shift ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res, + const float *base_shift) +{ + BaseShiftDistanceCorrection corr = {base_shift}; + knn_L2sqr_blas (x, y, d, nx, ny, res, corr); +} + + + +/*************************************************************************** + * compute a subset of distances + ***************************************************************************/ + +/* compute the inner product between x and a subset y of ny vectors, + whose indices are given by idy. */ +void fvec_inner_products_by_idx (float * __restrict ip, + const float * x, + const float * y, + const int64_t * __restrict ids, /* for y vecs */ + size_t d, size_t nx, size_t ny) +{ +#pragma omp parallel for + for (size_t j = 0; j < nx; j++) { + const int64_t * __restrict idsj = ids + j * ny; + const float * xj = x + j * d; + float * __restrict ipj = ip + j * ny; + for (size_t i = 0; i < ny; i++) { + if (idsj[i] < 0) + continue; + ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d); + } + } +} + + + +/* compute the inner product between x and a subset y of ny vectors, + whose indices are given by idy. */ +void fvec_L2sqr_by_idx (float * __restrict dis, + const float * x, + const float * y, + const int64_t * __restrict ids, /* ids of y vecs */ + size_t d, size_t nx, size_t ny) +{ +#pragma omp parallel for + for (size_t j = 0; j < nx; j++) { + const int64_t * __restrict idsj = ids + j * ny; + const float * xj = x + j * d; + float * __restrict disj = dis + j * ny; + for (size_t i = 0; i < ny; i++) { + if (idsj[i] < 0) + continue; + disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d); + } + } +} + +void pairwise_indexed_L2sqr ( + size_t d, size_t n, + const float * x, const int64_t *ix, + const float * y, const int64_t *iy, + float *dis) +{ +#pragma omp parallel for + for (size_t j = 0; j < n; j++) { + if (ix[j] >= 0 && iy[j] >= 0) { + dis[j] = fvec_L2sqr (x + d * ix[j], y + d * iy[j], d); + } + } +} + +void pairwise_indexed_inner_product ( + size_t d, size_t n, + const float * x, const int64_t *ix, + const float * y, const int64_t *iy, + float *dis) +{ +#pragma omp parallel for + for (size_t j = 0; j < n; j++) { + if (ix[j] >= 0 && iy[j] >= 0) { + dis[j] = fvec_inner_product (x + d * ix[j], y + d * iy[j], d); + } + } +} + + +/* Find the nearest neighbors for nx queries in a set of ny vectors + indexed by ids. 
May be useful for re-ranking a pre-selected vector list */ +void knn_inner_products_by_idx (const float * x, + const float * y, + const int64_t * ids, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res) +{ + size_t k = res->k; + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + const float * x_ = x + i * d; + const int64_t * idsi = ids + i * ny; + size_t j; + float * __restrict simi = res->get_val(i); + int64_t * __restrict idxi = res->get_ids (i); + minheap_heapify (k, simi, idxi); + + for (j = 0; j < ny; j++) { + if (idsi[j] < 0) break; + float ip = fvec_inner_product (x_, y + d * idsi[j], d); + + if (ip > simi[0]) { + minheap_pop (k, simi, idxi); + minheap_push (k, simi, idxi, ip, idsi[j]); + } + } + minheap_reorder (k, simi, idxi); + } + +} + +void knn_L2sqr_by_idx (const float * x, + const float * y, + const int64_t * __restrict ids, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res) +{ + size_t k = res->k; + +#pragma omp parallel for + for (size_t i = 0; i < nx; i++) { + const float * x_ = x + i * d; + const int64_t * __restrict idsi = ids + i * ny; + float * __restrict simi = res->get_val(i); + int64_t * __restrict idxi = res->get_ids (i); + maxheap_heapify (res->k, simi, idxi); + for (size_t j = 0; j < ny; j++) { + float disij = fvec_L2sqr (x_, y + d * idsi[j], d); + + if (disij < simi[0]) { + maxheap_pop (k, simi, idxi); + maxheap_push (k, simi, idxi, disij, idsi[j]); + } + } + maxheap_reorder (res->k, simi, idxi); + } + +} + + + + + +/*************************************************************************** + * Range search + ***************************************************************************/ + +/** Find the nearest neighbors for nx queries in a set of ny vectors + * compute_l2 = compute pairwise squared L2 distance rather than inner prod + */ +template +static void range_search_blas ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *result) +{ + + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) return; + + /* block sizes */ + const size_t bs_x = 4096, bs_y = 1024; + // const size_t bs_x = 16, bs_y = 16; + float *ip_block = new float[bs_x * bs_y]; + ScopeDeleter del0(ip_block); + + float *x_norms = nullptr, *y_norms = nullptr; + ScopeDeleter del1, del2; + if (compute_l2) { + x_norms = new float[nx]; + del1.set (x_norms); + fvec_norms_L2sqr (x_norms, x, d, nx); + + y_norms = new float[ny]; + del2.set (y_norms); + fvec_norms_L2sqr (y_norms, y, d, ny); + } + + std::vector partial_results; + + for (size_t j0 = 0; j0 < ny; j0 += bs_y) { + size_t j1 = j0 + bs_y; + if (j1 > ny) j1 = ny; + RangeSearchPartialResult * pres = new RangeSearchPartialResult (result); + partial_results.push_back (pres); + + for (size_t i0 = 0; i0 < nx; i0 += bs_x) { + size_t i1 = i0 + bs_x; + if(i1 > nx) i1 = nx; + + /* compute the actual dot products */ + { + float one = 1, zero = 0; + FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; + sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one, + y + j0 * d, &di, + x + i0 * d, &di, &zero, + ip_block, &nyi); + } + + + for (size_t i = i0; i < i1; i++) { + const float *ip_line = ip_block + (i - i0) * (j1 - j0); + + RangeQueryResult & qres = pres->new_result (i); + + for (size_t j = j0; j < j1; j++) { + float ip = *ip_line++; + if (compute_l2) { + float dis = x_norms[i] + y_norms[j] - 2 * ip; + if (dis < radius) { + qres.add (dis, j); + } + } else { + if (ip > radius) { + qres.add (ip, j); + } + } + } + } + } + InterruptCallback::check (); 
+ } + + RangeSearchPartialResult::merge (partial_results); +} + + +template +static void range_search_sse (const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *res) +{ + FAISS_THROW_IF_NOT (d % 4 == 0); + +#pragma omp parallel + { + RangeSearchPartialResult pres (res); + +#pragma omp for + for (size_t i = 0; i < nx; i++) { + const float * x_ = x + i * d; + const float * y_ = y; + size_t j; + + RangeQueryResult & qres = pres.new_result (i); + + for (j = 0; j < ny; j++) { + if (compute_l2) { + float disij = fvec_L2sqr (x_, y_, d); + if (disij < radius) { + qres.add (disij, j); + } + } else { + float ip = fvec_inner_product (x_, y_, d); + if (ip > radius) { + qres.add (ip, j); + } + } + y_ += d; + } + + } + pres.finalize (); + } + + // check just at the end because the use case is typically just + // when the nb of queries is low. + InterruptCallback::check(); +} + + + + + +void range_search_L2sqr ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *res) +{ + + if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + range_search_sse (x, y, d, nx, ny, radius, res); + } else { + range_search_blas (x, y, d, nx, ny, radius, res); + } +} + +void range_search_inner_product ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *res) +{ + + if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + range_search_sse (x, y, d, nx, ny, radius, res); + } else { + range_search_blas (x, y, d, nx, ny, radius, res); + } +} + + +void pairwise_L2sqr (int64_t d, + int64_t nq, const float *xq, + int64_t nb, const float *xb, + float *dis, + int64_t ldq, int64_t ldb, int64_t ldd) +{ + if (nq == 0 || nb == 0) return; + if (ldq == -1) ldq = d; + if (ldb == -1) ldb = d; + if (ldd == -1) ldd = nb; + + // store in beginning of distance matrix to avoid malloc + float *b_norms = dis; + +#pragma omp parallel for + for (int64_t i = 0; i < nb; i++) + b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d); + +#pragma omp parallel for + for (int64_t i = 1; i < nq; i++) { + float q_norm = fvec_norm_L2sqr (xq + i * ldq, d); + for (int64_t j = 0; j < nb; j++) + dis[i * ldd + j] = q_norm + b_norms [j]; + } + + { + float q_norm = fvec_norm_L2sqr (xq, d); + for (int64_t j = 0; j < nb; j++) + dis[j] += q_norm; + } + + { + FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd; + float one = 1.0, minus_2 = -2.0; + + sgemm_ ("Transposed", "Not transposed", + &nbi, &nqi, &di, + &minus_2, + xb, &ldbi, + xq, &ldqi, + &one, dis, &lddi); + } + +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/distances.h b/core/src/index/thirdparty/faiss/utils/distances.h new file mode 100644 index 0000000000..a78a5af80f --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/distances.h @@ -0,0 +1,243 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* All distance functions for L2 and IP distances. 
+ * The actual functions are implemented in distances.cpp and distances_simd.cpp */ + +#pragma once + +#include + +#include + + +namespace faiss { + + /********************************************************* + * Optimized distance/norm/inner prod computations + *********************************************************/ + + +/// Squared L2 distance between two vectors +float fvec_L2sqr ( + const float * x, + const float * y, + size_t d); + +/// inner product +float fvec_inner_product ( + const float * x, + const float * y, + size_t d); + +/// L1 distance +float fvec_L1 ( + const float * x, + const float * y, + size_t d); + +float fvec_Linf ( + const float * x, + const float * y, + size_t d); + + +/** Compute pairwise distances between sets of vectors + * + * @param d dimension of the vectors + * @param nq nb of query vectors + * @param nb nb of database vectors + * @param xq query vectors (size nq * d) + * @param xb database vectros (size nb * d) + * @param dis output distances (size nq * nb) + * @param ldq,ldb, ldd strides for the matrices + */ +void pairwise_L2sqr (int64_t d, + int64_t nq, const float *xq, + int64_t nb, const float *xb, + float *dis, + int64_t ldq = -1, int64_t ldb = -1, int64_t ldd = -1); + +/* compute the inner product between nx vectors x and one y */ +void fvec_inner_products_ny ( + float * ip, /* output inner product */ + const float * x, + const float * y, + size_t d, size_t ny); + +/* compute ny square L2 distance bewteen x and a set of contiguous y vectors */ +void fvec_L2sqr_ny ( + float * dis, + const float * x, + const float * y, + size_t d, size_t ny); + + +/** squared norm of a vector */ +float fvec_norm_L2sqr (const float * x, + size_t d); + +/** compute the L2 norms for a set of vectors + * + * @param ip output norms, size nx + * @param x set of vectors, size nx * d + */ +void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx); + +/// same as fvec_norms_L2, but computes square norms +void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx); + +/* L2-renormalize a set of vector. Nothing done if the vector is 0-normed */ +void fvec_renorm_L2 (size_t d, size_t nx, float * x); + + +/* This function exists because the Torch counterpart is extremly slow + (not multi-threaded + unexpected overhead even in single thread). + It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2 */ +void inner_product_to_L2sqr (float * dis, + const float * nr1, + const float * nr2, + size_t n1, size_t n2); + +/*************************************************************************** + * Compute a subset of distances + ***************************************************************************/ + + /* compute the inner product between x and a subset y of ny vectors, + whose indices are given by idy. 
*/ +void fvec_inner_products_by_idx ( + float * ip, + const float * x, + const float * y, + const int64_t *ids, + size_t d, size_t nx, size_t ny); + +/* same but for a subset in y indexed by idsy (ny vectors in total) */ +void fvec_L2sqr_by_idx ( + float * dis, + const float * x, + const float * y, + const int64_t *ids, /* ids of y vecs */ + size_t d, size_t nx, size_t ny); + + +/** compute dis[j] = L2sqr(x[ix[j]], y[iy[j]]) forall j=0..n-1 + * + * @param x size (max(ix) + 1, d) + * @param y size (max(iy) + 1, d) + * @param ix size n + * @param iy size n + * @param dis size n + */ +void pairwise_indexed_L2sqr ( + size_t d, size_t n, + const float * x, const int64_t *ix, + const float * y, const int64_t *iy, + float *dis); + +/* same for inner product */ +void pairwise_indexed_inner_product ( + size_t d, size_t n, + const float * x, const int64_t *ix, + const float * y, const int64_t *iy, + float *dis); + +/*************************************************************************** + * KNN functions + ***************************************************************************/ + +// threshold on nx above which we switch to BLAS to compute distances +extern int distance_compute_blas_threshold; + +/** Return the k nearest neighors of each of the nx vectors x among the ny + * vector y, w.r.t to max inner product + * + * @param x query vectors, size nx * d + * @param y database vectors, size ny * d + * @param res result array, which also provides k. Sorted on output + */ +void knn_inner_product ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res); + +/** Same as knn_inner_product, for the L2 distance */ +void knn_L2sqr ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res); + + + +/** same as knn_L2sqr, but base_shift[bno] is subtracted to all + * computed distances. + * + * @param base_shift size ny + */ +void knn_L2sqr_base_shift ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res, + const float *base_shift); + +/* Find the nearest neighbors for nx queries in a set of ny vectors + * indexed by ids. 
May be useful for re-ranking a pre-selected vector list + */ +void knn_inner_products_by_idx ( + const float * x, + const float * y, + const int64_t * ids, + size_t d, size_t nx, size_t ny, + float_minheap_array_t * res); + +void knn_L2sqr_by_idx (const float * x, + const float * y, + const int64_t * ids, + size_t d, size_t nx, size_t ny, + float_maxheap_array_t * res); + +/*************************************************************************** + * Range search + ***************************************************************************/ + + + +/// Forward declaration, see AuxIndexStructures.h +struct RangeSearchResult; + +/** Return the k nearest neighors of each of the nx vectors x among the ny + * vector y, w.r.t to max inner product + * + * @param x query vectors, size nx * d + * @param y database vectors, size ny * d + * @param radius search radius around the x vectors + * @param result result structure + */ +void range_search_L2sqr ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *result); + +/// same as range_search_L2sqr for the inner product similarity +void range_search_inner_product ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + float radius, + RangeSearchResult *result); + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/distances_simd.cpp b/core/src/index/thirdparty/faiss/utils/distances_simd.cpp new file mode 100644 index 0000000000..da2bfa7750 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/distances_simd.cpp @@ -0,0 +1,809 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#ifdef __SSE__ +#include +#endif + +#ifdef __aarch64__ +#include +#endif + +#include + +namespace faiss { + +#ifdef __AVX__ +#define USE_AVX +#endif + + +/********************************************************* + * Optimized distance computations + *********************************************************/ + + +/* Functions to compute: + - L2 distance between 2 vectors + - inner product between 2 vectors + - L2 norm of a vector + + The functions should probably not be invoked when a large number of + vectors are be processed in batch (in which case Matrix multiply + is faster), but may be useful for comparing vectors isolated in + memory. + + Works with any vectors of any dimension, even unaligned (in which + case they are slower). 
+ +*/ + + +/********************************************************* + * Reference implementations + */ + + +float fvec_L2sqr_ref (const float * x, + const float * y, + size_t d) +{ + size_t i; + float res = 0; + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; + } + return res; +} + +float fvec_L1_ref (const float * x, + const float * y, + size_t d) +{ + size_t i; + float res = 0; + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += fabs(tmp); + } + return res; +} + +float fvec_Linf_ref (const float * x, + const float * y, + size_t d) +{ + size_t i; + float res = 0; + for (i = 0; i < d; i++) { + res = fmax(res, fabs(x[i] - y[i])); + } + return res; +} + +float fvec_inner_product_ref (const float * x, + const float * y, + size_t d) +{ + size_t i; + float res = 0; + for (i = 0; i < d; i++) + res += x[i] * y[i]; + return res; +} + +float fvec_norm_L2sqr_ref (const float *x, size_t d) +{ + size_t i; + double res = 0; + for (i = 0; i < d; i++) + res += x[i] * x[i]; + return res; +} + + +void fvec_L2sqr_ny_ref (float * dis, + const float * x, + const float * y, + size_t d, size_t ny) +{ + for (size_t i = 0; i < ny; i++) { + dis[i] = fvec_L2sqr (x, y, d); + y += d; + } +} + + + + +/********************************************************* + * SSE and AVX implementations + */ + +#ifdef __SSE__ + +// reads 0 <= d < 4 floats as __m128 +static inline __m128 masked_read (int d, const float *x) +{ + assert (0 <= d && d < 4); + __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0}; + switch (d) { + case 3: + buf[2] = x[2]; + case 2: + buf[1] = x[1]; + case 1: + buf[0] = x[0]; + } + return _mm_load_ps (buf); + // cannot use AVX2 _mm_mask_set1_epi32 +} + +float fvec_norm_L2sqr (const float * x, + size_t d) +{ + __m128 mx; + __m128 msum1 = _mm_setzero_ps(); + + while (d >= 4) { + mx = _mm_loadu_ps (x); x += 4; + msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx)); + d -= 4; + } + + mx = masked_read (d, x); + msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx)); + + msum1 = _mm_hadd_ps (msum1, msum1); + msum1 = _mm_hadd_ps (msum1, msum1); + return _mm_cvtss_f32 (msum1); +} + +namespace { + +float sqr (float x) { + return x * x; +} + + +void fvec_L2sqr_ny_D1 (float * dis, const float * x, + const float * y, size_t ny) +{ + float x0s = x[0]; + __m128 x0 = _mm_set_ps (x0s, x0s, x0s, x0s); + + size_t i; + for (i = 0; i + 3 < ny; i += 4) { + __m128 tmp, accu; + tmp = x0 - _mm_loadu_ps (y); y += 4; + accu = tmp * tmp; + dis[i] = _mm_cvtss_f32 (accu); + tmp = _mm_shuffle_ps (accu, accu, 1); + dis[i + 1] = _mm_cvtss_f32 (tmp); + tmp = _mm_shuffle_ps (accu, accu, 2); + dis[i + 2] = _mm_cvtss_f32 (tmp); + tmp = _mm_shuffle_ps (accu, accu, 3); + dis[i + 3] = _mm_cvtss_f32 (tmp); + } + while (i < ny) { // handle non-multiple-of-4 case + dis[i++] = sqr(x0s - *y++); + } +} + + +void fvec_L2sqr_ny_D2 (float * dis, const float * x, + const float * y, size_t ny) +{ + __m128 x0 = _mm_set_ps (x[1], x[0], x[1], x[0]); + + size_t i; + for (i = 0; i + 1 < ny; i += 2) { + __m128 tmp, accu; + tmp = x0 - _mm_loadu_ps (y); y += 4; + accu = tmp * tmp; + accu = _mm_hadd_ps (accu, accu); + dis[i] = _mm_cvtss_f32 (accu); + accu = _mm_shuffle_ps (accu, accu, 3); + dis[i + 1] = _mm_cvtss_f32 (accu); + } + if (i < ny) { // handle odd case + dis[i] = sqr(x[0] - y[0]) + sqr(x[1] - y[1]); + } +} + + + +void fvec_L2sqr_ny_D4 (float * dis, const float * x, + const float * y, size_t ny) +{ + __m128 x0 = _mm_loadu_ps(x); + + for (size_t i = 0; i < ny; i++) { + __m128 tmp, accu; + tmp = x0 - 
_mm_loadu_ps (y); y += 4; + accu = tmp * tmp; + accu = _mm_hadd_ps (accu, accu); + accu = _mm_hadd_ps (accu, accu); + dis[i] = _mm_cvtss_f32 (accu); + } +} + + +void fvec_L2sqr_ny_D8 (float * dis, const float * x, + const float * y, size_t ny) +{ + __m128 x0 = _mm_loadu_ps(x); + __m128 x1 = _mm_loadu_ps(x + 4); + + for (size_t i = 0; i < ny; i++) { + __m128 tmp, accu; + tmp = x0 - _mm_loadu_ps (y); y += 4; + accu = tmp * tmp; + tmp = x1 - _mm_loadu_ps (y); y += 4; + accu += tmp * tmp; + accu = _mm_hadd_ps (accu, accu); + accu = _mm_hadd_ps (accu, accu); + dis[i] = _mm_cvtss_f32 (accu); + } +} + + +void fvec_L2sqr_ny_D12 (float * dis, const float * x, + const float * y, size_t ny) +{ + __m128 x0 = _mm_loadu_ps(x); + __m128 x1 = _mm_loadu_ps(x + 4); + __m128 x2 = _mm_loadu_ps(x + 8); + + for (size_t i = 0; i < ny; i++) { + __m128 tmp, accu; + tmp = x0 - _mm_loadu_ps (y); y += 4; + accu = tmp * tmp; + tmp = x1 - _mm_loadu_ps (y); y += 4; + accu += tmp * tmp; + tmp = x2 - _mm_loadu_ps (y); y += 4; + accu += tmp * tmp; + accu = _mm_hadd_ps (accu, accu); + accu = _mm_hadd_ps (accu, accu); + dis[i] = _mm_cvtss_f32 (accu); + } +} + + +} // anonymous namespace + +void fvec_L2sqr_ny (float * dis, const float * x, + const float * y, size_t d, size_t ny) { + // optimized for a few special cases + switch(d) { + case 1: + fvec_L2sqr_ny_D1 (dis, x, y, ny); + return; + case 2: + fvec_L2sqr_ny_D2 (dis, x, y, ny); + return; + case 4: + fvec_L2sqr_ny_D4 (dis, x, y, ny); + return; + case 8: + fvec_L2sqr_ny_D8 (dis, x, y, ny); + return; + case 12: + fvec_L2sqr_ny_D12 (dis, x, y, ny); + return; + default: + fvec_L2sqr_ny_ref (dis, x, y, d, ny); + return; + } +} + + + +#endif + +#ifdef USE_AVX + +// reads 0 <= d < 8 floats as __m256 +static inline __m256 masked_read_8 (int d, const float *x) +{ + assert (0 <= d && d < 8); + if (d < 4) { + __m256 res = _mm256_setzero_ps (); + res = _mm256_insertf128_ps (res, masked_read (d, x), 0); + return res; + } else { + __m256 res = _mm256_setzero_ps (); + res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0); + res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1); + return res; + } +} + +float fvec_inner_product (const float * x, + const float * y, + size_t d) +{ + __m256 msum1 = _mm256_setzero_ps(); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my)); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my)); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my)); + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_L2sqr (const float * x, + const float * y, + size_t d) +{ + __m256 msum1 = _mm256_setzero_ps(); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b1 = mx - my; + msum1 += a_m_b1 * a_m_b1; + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b1 = mx - my; + msum2 += a_m_b1 * a_m_b1; + d -= 4; + } + + if (d > 0) { + __m128 mx = 
masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b1 = mx - my; + msum2 += a_m_b1 * a_m_b1; + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_L1 (const float * x, const float * y, size_t d) +{ + __m256 msum1 = _mm256_setzero_ps(); + __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL)); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b = mx - my; + msum1 += _mm256_and_ps(signmask, a_m_b); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b = mx - my; + msum2 += _mm_and_ps(signmask2, a_m_b); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b = mx - my; + msum2 += _mm_and_ps(signmask2, a_m_b); + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_Linf (const float * x, const float * y, size_t d) +{ + __m256 msum1 = _mm256_setzero_ps(); + __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL)); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b = mx - my; + msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b)); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0)); + __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b = mx - my; + msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b = mx - my; + msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); + } + + msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2); + msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1)); + return _mm_cvtss_f32 (msum2); +} + +#elif defined(__SSE__) // But not AVX + +float fvec_L1 (const float * x, const float * y, size_t d) +{ + return fvec_L1_ref (x, y, d); +} + +float fvec_Linf (const float * x, const float * y, size_t d) +{ + return fvec_Linf_ref (x, y, d); +} + + +float fvec_L2sqr (const float * x, + const float * y, + size_t d) +{ + __m128 msum1 = _mm_setzero_ps(); + + while (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b1 = mx - my; + msum1 += a_m_b1 * a_m_b1; + d -= 4; + } + + if (d > 0) { + // add the last 1, 2 or 3 values + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b1 = mx - my; + msum1 += a_m_b1 * a_m_b1; + } + + msum1 = _mm_hadd_ps (msum1, msum1); + msum1 = _mm_hadd_ps (msum1, msum1); + return _mm_cvtss_f32 (msum1); +} + + +float fvec_inner_product (const float * x, + const float * y, + size_t d) +{ + __m128 mx, my; + __m128 msum1 = _mm_setzero_ps(); + + while (d >= 4) { + mx = _mm_loadu_ps (x); x += 4; + my = _mm_loadu_ps (y); y += 4; + msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my)); + d -= 4; + } + + // add the last 1, 2, or 3 values + mx = masked_read (d, x); + my = masked_read (d, y); + __m128 prod = _mm_mul_ps (mx, my); + + msum1 = _mm_add_ps 
(msum1, prod); + + msum1 = _mm_hadd_ps (msum1, msum1); + msum1 = _mm_hadd_ps (msum1, msum1); + return _mm_cvtss_f32 (msum1); +} + +#elif defined(__aarch64__) + + +float fvec_L2sqr (const float * x, + const float * y, + size_t d) +{ + if (d & 3) return fvec_L2sqr_ref (x, y, d); + float32x4_t accu = vdupq_n_f32 (0); + for (size_t i = 0; i < d; i += 4) { + float32x4_t xi = vld1q_f32 (x + i); + float32x4_t yi = vld1q_f32 (y + i); + float32x4_t sq = vsubq_f32 (xi, yi); + accu = vfmaq_f32 (accu, sq, sq); + } + float32x4_t a2 = vpaddq_f32 (accu, accu); + return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1); +} + +float fvec_inner_product (const float * x, + const float * y, + size_t d) +{ + if (d & 3) return fvec_inner_product_ref (x, y, d); + float32x4_t accu = vdupq_n_f32 (0); + for (size_t i = 0; i < d; i += 4) { + float32x4_t xi = vld1q_f32 (x + i); + float32x4_t yi = vld1q_f32 (y + i); + accu = vfmaq_f32 (accu, xi, yi); + } + float32x4_t a2 = vpaddq_f32 (accu, accu); + return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1); +} + +float fvec_norm_L2sqr (const float *x, size_t d) +{ + if (d & 3) return fvec_norm_L2sqr_ref (x, d); + float32x4_t accu = vdupq_n_f32 (0); + for (size_t i = 0; i < d; i += 4) { + float32x4_t xi = vld1q_f32 (x + i); + accu = vfmaq_f32 (accu, xi, xi); + } + float32x4_t a2 = vpaddq_f32 (accu, accu); + return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1); +} + +// not optimized for ARM +void fvec_L2sqr_ny (float * dis, const float * x, + const float * y, size_t d, size_t ny) { + fvec_L2sqr_ny_ref (dis, x, y, d, ny); +} + +float fvec_L1 (const float * x, const float * y, size_t d) +{ + return fvec_L1_ref (x, y, d); +} + +float fvec_Linf (const float * x, const float * y, size_t d) +{ + return fvec_Linf_ref (x, y, d); +} + + +#else +// scalar implementation + +float fvec_L2sqr (const float * x, + const float * y, + size_t d) +{ + return fvec_L2sqr_ref (x, y, d); +} + +float fvec_L1 (const float * x, const float * y, size_t d) +{ + return fvec_L1_ref (x, y, d); +} + +float fvec_Linf (const float * x, const float * y, size_t d) +{ + return fvec_Linf_ref (x, y, d); +} + +float fvec_inner_product (const float * x, + const float * y, + size_t d) +{ + return fvec_inner_product_ref (x, y, d); +} + +float fvec_norm_L2sqr (const float *x, size_t d) +{ + return fvec_norm_L2sqr_ref (x, d); +} + +void fvec_L2sqr_ny (float * dis, const float * x, + const float * y, size_t d, size_t ny) { + fvec_L2sqr_ny_ref (dis, x, y, d, ny); +} + + +#endif + + + + + + + + + + + + + + + + + + + + +/*************************************************************************** + * heavily optimized table computations + ***************************************************************************/ + + +static inline void fvec_madd_ref (size_t n, const float *a, + float bf, const float *b, float *c) { + for (size_t i = 0; i < n; i++) + c[i] = a[i] + bf * b[i]; +} + +#ifdef __SSE__ + +static inline void fvec_madd_sse (size_t n, const float *a, + float bf, const float *b, float *c) { + n >>= 2; + __m128 bf4 = _mm_set_ps1 (bf); + __m128 * a4 = (__m128*)a; + __m128 * b4 = (__m128*)b; + __m128 * c4 = (__m128*)c; + + while (n--) { + *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4)); + b4++; + a4++; + c4++; + } +} + +void fvec_madd (size_t n, const float *a, + float bf, const float *b, float *c) +{ + if ((n & 3) == 0 && + ((((long)a) | ((long)b) | ((long)c)) & 15) == 0) + fvec_madd_sse (n, a, bf, b, c); + else + fvec_madd_ref (n, a, bf, b, c); +} + +#else + +void fvec_madd (size_t n, const float *a, + float 
bf, const float *b, float *c) +{ + fvec_madd_ref (n, a, bf, b, c); +} + +#endif + +static inline int fvec_madd_and_argmin_ref (size_t n, const float *a, + float bf, const float *b, float *c) { + float vmin = 1e20; + int imin = -1; + + for (size_t i = 0; i < n; i++) { + c[i] = a[i] + bf * b[i]; + if (c[i] < vmin) { + vmin = c[i]; + imin = i; + } + } + return imin; +} + +#ifdef __SSE__ + +static inline int fvec_madd_and_argmin_sse ( + size_t n, const float *a, + float bf, const float *b, float *c) { + n >>= 2; + __m128 bf4 = _mm_set_ps1 (bf); + __m128 vmin4 = _mm_set_ps1 (1e20); + __m128i imin4 = _mm_set1_epi32 (-1); + __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0); + __m128i inc4 = _mm_set1_epi32 (4); + __m128 * a4 = (__m128*)a; + __m128 * b4 = (__m128*)b; + __m128 * c4 = (__m128*)c; + + while (n--) { + __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4)); + *c4 = vc4; + __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4); + // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower! + + imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4), + _mm_andnot_si128 (mask, imin4)); + vmin4 = _mm_min_ps (vmin4, vc4); + b4++; + a4++; + c4++; + idx4 = _mm_add_epi32 (idx4, inc4); + } + + // 4 values -> 2 + { + idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2); + __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2); + __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4); + imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4), + _mm_andnot_si128 (mask, imin4)); + vmin4 = _mm_min_ps (vmin4, vc4); + } + // 2 values -> 1 + { + idx4 = _mm_shuffle_epi32 (imin4, 1); + __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1); + __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4); + imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4), + _mm_andnot_si128 (mask, imin4)); + // vmin4 = _mm_min_ps (vmin4, vc4); + } + return _mm_cvtsi128_si32 (imin4); +} + + +int fvec_madd_and_argmin (size_t n, const float *a, + float bf, const float *b, float *c) +{ + if ((n & 3) == 0 && + ((((long)a) | ((long)b) | ((long)c)) & 15) == 0) + return fvec_madd_and_argmin_sse (n, a, bf, b, c); + else + return fvec_madd_and_argmin_ref (n, a, bf, b, c); +} + +#else + +int fvec_madd_and_argmin (size_t n, const float *a, + float bf, const float *b, float *c) +{ + return fvec_madd_and_argmin_ref (n, a, bf, b, c); +} + +#endif + + + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/extra_distances.cpp b/core/src/index/thirdparty/faiss/utils/extra_distances.cpp new file mode 100644 index 0000000000..16b0b34570 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/extra_distances.cpp @@ -0,0 +1,336 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include + + +#include +#include +#include + +namespace faiss { + +/*************************************************************************** + * Distance functions (other than L2 and IP) + ***************************************************************************/ + +struct VectorDistanceL2 { + size_t d; + + float operator () (const float *x, const float *y) const { + return fvec_L2sqr (x, y, d); + } +}; + +struct VectorDistanceL1 { + size_t d; + + float operator () (const float *x, const float *y) const { + return fvec_L1 (x, y, d); + } +}; + +struct VectorDistanceLinf { + size_t d; + + float operator () (const float *x, const float *y) const { + return fvec_Linf (x, y, d); + /* + float vmax = 0; + for (size_t i = 0; i < d; i++) { + float diff = fabs (x[i] - y[i]); + if (diff > vmax) vmax = diff; + } + return vmax;*/ + } +}; + +struct VectorDistanceLp { + size_t d; + const float p; + + float operator () (const float *x, const float *y) const { + float accu = 0; + for (size_t i = 0; i < d; i++) { + float diff = fabs (x[i] - y[i]); + accu += powf (diff, p); + } + return accu; + } +}; + +struct VectorDistanceCanberra { + size_t d; + + float operator () (const float *x, const float *y) const { + float accu = 0; + for (size_t i = 0; i < d; i++) { + float xi = x[i], yi = y[i]; + accu += fabs (xi - yi) / (fabs(xi) + fabs(yi)); + } + return accu; + } +}; + +struct VectorDistanceBrayCurtis { + size_t d; + + float operator () (const float *x, const float *y) const { + float accu_num = 0, accu_den = 0; + for (size_t i = 0; i < d; i++) { + float xi = x[i], yi = y[i]; + accu_num += fabs (xi - yi); + accu_den += fabs (xi + yi); + } + return accu_num / accu_den; + } +}; + +struct VectorDistanceJensenShannon { + size_t d; + + float operator () (const float *x, const float *y) const { + float accu = 0; + + for (size_t i = 0; i < d; i++) { + float xi = x[i], yi = y[i]; + float mi = 0.5 * (xi + yi); + float kl1 = - xi * log(mi / xi); + float kl2 = - yi * log(mi / yi); + accu += kl1 + kl2; + } + return 0.5 * accu; + } +}; + + + + + + + + + + +namespace { + +template +void pairwise_extra_distances_template ( + VD vd, + int64_t nq, const float *xq, + int64_t nb, const float *xb, + float *dis, + int64_t ldq, int64_t ldb, int64_t ldd) +{ + +#pragma omp parallel for if(nq > 10) + for (int64_t i = 0; i < nq; i++) { + const float *xqi = xq + i * ldq; + const float *xbj = xb; + float *disi = dis + ldd * i; + + for (int64_t j = 0; j < nb; j++) { + disi[j] = vd (xqi, xbj); + xbj += ldb; + } + } +} + + +template +void knn_extra_metrics_template ( + VD vd, + const float * x, + const float * y, + size_t nx, size_t ny, + float_maxheap_array_t * res) +{ + size_t k = res->k; + size_t d = vd.d; + size_t check_period = InterruptCallback::get_period_hint (ny * d); + check_period *= omp_get_max_threads(); + + for (size_t i0 = 0; i0 < nx; i0 += check_period) { + size_t i1 = std::min(i0 + check_period, nx); + +#pragma omp parallel for + for (size_t i = i0; i < i1; i++) { + const float * x_i = x + i * d; + const float * y_j = y; + size_t j; + float * simi = res->get_val(i); + int64_t * idxi = res->get_ids (i); + + maxheap_heapify (k, simi, idxi); + for (j = 0; j < ny; j++) { + float disij = vd (x_i, y_j); + + if (disij < simi[0]) { + maxheap_pop (k, simi, idxi); + maxheap_push (k, simi, idxi, disij, j); + } + y_j += d; + } + maxheap_reorder (k, simi, idxi); + } + InterruptCallback::check (); + } + +} + + +template +struct ExtraDistanceComputer : DistanceComputer { + VD vd; + 
Index::idx_t nb; + const float *q; + const float *b; + + float operator () (idx_t i) override { + return vd (q, b + i * vd.d); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return vd (b + j * vd.d, b + i * vd.d); + } + + ExtraDistanceComputer(const VD & vd, const float *xb, + size_t nb, const float *q = nullptr) + : vd(vd), nb(nb), q(q), b(xb) {} + + void set_query(const float *x) override { + q = x; + } +}; + + + + + + + + + + + + + + + + +} // anonymous namespace + +void pairwise_extra_distances ( + int64_t d, + int64_t nq, const float *xq, + int64_t nb, const float *xb, + MetricType mt, float metric_arg, + float *dis, + int64_t ldq, int64_t ldb, int64_t ldd) +{ + if (nq == 0 || nb == 0) return; + if (ldq == -1) ldq = d; + if (ldb == -1) ldb = d; + if (ldd == -1) ldd = nb; + + switch(mt) { +#define HANDLE_VAR(kw) \ + case METRIC_ ## kw: { \ + VectorDistance ## kw vd({(size_t)d}); \ + pairwise_extra_distances_template (vd, nq, xq, nb, xb, \ + dis, ldq, ldb, ldd); \ + break; \ + } + HANDLE_VAR(L2); + HANDLE_VAR(L1); + HANDLE_VAR(Linf); + HANDLE_VAR(Canberra); + HANDLE_VAR(BrayCurtis); + HANDLE_VAR(JensenShannon); +#undef HANDLE_VAR + case METRIC_Lp: { + VectorDistanceLp vd({(size_t)d, metric_arg}); + pairwise_extra_distances_template (vd, nq, xq, nb, xb, + dis, ldq, ldb, ldd); + break; + } + default: + FAISS_THROW_MSG ("metric type not implemented"); + } + +} + +void knn_extra_metrics ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + MetricType mt, float metric_arg, + float_maxheap_array_t * res) +{ + + switch(mt) { +#define HANDLE_VAR(kw) \ + case METRIC_ ## kw: { \ + VectorDistance ## kw vd({(size_t)d}); \ + knn_extra_metrics_template (vd, x, y, nx, ny, res); \ + break; \ + } + HANDLE_VAR(L2); + HANDLE_VAR(L1); + HANDLE_VAR(Linf); + HANDLE_VAR(Canberra); + HANDLE_VAR(BrayCurtis); + HANDLE_VAR(JensenShannon); +#undef HANDLE_VAR + case METRIC_Lp: { + VectorDistanceLp vd({(size_t)d, metric_arg}); + knn_extra_metrics_template (vd, x, y, nx, ny, res); + break; + } + default: + FAISS_THROW_MSG ("metric type not implemented"); + } + +} + +DistanceComputer *get_extra_distance_computer ( + size_t d, + MetricType mt, float metric_arg, + size_t nb, const float *xb) +{ + + switch(mt) { +#define HANDLE_VAR(kw) \ + case METRIC_ ## kw: { \ + VectorDistance ## kw vd({(size_t)d}); \ + return new ExtraDistanceComputer(vd, xb, nb); \ + } + HANDLE_VAR(L2); + HANDLE_VAR(L1); + HANDLE_VAR(Linf); + HANDLE_VAR(Canberra); + HANDLE_VAR(BrayCurtis); + HANDLE_VAR(JensenShannon); +#undef HANDLE_VAR + case METRIC_Lp: { + VectorDistanceLp vd({(size_t)d, metric_arg}); + return new ExtraDistanceComputer (vd, xb, nb); + break; + } + default: + FAISS_THROW_MSG ("metric type not implemented"); + } + +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/extra_distances.h b/core/src/index/thirdparty/faiss/utils/extra_distances.h new file mode 100644 index 0000000000..65b00b0421 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/extra_distances.h @@ -0,0 +1,54 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#ifndef FAISS_distances_h +#define FAISS_distances_h + +/** In this file are the implementations of extra metrics beyond L2 + * and inner product */ + +#include + +#include + +#include + + + +namespace faiss { + + +void pairwise_extra_distances ( + int64_t d, + int64_t nq, const float *xq, + int64_t nb, const float *xb, + MetricType mt, float metric_arg, + float *dis, + int64_t ldq = -1, int64_t ldb = -1, int64_t ldd = -1); + + +void knn_extra_metrics ( + const float * x, + const float * y, + size_t d, size_t nx, size_t ny, + MetricType mt, float metric_arg, + float_maxheap_array_t * res); + + +/** get a DistanceComputer that refers to this type of distance and + * indexes a flat array of size nb */ +DistanceComputer *get_extra_distance_computer ( + size_t d, + MetricType mt, float metric_arg, + size_t nb, const float *xb); + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/utils/hamming-inl.h b/core/src/index/thirdparty/faiss/utils/hamming-inl.h new file mode 100644 index 0000000000..861e1f4308 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/hamming-inl.h @@ -0,0 +1,472 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + + +namespace faiss { + + +inline BitstringWriter::BitstringWriter(uint8_t *code, int code_size): + code (code), code_size (code_size), i(0) +{ + bzero (code, code_size); +} + +inline void BitstringWriter::write(uint64_t x, int nbit) { + assert (code_size * 8 >= nbit + i); + // nb of available bits in i / 8 + int na = 8 - (i & 7); + + if (nbit <= na) { + code[i >> 3] |= x << (i & 7); + i += nbit; + return; + } else { + int j = i >> 3; + code[j++] |= x << (i & 7); + i += nbit; + x >>= na; + while (x != 0) { + code[j++] |= x; + x >>= 8; + } + } +} + + +inline BitstringReader::BitstringReader(const uint8_t *code, int code_size): + code (code), code_size (code_size), i(0) +{} + +inline uint64_t BitstringReader::read(int nbit) { + assert (code_size * 8 >= nbit + i); + // nb of available bits in i / 8 + int na = 8 - (i & 7); + // get available bits in current byte + uint64_t res = code[i >> 3] >> (i & 7); + if (nbit <= na) { + res &= (1 << nbit) - 1; + i += nbit; + return res; + } else { + int ofs = na; + int j = (i >> 3) + 1; + i += nbit; + nbit -= na; + while (nbit > 8) { + res |= ((uint64_t)code[j++]) << ofs; + ofs += 8; + nbit -= 8; // TODO remove nbit + } + uint64_t last_byte = code[j]; + last_byte &= (1 << nbit) - 1; + res |= last_byte << ofs; + return res; + } +} + + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
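+ *
+ * Illustrative usage (editorial sketch, not part of the original source;
+ * `query`, `codes`, `dis` and `nb` are placeholder names): a caller that
+ * knows the code size at compile time picks the matching computer once
+ * and reuses it over the whole database, e.g.
+ *
+ *     HammingComputer16 hc (query, 16);           // one 16-byte query code
+ *     for (size_t j = 0; j < nb; j++)
+ *         dis[j] = hc.hamming (codes + j * 16);   // nb packed 16-byte codes
+ *
+ * so no per-element switch on the code size is needed in the inner loop.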
+ ******************************************************************/ + + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4 () {} + + HammingComputer4 (const uint8_t *a, int code_size) { + set (a, code_size); + } + + void set (const uint8_t *a, int code_size) { + assert (code_size == 4); + a0 = *(uint32_t *)a; + } + + inline int hamming (const uint8_t *b) const { + return popcount64 (*(uint32_t *)b ^ a0); + } + +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8 () {} + + HammingComputer8 (const uint8_t *a, int code_size) { + set (a, code_size); + } + + void set (const uint8_t *a, int code_size) { + assert (code_size == 8); + a0 = *(uint64_t *)a; + } + + inline int hamming (const uint8_t *b) const { + return popcount64 (*(uint64_t *)b ^ a0); + } + +}; + + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16 () {} + + HammingComputer16 (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + assert (code_size == 16); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1); + } + +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20 () {} + + HammingComputer20 (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + assert (code_size == 20); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; a2 = a[2]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) + + popcount64 (*(uint32_t*)(b + 2) ^ a2); + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32 () {} + + HammingComputer32 (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + assert (code_size == 32); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) + + popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3); + } + +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64 () {} + + HammingComputer64 (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + assert (code_size == 64); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; + a4 = a[4]; a5 = a[5]; a6 = a[6]; a7 = a[7]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) + + popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3) + + popcount64 (b[4] ^ a4) + popcount64 (b[5] ^ a5) + + popcount64 (b[6] ^ a6) + popcount64 (b[7] ^ a7); + } + +}; + +// very inefficient... 
+struct HammingComputerDefault { + const uint8_t *a; + int n; + + HammingComputerDefault () {} + + HammingComputerDefault (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + a = a8; + n = code_size; + } + + int hamming (const uint8_t *b8) const { + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64 (a[i] ^ b8[i]); + return accu; + } + +}; + +struct HammingComputerM8 { + const uint64_t *a; + int n; + + HammingComputerM8 () {} + + HammingComputerM8 (const uint8_t *a8, int code_size) { + set (a8, code_size); + } + + void set (const uint8_t *a8, int code_size) { + assert (code_size % 8 == 0); + a = (uint64_t *)a8; + n = code_size / 8; + } + + int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64 (a[i] ^ b[i]); + return accu; + } + +}; + +// even more inefficient! +struct HammingComputerM4 { + const uint32_t *a; + int n; + + HammingComputerM4 () {} + + HammingComputerM4 (const uint8_t *a4, int code_size) { + set (a4, code_size); + } + + void set (const uint8_t *a4, int code_size) { + assert (code_size % 4 == 0); + a = (uint32_t *)a4; + n = code_size / 4; + } + + int hamming (const uint8_t *b8) const { + const uint32_t *b = (uint32_t *)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64 (a[i] ^ b[i]); + return accu; + } + +}; + +/*************************************************************************** + * Equivalence with a template class when code size is known at compile time + **************************************************************************/ + +// default template +template +struct HammingComputer: HammingComputerM8 { + HammingComputer (const uint8_t *a, int code_size): + HammingComputerM8(a, code_size) {} +}; + +#define SPECIALIZED_HC(CODE_SIZE) \ + template<> struct HammingComputer: \ + HammingComputer ## CODE_SIZE { \ + HammingComputer (const uint8_t *a): \ + HammingComputer ## CODE_SIZE(a, CODE_SIZE) {} \ + } + +SPECIALIZED_HC(4); +SPECIALIZED_HC(8); +SPECIALIZED_HC(16); +SPECIALIZED_HC(20); +SPECIALIZED_HC(32); +SPECIALIZED_HC(64); + +#undef SPECIALIZED_HC + + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. 
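+ *
+ * (Editorial note, not part of the original source: generalized_hamming_64
+ * below ORs the bits of every byte of its argument down into that byte's
+ * lowest bit and then popcounts the masked result, so every non-identical
+ * byte of a ^ b contributes exactly 1; e.g. a ^ b = 0x00ff00000000a000
+ * yields 2.)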
+ ***************************************************************************/ + + +inline int generalized_hamming_64 (uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64 (a); +} + + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8 (const uint8_t *a, int code_size) { + assert (code_size == 8); + a0 = *(uint64_t *)a; + } + + inline int hamming (const uint8_t *b) const { + return generalized_hamming_64 (*(uint64_t *)b ^ a0); + } + +}; + + +struct GenHammingComputer16 { + uint64_t a0, a1; + GenHammingComputer16 (const uint8_t *a8, int code_size) { + assert (code_size == 16); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return generalized_hamming_64 (b[0] ^ a0) + + generalized_hamming_64 (b[1] ^ a1); + } + +}; + +struct GenHammingComputer32 { + uint64_t a0, a1, a2, a3; + + GenHammingComputer32 (const uint8_t *a8, int code_size) { + assert (code_size == 32); + const uint64_t *a = (uint64_t *)a8; + a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; + } + + inline int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + return generalized_hamming_64 (b[0] ^ a0) + + generalized_hamming_64 (b[1] ^ a1) + + generalized_hamming_64 (b[2] ^ a2) + + generalized_hamming_64 (b[3] ^ a3); + } + +}; + +struct GenHammingComputerM8 { + const uint64_t *a; + int n; + + GenHammingComputerM8 (const uint8_t *a8, int code_size) { + assert (code_size % 8 == 0); + a = (uint64_t *)a8; + n = code_size / 8; + } + + int hamming (const uint8_t *b8) const { + const uint64_t *b = (uint64_t *)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += generalized_hamming_64 (a[i] ^ b[i]); + return accu; + } + +}; + + +/** generalized Hamming distances (= count number of code bytes that + are the same) */ +void generalized_hammings_knn_hc ( + int_maxheap_array_t * ha, + const uint8_t * a, + const uint8_t * b, + size_t nb, + size_t code_size, + int ordered = true); + + + +/** This class maintains a list of best distances seen so far. + * + * Since the distances are in a limited range (0 to nbit), the + * object maintains one list per possible distance, and fills + * in only the n-first lists, such that the sum of sizes of the + * n lists is below k. + */ +template +struct HCounterState { + int *counters; + int64_t *ids_per_dis; + + HammingComputer hc; + int thres; + int count_lt; + int count_eq; + int k; + + HCounterState(int *counters, int64_t *ids_per_dis, + const uint8_t *x, int d, int k) + : counters(counters), + ids_per_dis(ids_per_dis), + hc(x, d / 8), + thres(d + 1), + count_lt(0), + count_eq(0), + k(k) {} + + void update_counter(const uint8_t *y, size_t j) { + int32_t dis = hc.hamming(y); + + if (dis <= thres) { + if (dis < thres) { + ids_per_dis[dis * k + counters[dis]++] = j; + ++count_lt; + while (count_lt == k && thres > 0) { + --thres; + count_eq = counters[thres]; + count_lt -= count_eq; + } + } else if (count_eq < k) { + ids_per_dis[dis * k + count_eq++] = j; + counters[dis] = count_eq; + } + } + } +}; + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/hamming.cpp b/core/src/index/thirdparty/faiss/utils/hamming.cpp new file mode 100644 index 0000000000..de9e5e85bb --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/hamming.cpp @@ -0,0 +1,792 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * Implementation of Hamming related functions (distances, smallest distance + * selection with regular heap|radix and probabilistic heap|radix. + * + * IMPLEMENTATION NOTES + * Bitvectors are generally assumed to be multiples of 64 bits. + * + * hamdis_t is used for distances because at this time + * it is not clear how we will need to balance + * - flexibility in vector size (unclear more than 2^16 or even 2^8 bitvectors) + * - memory usage + * - cache-misses when dealing with large volumes of data (lower bits is better) + * + * The hamdis_t should optimally be compatibe with one of the Torch Storage + * (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static const size_t BLOCKSIZE_QUERY = 8192; + + +namespace faiss { + +size_t hamming_batch_size = 65536; + +static const uint8_t hamdis_tab_ham_bytes[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 +}; + + +/* Elementary Hamming distance computation: unoptimized */ +template +T hamming (const uint8_t *bs1, + const uint8_t *bs2) +{ + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) + h += (T) hamdis_tab_ham_bytes[bs1[i]^bs2[i]]; + return h; +} + + +/* Hamming distances for multiples of 64 bits */ +template +hamdis_t hamming (const uint64_t * bs1, const uint64_t * bs2) +{ + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) + h += popcount64 (bs1[i] ^ bs2[i]); + return h; +} + + + +/* specialized (optimized) functions */ +template <> +hamdis_t hamming<64> (const uint64_t * pa, const uint64_t * pb) +{ + return popcount64 (pa[0] ^ pb[0]); +} + + +template <> +hamdis_t hamming<128> (const uint64_t *pa, const uint64_t *pb) +{ + return popcount64 (pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + + +template <> +hamdis_t hamming<256> (const uint64_t * pa, const uint64_t * pb) +{ + return popcount64 (pa[0] ^ pb[0]) + + popcount64 (pa[1] ^ pb[1]) + + popcount64 (pa[2] ^ pb[2]) + + popcount64 (pa[3] ^ pb[3]); +} + + +/* Hamming distances for multiple of 64 bits */ +hamdis_t hamming ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t nwords) +{ + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) + h += popcount64 (bs1[i] ^ bs2[i]); + return h; +} + + + +template +void hammings ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t n1, size_t n2, + hamdis_t * dis) + +{ + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t * __restrict bs1_ = bs1 + i * nwords; + 
hamdis_t * __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) + dis_[j] = hamming(bs1_, bs2 + j * nwords); + } +} + + + +void hammings ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t n1, + size_t n2, + size_t nwords, + hamdis_t * __restrict dis) +{ + size_t i, j; + n1 *= nwords; + n2 *= nwords; + for (i = 0; i < n1; i+=nwords) { + const uint64_t * bs1_ = bs1+i; + for (j = 0; j < n2; j+=nwords) + dis[j] = hamming (bs1_, bs2+j, nwords); + } +} + + + + +/* Count number of matches given a max threshold */ +template +void hamming_count_thres ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t * nptr) +{ + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t * bs2_ = bs2; + + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + /* collect the match only if this satisfies the threshold */ + if (hamming (bs1, bs2) <= ht) + posm++; + bs2 += nwords; + } + bs1 += nwords; /* next signature */ + } + *nptr = posm; +} + + +template +void crosshamming_count_thres ( + const uint64_t * dbs, + size_t n, + int ht, + size_t * nptr) +{ + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t * bs1 = dbs; + for (i = 0; i < n; i++) { + const uint64_t * bs2 = bs1 + 2; + for (j = i + 1; j < n; j++) { + /* collect the match only if this satisfies the threshold */ + if (hamming (bs1, bs2) <= ht) + posm++; + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + + +template +size_t match_hamming_thres ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t n1, + size_t n2, + int ht, + int64_t * idx, + hamdis_t * hams) +{ + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + hamdis_t h; + const uint64_t * bs2_ = bs2; + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + /* Here perform the real work of computing the distance */ + h = hamming (bs1, bs2); + + /* collect the match only if this satisfies the threshold */ + if (h <= ht) { + /* Enough space to store another match ? */ + *idx = i; idx++; + *idx = j; idx++; + *hams = h; + hams++; + posm++; + } + bs2+=nwords; /* next signature */ + } + bs1+=nwords; + } + return posm; +} + + +/* Return closest neighbors w.r.t Hamming distance, using a heap. */ +template +static +void hammings_knn_hc ( + int bytes_per_code, + int_maxheap_array_t * ha, + const uint8_t * bs1, + const uint8_t * bs2, + size_t n2, + bool order = true, + bool init_heap = true) +{ + size_t k = ha->k; + if (init_heap) ha->heapify (); + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < n2; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, n2); +#pragma omp parallel for + for (size_t i = 0; i < ha->nh; i++) { + HammingComputer hc (bs1 + i * bytes_per_code, bytes_per_code); + + const uint8_t * bs2_ = bs2 + j0 * bytes_per_code; + hamdis_t dis; + hamdis_t * __restrict bh_val_ = ha->val + i * k; + int64_t * __restrict bh_ids_ = ha->ids + i * k; + size_t j; + for (j = j0; j < j1; j++, bs2_+= bytes_per_code) { + dis = hc.hamming (bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_pop (k, bh_val_, bh_ids_); + faiss::maxheap_push (k, bh_val_, bh_ids_, dis, j); + } + } + } + } + if (order) ha->reorder (); + } + +/* Return closest neighbors w.r.t Hamming distance, using max count. 
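   (Editorial sketch of the approach, not part of the original comment:
   instead of a heap, one id bucket per possible distance value
   0 .. 8*bytes_per_code is kept for each query via HCounterState from
   hamming-inl.h; the running threshold is lowered whenever the buckets
   strictly below it already hold k ids, and the final k results are read
   out bucket by bucket in increasing distance order.)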
*/ +template +static +void hammings_knn_mc ( + int bytes_per_code, + const uint8_t *a, + const uint8_t *b, + size_t na, + size_t nb, + size_t k, + int32_t *distances, + int64_t *labels) +{ + const int nBuckets = bytes_per_code * 8 + 1; + std::vector all_counters(na * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); + + std::vector> cs; + for (size_t i = 0; i < na; ++i) { + cs.push_back(HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + a + i * bytes_per_code, + 8 * bytes_per_code, + k + )); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < nb; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, nb); +#pragma omp parallel for + for (size_t i = 0; i < na; ++i) { + for (size_t j = j0; j < j1; ++j) { + cs[i].update_counter(b + j * bytes_per_code, j); + } + } + } + + for (size_t i = 0; i < na; ++i) { + HCounterState& csi = cs[i]; + + int nres = 0; + for (int b = 0; b < nBuckets && nres < k; b++) { + for (int l = 0; l < csi.counters[b] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b * k + l]; + distances[i * k + nres] = b; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } +} + + + +// works faster than the template version +static +void hammings_knn_hc_1 ( + int_maxheap_array_t * ha, + const uint64_t * bs1, + const uint64_t * bs2, + size_t n2, + bool order = true, + bool init_heap = true) +{ + const size_t nwords = 1; + size_t k = ha->k; + + + if (init_heap) { + ha->heapify (); + } + +#pragma omp parallel for + for (size_t i = 0; i < ha->nh; i++) { + const uint64_t bs1_ = bs1 [i]; + const uint64_t * bs2_ = bs2; + hamdis_t dis; + hamdis_t * bh_val_ = ha->val + i * k; + hamdis_t bh_val_0 = bh_val_[0]; + int64_t * bh_ids_ = ha->ids + i * k; + size_t j; + for (j = 0; j < n2; j++, bs2_+= nwords) { + dis = popcount64 (bs1_ ^ *bs2_); + if (dis < bh_val_0) { + faiss::maxheap_pop (k, bh_val_, bh_ids_); + faiss::maxheap_push (k, bh_val_, bh_ids_, dis, j); + bh_val_0 = bh_val_[0]; + } + } + } + if (order) { + ha->reorder (); + } +} + + + + +/* Functions to maps vectors to bits. Assume proper allocation done beforehand, + meaning that b should be be able to receive as many bits as x may produce. */ + +/* + * dimension 0 corresponds to the least significant bit of b[0], or + * equivalently to the lsb of the first byte that is stored. + */ +void fvec2bitvec (const float * x, uint8_t * b, size_t d) +{ + for (int i = 0; i < d; i += 8) { + uint8_t w = 0; + uint8_t mask = 1; + int nj = i + 8 <= d ? 8 : d - i; + for (int j = 0; j < nj; j++) { + if (x[i + j] >= 0) + w |= mask; + mask <<= 1; + } + *b = w; + b++; + } +} + + + +/* Same but for n vectors. + Ensure that the ouptut b is byte-aligned (pad with 0s). 
*/ +void fvecs2bitvecs (const float * x, uint8_t * b, size_t d, size_t n) +{ + const int64_t ncodes = ((d + 7) / 8); +#pragma omp parallel for if(n > 100000) + for (size_t i = 0; i < n; i++) + fvec2bitvec (x + i * d, b + i * ncodes, d); +} + + + +void bitvecs2fvecs ( + const uint8_t * b, + float * x, + size_t d, + size_t n) { + + const int64_t ncodes = ((d + 7) / 8); +#pragma omp parallel for if(n > 100000) + for (size_t i = 0; i < n; i++) { + binary_to_real (d, b + i * ncodes, x + i * d); + } +} + + +/* Reverse bit (NOT a optimized function, only used for print purpose) */ +static uint64_t uint64_reverse_bits (uint64_t b) +{ + int i; + uint64_t revb = 0; + for (i = 0; i < 64; i++) { + revb <<= 1; + revb |= b & 1; + b >>= 1; + } + return revb; +} + + +/* print the bit vector */ +void bitvec_print (const uint8_t * b, size_t d) +{ + size_t i, j; + for (i = 0; i < d; ) { + uint64_t brev = uint64_reverse_bits (* (uint64_t *) b); + for (j = 0; j < 64 && i < d; j++, i++) { + printf ("%d", (int) (brev & 1)); + brev >>= 1; + } + b += 8; + printf (" "); + } +} + + + + + +/*----------------------------------------*/ +/* Hamming distance computation and k-nn */ + + +#define C64(x) ((uint64_t *)x) + + +/* Compute a set of Hamming distances */ +void hammings ( + const uint8_t * a, + const uint8_t * b, + size_t na, size_t nb, + size_t ncodes, + hamdis_t * __restrict dis) +{ + FAISS_THROW_IF_NOT (ncodes % 8 == 0); + switch (ncodes) { + case 8: + faiss::hammings <64> (C64(a), C64(b), na, nb, dis); return; + case 16: + faiss::hammings <128> (C64(a), C64(b), na, nb, dis); return; + case 32: + faiss::hammings <256> (C64(a), C64(b), na, nb, dis); return; + case 64: + faiss::hammings <512> (C64(a), C64(b), na, nb, dis); return; + default: + faiss::hammings (C64(a), C64(b), na, nb, ncodes * 8, dis); return; + } +} + +void hammings_knn( + int_maxheap_array_t *ha, + const uint8_t *a, + const uint8_t *b, + size_t nb, + size_t ncodes, + int order) +{ + hammings_knn_hc(ha, a, b, nb, ncodes, order); +} +void hammings_knn_hc ( + int_maxheap_array_t * ha, + const uint8_t * a, + const uint8_t * b, + size_t nb, + size_t ncodes, + int order) +{ + switch (ncodes) { + case 4: + hammings_knn_hc + (4, ha, a, b, nb, order, true); + break; + case 8: + hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true); + // hammings_knn_hc + // (8, ha, a, b, nb, order, true); + break; + case 16: + hammings_knn_hc + (16, ha, a, b, nb, order, true); + break; + case 32: + hammings_knn_hc + (32, ha, a, b, nb, order, true); + break; + default: + if(ncodes % 8 == 0) { + hammings_knn_hc + (ncodes, ha, a, b, nb, order, true); + } else { + hammings_knn_hc + (ncodes, ha, a, b, nb, order, true); + + } + } +} + +void hammings_knn_mc( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t *distances, + int64_t *labels) +{ + switch (ncodes) { + case 4: + hammings_knn_mc( + 4, a, b, na, nb, k, distances, labels + ); + break; + case 8: + // TODO(hoss): Write analog to hammings_knn_hc_1 + // hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true); + hammings_knn_mc( + 8, a, b, na, nb, k, distances, labels + ); + break; + case 16: + hammings_knn_mc( + 16, a, b, na, nb, k, distances, labels + ); + break; + case 32: + hammings_knn_mc( + 32, a, b, na, nb, k, distances, labels + ); + break; + default: + if(ncodes % 8 == 0) { + hammings_knn_mc( + ncodes, a, b, na, nb, k, distances, labels + ); + } else { + hammings_knn_mc( + ncodes, a, b, na, nb, k, distances, labels + ); + } + } +} + + + + +/* Count number of 
matches given a max threshold */ +void hamming_count_thres ( + const uint8_t * bs1, + const uint8_t * bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t * nptr) +{ + switch (ncodes) { + case 8: + faiss::hamming_count_thres <64> (C64(bs1), C64(bs2), + n1, n2, ht, nptr); + return; + case 16: + faiss::hamming_count_thres <128> (C64(bs1), C64(bs2), + n1, n2, ht, nptr); + return; + case 32: + faiss::hamming_count_thres <256> (C64(bs1), C64(bs2), + n1, n2, ht, nptr); + return; + case 64: + faiss::hamming_count_thres <512> (C64(bs1), C64(bs2), + n1, n2, ht, nptr); + return; + default: + FAISS_THROW_FMT ("not implemented for %zu bits", ncodes); + } +} + + +/* Count number of cross-matches given a threshold */ +void crosshamming_count_thres ( + const uint8_t * dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t * nptr) +{ + switch (ncodes) { + case 8: + faiss::crosshamming_count_thres <64> (C64(dbs), n, ht, nptr); + return; + case 16: + faiss::crosshamming_count_thres <128> (C64(dbs), n, ht, nptr); + return; + case 32: + faiss::crosshamming_count_thres <256> (C64(dbs), n, ht, nptr); + return; + case 64: + faiss::crosshamming_count_thres <512> (C64(dbs), n, ht, nptr); + return; + default: + FAISS_THROW_FMT ("not implemented for %zu bits", ncodes); + } +} + + +/* Returns all matches given a threshold */ +size_t match_hamming_thres ( + const uint8_t * bs1, + const uint8_t * bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t * idx, + hamdis_t * dis) +{ + switch (ncodes) { + case 8: + return faiss::match_hamming_thres <64> (C64(bs1), C64(bs2), + n1, n2, ht, idx, dis); + case 16: + return faiss::match_hamming_thres <128> (C64(bs1), C64(bs2), + n1, n2, ht, idx, dis); + case 32: + return faiss::match_hamming_thres <256> (C64(bs1), C64(bs2), + n1, n2, ht, idx, dis); + case 64: + return faiss::match_hamming_thres <512> (C64(bs1), C64(bs2), + n1, n2, ht, idx, dis); + default: + FAISS_THROW_FMT ("not implemented for %zu bits", ncodes); + return 0; + } +} + + +#undef C64 + + + +/************************************* + * generalized Hamming distances + ************************************/ + + + +template +static void hamming_dis_inner_loop ( + const uint8_t *ca, + const uint8_t *cb, + size_t nb, + size_t code_size, + int k, + hamdis_t * bh_val_, + int64_t * bh_ids_) +{ + + HammingComputer hc (ca, code_size); + + for (size_t j = 0; j < nb; j++) { + int ndiff = hc.hamming (cb); + cb += code_size; + if (ndiff < bh_val_[0]) { + maxheap_pop (k, bh_val_, bh_ids_); + maxheap_push (k, bh_val_, bh_ids_, ndiff, j); + } + } +} + +void generalized_hammings_knn_hc ( + int_maxheap_array_t * ha, + const uint8_t * a, + const uint8_t * b, + size_t nb, + size_t code_size, + int ordered) +{ + int na = ha->nh; + int k = ha->k; + + if (ordered) + ha->heapify (); + +#pragma omp parallel for + for (int i = 0; i < na; i++) { + const uint8_t *ca = a + i * code_size; + const uint8_t *cb = b; + + hamdis_t * bh_val_ = ha->val + i * k; + int64_t * bh_ids_ = ha->ids + i * k; + + switch (code_size) { + case 8: + hamming_dis_inner_loop + (ca, cb, nb, 8, k, bh_val_, bh_ids_); + break; + case 16: + hamming_dis_inner_loop + (ca, cb, nb, 16, k, bh_val_, bh_ids_); + break; + case 32: + hamming_dis_inner_loop + (ca, cb, nb, 32, k, bh_val_, bh_ids_); + break; + default: + hamming_dis_inner_loop + (ca, cb, nb, code_size, k, bh_val_, bh_ids_); + break; + } + } + + if (ordered) + ha->reorder (); + +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/hamming.h 
b/core/src/index/thirdparty/faiss/utils/hamming.h new file mode 100644 index 0000000000..1ddbd5c010 --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/hamming.h @@ -0,0 +1,220 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * Hamming distances. The binary vector dimensionality should be a + * multiple of 8, as the elementary operations operate on bytes. If + * you need other sizes, just pad with 0s (this is done by function + * fvecs2bitvecs). + * + * User-defined type hamdis_t is used for distances because at this time + * it is still uncler clear how we will need to balance + * - flexibility in vector size (may need 16- or even 8-bit vectors) + * - memory usage + * - cache-misses when dealing with large volumes of data (fewer bits is better) + * + */ + +#ifndef FAISS_hamming_h +#define FAISS_hamming_h + + +#include + +#include + + +/* The Hamming distance type */ +typedef int32_t hamdis_t; + +namespace faiss { + +/************************************************** + * General bit vector functions + **************************************************/ + + +void bitvec_print (const uint8_t * b, size_t d); + + +/* Functions for casting vectors of regular types to compact bits. + They assume proper allocation done beforehand, meaning that b + should be be able to receive as many bits as x may produce. */ + +/* Makes an array of bits from the signs of a float array. The length + of the output array b is rounded up to byte size (allocate + accordingly) */ +void fvecs2bitvecs ( + const float * x, + uint8_t * b, + size_t d, + size_t n); + +void bitvecs2fvecs ( + const uint8_t * b, + float * x, + size_t d, + size_t n); + + +void fvec2bitvec (const float * x, uint8_t * b, size_t d); + +/*********************************************** + * Generic reader/writer for bit strings + ***********************************************/ + + +struct BitstringWriter { + uint8_t *code; + size_t code_size; + size_t i; // current bit offset + + // code_size in bytes + BitstringWriter(uint8_t *code, int code_size); + + // write the nbit low bits of x + void write(uint64_t x, int nbit); +}; + +struct BitstringReader { + const uint8_t *code; + size_t code_size; + size_t i; + + // code_size in bytes + BitstringReader(const uint8_t *code, int code_size); + + // read nbit bits from the code + uint64_t read(int nbit); +}; + +/************************************************** + * Hamming distance computation functions + **************************************************/ + + + +extern size_t hamming_batch_size; + +inline int popcount64(uint64_t x) { + return __builtin_popcountl(x); +} + + +/** Compute a set of Hamming distances between na and nb binary vectors + * + * @param a size na * nbytespercode + * @param b size nb * nbytespercode + * @param nbytespercode should be multiple of 8 + * @param dis output distances, size na * nb + */ +void hammings ( + const uint8_t * a, + const uint8_t * b, + size_t na, size_t nb, + size_t nbytespercode, + hamdis_t * dis); + + + + +/** Return the k smallest Hamming distances for a set of binary query vectors, + * using a max heap. 
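+ *
+ * Illustrative call (editorial sketch, not part of the original header;
+ * `nq`, `k`, `queries`, `database`, `nb` and `code_size` are placeholder
+ * names, with `distances` an int buffer and `labels` an int64_t buffer,
+ * both of size nq * k):
+ *
+ *     faiss::int_maxheap_array_t ha;
+ *     ha.nh = nq; ha.k = k; ha.ids = labels; ha.val = distances;
+ *     faiss::hammings_knn_hc (&ha, queries, database, nb, code_size, 1);
+ *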
+ * @param a queries, size ha->nh * ncodes + * @param b database, size nb * ncodes + * @param nb number of database vectors + * @param ncodes size of the binary codes (bytes) + * @param ordered if != 0: order the results by decreasing distance + * (may be bottleneck for k/n > 0.01) */ +void hammings_knn_hc ( + int_maxheap_array_t * ha, + const uint8_t * a, + const uint8_t * b, + size_t nb, + size_t ncodes, + int ordered); + +/* Legacy alias to hammings_knn_hc. */ +void hammings_knn ( + int_maxheap_array_t * ha, + const uint8_t * a, + const uint8_t * b, + size_t nb, + size_t ncodes, + int ordered); + +/** Return the k smallest Hamming distances for a set of binary query vectors, + * using counting max. + * @param a queries, size na * ncodes + * @param b database, size nb * ncodes + * @param na number of query vectors + * @param nb number of database vectors + * @param k number of vectors/distances to return + * @param ncodes size of the binary codes (bytes) + * @param distances output distances from each query vector to its k nearest + * neighbors + * @param labels output ids of the k nearest neighbors to each query vector + */ +void hammings_knn_mc ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t *distances, + int64_t *labels); + +/* Counting the number of matches or of cross-matches (without returning them) + For use with function that assume pre-allocated memory */ +void hamming_count_thres ( + const uint8_t * bs1, + const uint8_t * bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t * nptr); + +/* Return all Hamming distances/index passing a thres. Pre-allocation of output + is required. Use hamming_count_thres to determine the proper size. */ +size_t match_hamming_thres ( + const uint8_t * bs1, + const uint8_t * bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t * idx, + hamdis_t * dis); + +/* Cross-matching in a set of vectors */ +void crosshamming_count_thres ( + const uint8_t * dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t * nptr); + + +/* compute the Hamming distances between two codewords of nwords*64 bits */ +hamdis_t hamming ( + const uint64_t * bs1, + const uint64_t * bs2, + size_t nwords); + + + +} // namespace faiss + +// inlined definitions of HammingComputerXX and GenHammingComputerXX + +#include + +#endif /* FAISS_hamming_h */ diff --git a/core/src/index/thirdparty/faiss/utils/random.cpp b/core/src/index/thirdparty/faiss/utils/random.cpp new file mode 100644 index 0000000000..7f50e0eb1c --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/random.cpp @@ -0,0 +1,192 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +namespace faiss { + +/************************************************** + * Random data generation functions + **************************************************/ + +RandomGenerator::RandomGenerator (int64_t seed) + : mt((unsigned int)seed) {} + +int RandomGenerator::rand_int () +{ + return mt() & 0x7fffffff; +} + +int64_t RandomGenerator::rand_int64 () +{ + return int64_t(rand_int()) | int64_t(rand_int()) << 31; +} + +int RandomGenerator::rand_int (int max) +{ + return mt() % max; +} + +float RandomGenerator::rand_float () +{ + return mt() / float(mt.max()); +} + +double RandomGenerator::rand_double () +{ + return mt() / double(mt.max()); +} + + +/*********************************************************************** + * Random functions in this C file only exist because Torch + * counterparts are slow and not multi-threaded. Typical use is for + * more than 1-100 billion values. */ + + +/* Generate a set of random floating point values such that x[i] in [0,1] + multi-threading. For this reason, we rely on re-entreant functions. */ +void float_rand (float * x, size_t n, int64_t seed) +{ + // only try to parallelize on large enough arrays + const size_t nblock = n < 1024 ? 1 : 1024; + + RandomGenerator rng0 (seed); + int a0 = rng0.rand_int (), b0 = rng0.rand_int (); + +#pragma omp parallel for + for (size_t j = 0; j < nblock; j++) { + + RandomGenerator rng (a0 + j * b0); + + const size_t istart = j * n / nblock; + const size_t iend = (j + 1) * n / nblock; + + for (size_t i = istart; i < iend; i++) + x[i] = rng.rand_float (); + } +} + + +void float_randn (float * x, size_t n, int64_t seed) +{ + // only try to parallelize on large enough arrays + const size_t nblock = n < 1024 ? 1 : 1024; + + RandomGenerator rng0 (seed); + int a0 = rng0.rand_int (), b0 = rng0.rand_int (); + +#pragma omp parallel for + for (size_t j = 0; j < nblock; j++) { + RandomGenerator rng (a0 + j * b0); + + double a = 0, b = 0, s = 0; + int state = 0; /* generate two number per "do-while" loop */ + + const size_t istart = j * n / nblock; + const size_t iend = (j + 1) * n / nblock; + + for (size_t i = istart; i < iend; i++) { + /* Marsaglia's method (see Knuth) */ + if (state == 0) { + do { + a = 2.0 * rng.rand_double () - 1; + b = 2.0 * rng.rand_double () - 1; + s = a * a + b * b; + } while (s >= 1.0); + x[i] = a * sqrt(-2.0 * log(s) / s); + } + else + x[i] = b * sqrt(-2.0 * log(s) / s); + state = 1 - state; + } + } +} + + +/* Integer versions */ +void int64_rand (int64_t * x, size_t n, int64_t seed) +{ + // only try to parallelize on large enough arrays + const size_t nblock = n < 1024 ? 1 : 1024; + + RandomGenerator rng0 (seed); + int a0 = rng0.rand_int (), b0 = rng0.rand_int (); + +#pragma omp parallel for + for (size_t j = 0; j < nblock; j++) { + + RandomGenerator rng (a0 + j * b0); + + const size_t istart = j * n / nblock; + const size_t iend = (j + 1) * n / nblock; + for (size_t i = istart; i < iend; i++) + x[i] = rng.rand_int64 (); + } +} + +void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed) +{ + // only try to parallelize on large enough arrays + const size_t nblock = n < 1024 ? 
1 : 1024; + + RandomGenerator rng0 (seed); + int a0 = rng0.rand_int (), b0 = rng0.rand_int (); + +#pragma omp parallel for + for (size_t j = 0; j < nblock; j++) { + + RandomGenerator rng (a0 + j * b0); + + const size_t istart = j * n / nblock; + const size_t iend = (j + 1) * n / nblock; + for (size_t i = istart; i < iend; i++) + x[i] = rng.rand_int64 () % max; + } +} + + +void rand_perm (int *perm, size_t n, int64_t seed) +{ + for (size_t i = 0; i < n; i++) perm[i] = i; + + RandomGenerator rng (seed); + + for (size_t i = 0; i + 1 < n; i++) { + int i2 = i + rng.rand_int (n - i); + std::swap(perm[i], perm[i2]); + } +} + + + + +void byte_rand (uint8_t * x, size_t n, int64_t seed) +{ + // only try to parallelize on large enough arrays + const size_t nblock = n < 1024 ? 1 : 1024; + + RandomGenerator rng0 (seed); + int a0 = rng0.rand_int (), b0 = rng0.rand_int (); + +#pragma omp parallel for + for (size_t j = 0; j < nblock; j++) { + + RandomGenerator rng (a0 + j * b0); + + const size_t istart = j * n / nblock; + const size_t iend = (j + 1) * n / nblock; + + size_t i; + for (i = istart; i < iend; i++) + x[i] = rng.rand_int64 (); + } +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/random.h b/core/src/index/thirdparty/faiss/utils/random.h new file mode 100644 index 0000000000..e94ac068cf --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/random.h @@ -0,0 +1,60 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* Random generators. Implemented here for speed and to make + * sequences reproducible. + */ + +#pragma once + +#include +#include + + +namespace faiss { + +/************************************************** + * Random data generation functions + **************************************************/ + +/// random generator that can be used in multithreaded contexts +struct RandomGenerator { + + std::mt19937 mt; + + /// random positive integer + int rand_int (); + + /// random int64_t + int64_t rand_int64 (); + + /// generate random integer between 0 and max-1 + int rand_int (int max); + + /// between 0 and 1 + float rand_float (); + + double rand_double (); + + explicit RandomGenerator (int64_t seed = 1234); +}; + +/* Generate an array of uniform random floats / multi-threaded implementation */ +void float_rand (float * x, size_t n, int64_t seed); +void float_randn (float * x, size_t n, int64_t seed); +void int64_rand (int64_t * x, size_t n, int64_t seed); +void byte_rand (uint8_t * x, size_t n, int64_t seed); +// max is actually the maximum value + 1 +void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed); + +/* random permutation */ +void rand_perm (int * perm, size_t n, int64_t seed); + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/utils.cpp b/core/src/index/thirdparty/faiss/utils/utils.cpp new file mode 100644 index 0000000000..ad9791c6aa --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/utils.cpp @@ -0,0 +1,783 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + + + +#ifndef FINTEGER +#define FINTEGER long +#endif + + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER * + n, FINTEGER *k, const float *alpha, const float *a, + FINTEGER *lda, const float *b, FINTEGER * + ldb, float *beta, float *c, FINTEGER *ldc); + +/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */ + +int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda, + float *tau, float *work, FINTEGER *lwork, FINTEGER *info); + +int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a, + FINTEGER *lda, float *tau, float *work, + FINTEGER *lwork, FINTEGER *info); + +int sgemv_(const char *trans, FINTEGER *m, FINTEGER *n, float *alpha, + const float *a, FINTEGER *lda, const float *x, FINTEGER *incx, + float *beta, float *y, FINTEGER *incy); + +} + + +/************************************************** + * Get some stats about the system + **************************************************/ + +namespace faiss { + +double getmillisecs () { + struct timeval tv; + gettimeofday (&tv, nullptr); + return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3; +} + +uint64_t get_cycles () { +#ifdef __x86_64__ + uint32_t high, low; + asm volatile("rdtsc \n\t" + : "=a" (low), + "=d" (high)); + return ((uint64_t)high << 32) | (low); +#else + return 0; +#endif +} + + +#ifdef __linux__ + +size_t get_mem_usage_kb () +{ + int pid = getpid (); + char fname[256]; + snprintf (fname, 256, "/proc/%d/status", pid); + FILE * f = fopen (fname, "r"); + FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file"); + size_t sz = 0; + for (;;) { + char buf [256]; + if (!fgets (buf, 256, f)) break; + if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break; + } + fclose (f); + return sz; +} + +#elif __APPLE__ + +size_t get_mem_usage_kb () +{ + fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n"); + return 0; +} + +#endif + + + + + +void reflection (const float * __restrict u, + float * __restrict x, + size_t n, size_t d, size_t nu) +{ + size_t i, j, l; + for (i = 0; i < n; i++) { + const float * up = u; + for (l = 0; l < nu; l++) { + float ip1 = 0, ip2 = 0; + + for (j = 0; j < d; j+=2) { + ip1 += up[j] * x[j]; + ip2 += up[j+1] * x[j+1]; + } + float ip = 2 * (ip1 + ip2); + + for (j = 0; j < d; j++) + x[j] -= ip * up[j]; + up += d; + } + x += d; + } +} + + +/* Reference implementation (slower) */ +void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu) +{ + size_t i, j, l; + for (i = 0; i < n; i++) { + const float * up = u; + for (l = 0; l < nu; l++) { + double ip = 0; + + for (j = 0; j < d; j++) + ip += up[j] * x[j]; + ip *= 2; + + for (j = 0; j < d; j++) + x[j] -= ip * up[j]; + + up += d; + } + x += d; + } +} + + + + + + +/*************************************************************************** + * Some matrix manipulation functions + ***************************************************************************/ + + +/* This function exists because the Torch counterpart is extremly slow + (not multi-threaded + unexpected overhead even in single thread). 
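+   (Editorial note, not part of the original comment: on entry `dis` holds
+   the n1 x n2 matrix of inner products and nr1 / nr2 hold the squared
+   norms of the two vector sets; the loop below rewrites `dis` in place
+   into squared L2 distances.)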
+ It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2 */ +void inner_product_to_L2sqr (float * __restrict dis, + const float * nr1, + const float * nr2, + size_t n1, size_t n2) +{ + +#pragma omp parallel for + for (size_t j = 0 ; j < n1 ; j++) { + float * disj = dis + j * n2; + for (size_t i = 0 ; i < n2 ; i++) + disj[i] = nr1[j] + nr2[i] - 2 * disj[i]; + } +} + + +void matrix_qr (int m, int n, float *a) +{ + FAISS_THROW_IF_NOT (m >= n); + FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni; + std::vector tau (ki); + FINTEGER lwork = -1, info; + float work_size; + + sgeqrf_ (&mi, &ni, a, &mi, tau.data(), + &work_size, &lwork, &info); + lwork = size_t(work_size); + std::vector work (lwork); + + sgeqrf_ (&mi, &ni, a, &mi, + tau.data(), work.data(), &lwork, &info); + + sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(), + work.data(), &lwork, &info); + +} + + +/*************************************************************************** + * Kmeans subroutine + ***************************************************************************/ + +// a bit above machine epsilon for float16 + +#define EPS (1 / 1024.) + +/* For k-means, compute centroids given assignment of vectors to centroids */ +int km_update_centroids (const float * x, + float * centroids, + int64_t * assign, + size_t d, size_t k, size_t n, + size_t k_frozen) +{ + k -= k_frozen; + centroids += k_frozen * d; + + std::vector hassign(k); + memset (centroids, 0, sizeof(*centroids) * d * k); + +#pragma omp parallel + { + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + // this thread is taking care of centroids c0:c1 + size_t c0 = (k * rank) / nt; + size_t c1 = (k * (rank + 1)) / nt; + const float *xi = x; + size_t nacc = 0; + + for (size_t i = 0; i < n; i++) { + int64_t ci = assign[i]; + assert (ci >= 0 && ci < k + k_frozen); + ci -= k_frozen; + if (ci >= c0 && ci < c1) { + float * c = centroids + ci * d; + hassign[ci]++; + for (size_t j = 0; j < d; j++) + c[j] += xi[j]; + nacc++; + } + xi += d; + } + + } + +#pragma omp parallel for + for (size_t ci = 0; ci < k; ci++) { + float * c = centroids + ci * d; + float ni = (float) hassign[ci]; + if (ni != 0) { + for (size_t j = 0; j < d; j++) + c[j] /= ni; + } + } + + /* Take care of void clusters */ + size_t nsplit = 0; + RandomGenerator rng (1234); + for (size_t ci = 0; ci < k; ci++) { + if (hassign[ci] == 0) { /* need to redefine a centroid */ + size_t cj; + for (cj = 0; 1; cj = (cj + 1) % k) { + /* probability to pick this cluster for split */ + float p = (hassign[cj] - 1.0) / (float) (n - k); + float r = rng.rand_float (); + if (r < p) { + break; /* found our cluster to be split */ + } + } + memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d); + + /* small symmetric pertubation. 
Much better than */ + for (size_t j = 0; j < d; j++) { + if (j % 2 == 0) { + centroids[ci * d + j] *= 1 + EPS; + centroids[cj * d + j] *= 1 - EPS; + } else { + centroids[ci * d + j] *= 1 - EPS; + centroids[cj * d + j] *= 1 + EPS; + } + } + + /* assume even split of the cluster */ + hassign[ci] = hassign[cj] / 2; + hassign[cj] -= hassign[ci]; + nsplit++; + } + } + + return nsplit; +} + +#undef EPS + + + +/*************************************************************************** + * Result list routines + ***************************************************************************/ + + +void ranklist_handle_ties (int k, int64_t *idx, const float *dis) +{ + float prev_dis = -1e38; + int prev_i = -1; + for (int i = 0; i < k; i++) { + if (dis[i] != prev_dis) { + if (i > prev_i + 1) { + // sort between prev_i and i - 1 + std::sort (idx + prev_i, idx + i); + } + prev_i = i; + prev_dis = dis[i]; + } + } +} + +size_t merge_result_table_with (size_t n, size_t k, + int64_t *I0, float *D0, + const int64_t *I1, const float *D1, + bool keep_min, + int64_t translation) +{ + size_t n1 = 0; + +#pragma omp parallel reduction(+:n1) + { + std::vector tmpI (k); + std::vector tmpD (k); + +#pragma omp for + for (size_t i = 0; i < n; i++) { + int64_t *lI0 = I0 + i * k; + float *lD0 = D0 + i * k; + const int64_t *lI1 = I1 + i * k; + const float *lD1 = D1 + i * k; + size_t r0 = 0; + size_t r1 = 0; + + if (keep_min) { + for (size_t j = 0; j < k; j++) { + + if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) { + tmpD[j] = lD0[r0]; + tmpI[j] = lI0[r0]; + r0++; + } else if (lD1[r1] >= 0) { + tmpD[j] = lD1[r1]; + tmpI[j] = lI1[r1] + translation; + r1++; + } else { // both are NaNs + tmpD[j] = NAN; + tmpI[j] = -1; + } + } + } else { + for (size_t j = 0; j < k; j++) { + if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) { + tmpD[j] = lD0[r0]; + tmpI[j] = lI0[r0]; + r0++; + } else if (lD1[r1] >= 0) { + tmpD[j] = lD1[r1]; + tmpI[j] = lI1[r1] + translation; + r1++; + } else { // both are NaNs + tmpD[j] = NAN; + tmpI[j] = -1; + } + } + } + n1 += r1; + memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k); + memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k); + } + } + + return n1; +} + + + +size_t ranklist_intersection_size (size_t k1, const int64_t *v1, + size_t k2, const int64_t *v2_in) +{ + if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1); + int64_t *v2 = new int64_t [k2]; + memcpy (v2, v2_in, sizeof (int64_t) * k2); + std::sort (v2, v2 + k2); + { // de-dup v2 + int64_t prev = -1; + size_t wp = 0; + for (size_t i = 0; i < k2; i++) { + if (v2 [i] != prev) { + v2[wp++] = prev = v2 [i]; + } + } + k2 = wp; + } + const int64_t seen_flag = 1L << 60; + size_t count = 0; + for (size_t i = 0; i < k1; i++) { + int64_t q = v1 [i]; + size_t i0 = 0, i1 = k2; + while (i0 + 1 < i1) { + size_t imed = (i1 + i0) / 2; + int64_t piv = v2 [imed] & ~seen_flag; + if (piv <= q) i0 = imed; + else i1 = imed; + } + if (v2 [i0] == q) { + count++; + v2 [i0] |= seen_flag; + } + } + delete [] v2; + + return count; +} + +double imbalance_factor (int k, const int *hist) { + double tot = 0, uf = 0; + + for (int i = 0 ; i < k ; i++) { + tot += hist[i]; + uf += hist[i] * (double) hist[i]; + } + uf = uf * k / (tot * tot); + + return uf; +} + + +double imbalance_factor (int n, int k, const int64_t *assign) { + std::vector hist(k, 0); + for (int i = 0; i < n; i++) { + hist[assign[i]]++; + } + + return imbalance_factor (k, hist.data()); +} + + + +int ivec_hist (size_t n, const int * v, int vmax, int *hist) { + memset (hist, 0, sizeof(hist[0]) * vmax); + int nout = 0; + while 
(n--) { + if (v[n] < 0 || v[n] >= vmax) nout++; + else hist[v[n]]++; + } + return nout; +} + + +void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist) +{ + FAISS_THROW_IF_NOT (nbits % 8 == 0); + size_t d = nbits / 8; + std::vector accu(d * 256); + const uint8_t *c = codes; + for (size_t i = 0; i < n; i++) + for(int j = 0; j < d; j++) + accu[j * 256 + *c++]++; + memset (hist, 0, sizeof(*hist) * nbits); + for (int i = 0; i < d; i++) { + const int *ai = accu.data() + i * 256; + int * hi = hist + i * 8; + for (int j = 0; j < 256; j++) + for (int k = 0; k < 8; k++) + if ((j >> k) & 1) + hi[k] += ai[j]; + } + +} + + + +size_t ivec_checksum (size_t n, const int *a) +{ + size_t cs = 112909; + while (n--) cs = cs * 65713 + a[n] * 1686049; + return cs; +} + + +namespace { + struct ArgsortComparator { + const float *vals; + bool operator() (const size_t a, const size_t b) const { + return vals[a] < vals[b]; + } + }; + + struct SegmentS { + size_t i0; // begin pointer in the permutation array + size_t i1; // end + size_t len() const { + return i1 - i0; + } + }; + + // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge + // extended to > 1 merge thread + + // merges 2 ranges that should be consecutive on the source into + // the union of the two on the destination + template + void parallel_merge (const T *src, T *dst, + SegmentS &s1, SegmentS & s2, int nt, + const ArgsortComparator & comp) { + if (s2.len() > s1.len()) { // make sure that s1 larger than s2 + std::swap(s1, s2); + } + + // compute sub-ranges for each thread + SegmentS s1s[nt], s2s[nt], sws[nt]; + s2s[0].i0 = s2.i0; + s2s[nt - 1].i1 = s2.i1; + + // not sure parallel actually helps here +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + s1s[t].i0 = s1.i0 + s1.len() * t / nt; + s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; + + if (t + 1 < nt) { + T pivot = src[s1s[t].i1]; + size_t i0 = s2.i0, i1 = s2.i1; + while (i0 + 1 < i1) { + size_t imed = (i1 + i0) / 2; + if (comp (pivot, src[imed])) {i1 = imed; } + else {i0 = imed; } + } + s2s[t].i1 = s2s[t + 1].i0 = i1; + } + } + s1.i0 = std::min(s1.i0, s2.i0); + s1.i1 = std::max(s1.i1, s2.i1); + s2 = s1; + sws[0].i0 = s1.i0; + for (int t = 0; t < nt; t++) { + sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len(); + if (t + 1 < nt) { + sws[t + 1].i0 = sws[t].i1; + } + } + assert(sws[nt - 1].i1 == s1.i1); + + // do the actual merging +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + SegmentS sw = sws[t]; + SegmentS s1t = s1s[t]; + SegmentS s2t = s2s[t]; + if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) { + for (;;) { + // assert (sw.len() == s1t.len() + s2t.len()); + if (comp(src[s1t.i0], src[s2t.i0])) { + dst[sw.i0++] = src[s1t.i0++]; + if (s1t.i0 == s1t.i1) break; + } else { + dst[sw.i0++] = src[s2t.i0++]; + if (s2t.i0 == s2t.i1) break; + } + } + } + if (s1t.len() > 0) { + assert(s1t.len() == sw.len()); + memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0])); + } else if (s2t.len() > 0) { + assert(s2t.len() == sw.len()); + memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0])); + } + } + } + +}; + +void fvec_argsort (size_t n, const float *vals, + size_t *perm) +{ + for (size_t i = 0; i < n; i++) perm[i] = i; + ArgsortComparator comp = {vals}; + std::sort (perm, perm + n, comp); +} + +void fvec_argsort_parallel (size_t n, const float *vals, + size_t *perm) +{ + size_t * perm2 = new size_t[n]; + // 2 result tables, during merging, flip between them + size_t *permB = perm2, *permA = perm; + + int nt = 
omp_get_max_threads(); + { // prepare correct permutation so that the result ends in perm + // at final iteration + int nseg = nt; + while (nseg > 1) { + nseg = (nseg + 1) / 2; + std::swap (permA, permB); + } + } + +#pragma omp parallel + for (size_t i = 0; i < n; i++) permA[i] = i; + + ArgsortComparator comp = {vals}; + + SegmentS segs[nt]; + + // independent sorts +#pragma omp parallel for + for (int t = 0; t < nt; t++) { + size_t i0 = t * n / nt; + size_t i1 = (t + 1) * n / nt; + SegmentS seg = {i0, i1}; + std::sort (permA + seg.i0, permA + seg.i1, comp); + segs[t] = seg; + } + int prev_nested = omp_get_nested(); + omp_set_nested(1); + + int nseg = nt; + while (nseg > 1) { + int nseg1 = (nseg + 1) / 2; + int sub_nt = nseg % 2 == 0 ? nt : nt - 1; + int sub_nseg1 = nseg / 2; + +#pragma omp parallel for num_threads(nseg1) + for (int s = 0; s < nseg; s += 2) { + if (s + 1 == nseg) { // otherwise isolated segment + memcpy(permB + segs[s].i0, permA + segs[s].i0, + segs[s].len() * sizeof(size_t)); + } else { + int t0 = s * sub_nt / sub_nseg1; + int t1 = (s + 1) * sub_nt / sub_nseg1; + printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); + parallel_merge(permA, permB, segs[s], segs[s + 1], + t1 - t0, comp); + } + } + for (int s = 0; s < nseg; s += 2) + segs[s / 2] = segs[s]; + nseg = nseg1; + std::swap (permA, permB); + } + assert (permA == perm); + omp_set_nested(prev_nested); + delete [] perm2; +} + + + + + + + + + + + + + + + + + + +const float *fvecs_maybe_subsample ( + size_t d, size_t *n, size_t nmax, const float *x, + bool verbose, int64_t seed) +{ + + if (*n <= nmax) return x; // nothing to do + + size_t n2 = nmax; + if (verbose) { + printf (" Input training set too big (max size is %ld), sampling " + "%ld / %ld vectors\n", nmax, n2, *n); + } + std::vector subset (*n); + rand_perm (subset.data (), *n, seed); + float *x_subset = new float[n2 * d]; + for (int64_t i = 0; i < n2; i++) + memcpy (&x_subset[i * d], + &x[subset[i] * size_t(d)], + sizeof (x[0]) * d); + *n = n2; + return x_subset; +} + + +void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) { + for (size_t i = 0; i < d; ++i) { + x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1; + } +} + +void real_to_binary(size_t d, const float *x_in, uint8_t *x_out) { + for (size_t i = 0; i < d / 8; ++i) { + uint8_t b = 0; + for (int j = 0; j < 8; ++j) { + if (x_in[8 * i + j] > 0) { + b |= (1 << j); + } + } + x_out[i] = b; + } +} + + +// from Python's stringobject.c +uint64_t hash_bytes (const uint8_t *bytes, int64_t n) { + const uint8_t *p = bytes; + uint64_t x = (uint64_t)(*p) << 7; + int64_t len = n; + while (--len >= 0) { + x = (1000003*x) ^ *p++; + } + x ^= n; + return x; +} + + +bool check_openmp() { + omp_set_num_threads(10); + + if (omp_get_max_threads() != 10) { + return false; + } + + std::vector nt_per_thread(10); + size_t sum = 0; + bool in_parallel = true; +#pragma omp parallel reduction(+: sum) + { + if (!omp_in_parallel()) { + in_parallel = false; + } + + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + + nt_per_thread[rank] = nt; +#pragma omp for + for(int i = 0; i < 1000 * 1000 * 10; i++) { + sum += i; + } + } + + if (!in_parallel) { + return false; + } + if (nt_per_thread[0] != 10) { + return false; + } + if (sum == 0) { + return false; + } + + return true; +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/utils.h b/core/src/index/thirdparty/faiss/utils/utils.h new file mode 100644 index 0000000000..bba0fce000 --- /dev/null +++ 
b/core/src/index/thirdparty/faiss/utils/utils.h @@ -0,0 +1,181 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +/* + * A few utilitary functions for similarity search: + * - optimized exhaustive distance and knn search functions + * - some functions reimplemented from torch for speed + */ + +#ifndef FAISS_utils_h +#define FAISS_utils_h + +#include + +#include + + +namespace faiss { + + +/************************************************** + * Get some stats about the system +**************************************************/ + + +/// ms elapsed since some arbitrary epoch +double getmillisecs (); + +/// get current RSS usage in kB +size_t get_mem_usage_kb (); + + +uint64_t get_cycles (); + +/*************************************************************************** + * Misc matrix and vector manipulation functions + ***************************************************************************/ + + +/** compute c := a + bf * b for a, b and c tables + * + * @param n size of the tables + * @param a size n + * @param b size n + * @param c restult table, size n + */ +void fvec_madd (size_t n, const float *a, + float bf, const float *b, float *c); + + +/** same as fvec_madd, also return index of the min of the result table + * @return index of the min of table c + */ +int fvec_madd_and_argmin (size_t n, const float *a, + float bf, const float *b, float *c); + + +/* perform a reflection (not an efficient implementation, just for test ) */ +void reflection (const float * u, float * x, size_t n, size_t d, size_t nu); + + +/** For k-means: update stage. + * + * @param x training vectors, size n * d + * @param centroids centroid vectors, size k * d + * @param assign nearest centroid for each training vector, size n + * @param k_frozen do not update the k_frozen first centroids + * @return nb of spliting operations to fight empty clusters + */ +int km_update_centroids ( + const float * x, + float * centroids, + int64_t * assign, + size_t d, size_t k, size_t n, + size_t k_frozen); + +/** compute the Q of the QR decomposition for m > n + * @param a size n * m: input matrix and output Q + */ +void matrix_qr (int m, int n, float *a); + +/** distances are supposed to be sorted. 
Sorts indices with same distance*/ +void ranklist_handle_ties (int k, int64_t *idx, const float *dis); + +/** count the number of comon elements between v1 and v2 + * algorithm = sorting + bissection to avoid double-counting duplicates + */ +size_t ranklist_intersection_size (size_t k1, const int64_t *v1, + size_t k2, const int64_t *v2); + +/** merge a result table into another one + * + * @param I0, D0 first result table, size (n, k) + * @param I1, D1 second result table, size (n, k) + * @param keep_min if true, keep min values, otherwise keep max + * @param translation add this value to all I1's indexes + * @return nb of values that were taken from the second table + */ +size_t merge_result_table_with (size_t n, size_t k, + int64_t *I0, float *D0, + const int64_t *I1, const float *D1, + bool keep_min = true, + int64_t translation = 0); + + +/// a balanced assignment has a IF of 1 +double imbalance_factor (int n, int k, const int64_t *assign); + +/// same, takes a histogram as input +double imbalance_factor (int k, const int *hist); + + +void fvec_argsort (size_t n, const float *vals, + size_t *perm); + +void fvec_argsort_parallel (size_t n, const float *vals, + size_t *perm); + + +/// compute histogram on v +int ivec_hist (size_t n, const int * v, int vmax, int *hist); + +/** Compute histogram of bits on a code array + * + * @param codes size(n, nbits / 8) + * @param hist size(nbits): nb of 1s in the array of codes + */ +void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist); + + +/// compute a checksum on a table. +size_t ivec_checksum (size_t n, const int *a); + + +/** random subsamples a set of vectors if there are too many of them + * + * @param d dimension of the vectors + * @param n on input: nb of input vectors, output: nb of output vectors + * @param nmax max nb of vectors to keep + * @param x input array, size *n-by-d + * @param seed random seed to use for sampling + * @return x or an array allocated with new [] with *n vectors + */ +const float *fvecs_maybe_subsample ( + size_t d, size_t *n, size_t nmax, const float *x, + bool verbose = false, int64_t seed = 1234); + +/** Convert binary vector to +1/-1 valued float vector. + * + * @param d dimension of the vector (multiple of 8) + * @param x_in input binary vector (uint8_t table of size d / 8) + * @param x_out output float vector (float table of size d) + */ +void binary_to_real(size_t d, const uint8_t *x_in, float *x_out); + +/** Convert float vector to binary vector. Components > 0 are converted to 1, + * others to 0. + * + * @param d dimension of the vector (multiple of 8) + * @param x_in input float vector (float table of size d) + * @param x_out output binary vector (uint8_t table of size d / 8) + */ +void real_to_binary(size_t d, const float *x_in, uint8_t *x_out); + + +/** A reasonable hashing function */ +uint64_t hash_bytes (const uint8_t *bytes, int64_t n); + +/** Whether OpenMP annotations were respected. 
*/ +bool check_openmp(); + +} // namspace faiss + + +#endif /* FAISS_utils_h */ diff --git a/core/src/index/thirdparty/faiss_cache_check_lists.txt b/core/src/index/thirdparty/faiss_cache_check_lists.txt deleted file mode 100644 index ce75614ca5..0000000000 --- a/core/src/index/thirdparty/faiss_cache_check_lists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# source -src/ -include/ - -# third party -thirdparty/ - -# cmake -cmake/ -CMakeLists.txt - -# script -build.sh \ No newline at end of file diff --git a/core/src/index/unittest/faiss_benchmark/faiss_benchmark_test.cpp b/core/src/index/unittest/faiss_benchmark/faiss_benchmark_test.cpp index 0d49004621..3c9b3a375c 100644 --- a/core/src/index/unittest/faiss_benchmark/faiss_benchmark_test.cpp +++ b/core/src/index/unittest/faiss_benchmark/faiss_benchmark_test.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#define USE_FAISS_V0_2_1 0 +#define USE_FAISS_V_0_3_0 #include @@ -35,16 +35,20 @@ #include #include -#if USE_FAISS_V0_2_1 +#ifdef USE_FAISS_V_0_3_0 // faiss_0.3.0 + +#include +#include +#include + +#else // faiss_0.2.1 + #include #include #include #include #include -#else -#include -#include -#include + #endif #ifdef CUSTOMIZATION @@ -206,15 +210,12 @@ GetResultHitCount(const faiss::Index::idx_t* ground_index, const faiss::Index::i size_t min_k = std::min(ground_k, k); int hit = 0; for (int i = 0; i < nq; i++) { - // count the num of results exist in ground truth result set - // each result replicates INDEX_ADD_LOOPS times - for (int j_c = 0; j_c < k; j_c++) { - int r_c = index[i * k + j_c]; - for (int j_g = 0; j_g < min_k / index_add_loops; j_g++) { - if (ground_index[i * ground_k + j_g] == r_c) { - hit++; - continue; - } + std::set ground(ground_index + i * ground_k, + ground_index + i * ground_k + min_k / index_add_loops); + for (int j = 0; j < min_k; j++) { + faiss::Index::idx_t id = index[i * k + j]; + if (ground.count(id) > 0) { + hit++; } } } diff --git a/core/src/index/unittest/test_idmap.cpp b/core/src/index/unittest/test_idmap.cpp index 98a554199f..1ed750d9a8 100644 --- a/core/src/index/unittest/test_idmap.cpp +++ b/core/src/index/unittest/test_idmap.cpp @@ -139,9 +139,9 @@ TEST_F(IDMAPTest, copy_test) { { // clone - auto clone_index = index_->Clone(); - auto clone_result = clone_index->Search(query_dataset, conf); - AssertAnns(clone_result, nq, k); + // auto clone_index = index_->Clone(); + // auto clone_result = clone_index->Search(query_dataset, conf); + // AssertAnns(clone_result, nq, k); } { @@ -159,9 +159,9 @@ TEST_F(IDMAPTest, copy_test) { auto new_result = clone_index->Search(query_dataset, conf); AssertAnns(new_result, nq, k); - auto clone_gpu_idx = clone_index->Clone(); - auto clone_gpu_res = clone_gpu_idx->Search(query_dataset, conf); - AssertAnns(clone_gpu_res, nq, k); + // auto clone_gpu_idx = clone_index->Clone(); + // auto clone_gpu_res = clone_gpu_idx->Search(query_dataset, conf); + // AssertAnns(clone_gpu_res, nq, k); // gpu to cpu auto host_index = knowhere::cloner::CopyGpuToCpu(clone_index, conf); diff --git a/core/src/index/unittest/test_ivf.cpp b/core/src/index/unittest/test_ivf.cpp index 79ef1aa170..7438d5656a 100644 --- a/core/src/index/unittest/test_ivf.cpp +++ b/core/src/index/unittest/test_ivf.cpp @@ -62,6 +62,7 @@ class IVFTest : public DataGen, public TestWithParam<::std::tupleDump(); } void diff --git a/core/src/index/unittest/test_nsg/test_nsg.cpp b/core/src/index/unittest/test_nsg/test_nsg.cpp index a5eac12b2a..450f5a7723 100644 --- 
a/core/src/index/unittest/test_nsg/test_nsg.cpp +++ b/core/src/index/unittest/test_nsg/test_nsg.cpp @@ -44,7 +44,8 @@ class NSGInterfaceTest : public DataGen, public ::testing::Test { SetUp() override { // Init_with_default(); #ifdef MILVUS_GPU_VERSION - knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, 1024 * 1024 * 200, 1024 * 1024 * 600, 2); + int64_t MB = 1024 * 1024; + knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, MB * 200, MB * 600, 1); #endif Generate(256, 1000000 / 100, 1); index_ = std::make_shared(); @@ -59,11 +60,13 @@ class NSGInterfaceTest : public DataGen, public ::testing::Test { tmp_conf->candidate_pool_size = 100; tmp_conf->metric_type = knowhere::METRICTYPE::L2; train_conf = tmp_conf; + train_conf->Dump(); auto tmp2_conf = std::make_shared(); tmp2_conf->k = k; tmp2_conf->search_length = 30; search_conf = tmp2_conf; + search_conf->Dump(); } void @@ -94,7 +97,7 @@ TEST_F(NSGInterfaceTest, basic_test) { ASSERT_EQ(index_->Count(), nb); ASSERT_EQ(index_->Dimension(), dim); - ASSERT_THROW({ index_->Clone(); }, knowhere::KnowhereException); + // ASSERT_THROW({ index_->Clone(); }, knowhere::KnowhereException); ASSERT_NO_THROW({ index_->Add(base_dataset, knowhere::Config()); index_->Seal(); diff --git a/core/src/main.cpp b/core/src/main.cpp index 5c97a061d2..670a992d29 100644 --- a/core/src/main.cpp +++ b/core/src/main.cpp @@ -58,10 +58,10 @@ print_banner() { << "OpenBLAS" #endif << " library." << std::endl; -#ifdef MILVUS_CPU_VERSION - std::cout << "You are using Milvus CPU edition" << std::endl; -#else +#ifdef MILVUS_GPU_VERSION std::cout << "You are using Milvus GPU edition" << std::endl; +#else + std::cout << "You are using Milvus CPU edition" << std::endl; #endif std::cout << std::endl; } diff --git a/core/src/scheduler/SchedInst.h b/core/src/scheduler/SchedInst.h index 1e8a7acf2e..6cca377033 100644 --- a/core/src/scheduler/SchedInst.h +++ b/core/src/scheduler/SchedInst.h @@ -25,6 +25,7 @@ #include "optimizer/BuildIndexPass.h" #include "optimizer/FaissFlatPass.h" #include "optimizer/FaissIVFFlatPass.h" +#include "optimizer/FaissIVFPQPass.h" #include "optimizer/FaissIVFSQ8HPass.h" #include "optimizer/FaissIVFSQ8Pass.h" #include "optimizer/FallbackPass.h" @@ -129,7 +130,10 @@ class OptimizerInst { pass_list.push_back(std::make_shared()); pass_list.push_back(std::make_shared()); pass_list.push_back(std::make_shared()); +#ifdef CUSTOMIZATION pass_list.push_back(std::make_shared()); +#endif + pass_list.push_back(std::make_shared()); } #endif pass_list.push_back(std::make_shared()); diff --git a/core/src/scheduler/optimizer/BuildIndexPass.cpp b/core/src/scheduler/optimizer/BuildIndexPass.cpp index 67036f9d7b..770cfd333c 100644 --- a/core/src/scheduler/optimizer/BuildIndexPass.cpp +++ b/core/src/scheduler/optimizer/BuildIndexPass.cpp @@ -19,19 +19,17 @@ #include "scheduler/SchedInst.h" #include "scheduler/Utils.h" #include "scheduler/tasklabel/SpecResLabel.h" - +#ifdef MILVUS_GPU_VERSION namespace milvus { namespace scheduler { void BuildIndexPass::Init() { -#ifdef MILVUS_GPU_VERSION server::Config& config = server::Config::GetInstance(); Status s = config.GetGpuResourceConfigBuildIndexResources(build_gpu_ids_); if (!s.ok()) { throw; } -#endif } bool @@ -56,3 +54,4 @@ BuildIndexPass::Run(const TaskPtr& task) { } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/BuildIndexPass.h b/core/src/scheduler/optimizer/BuildIndexPass.h index 3adf1259a7..8be1a2f3cf 100644 --- 
a/core/src/scheduler/optimizer/BuildIndexPass.h +++ b/core/src/scheduler/optimizer/BuildIndexPass.h @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#ifdef MILVUS_GPU_VERSION #pragma once #include @@ -52,3 +53,4 @@ using BuildIndexPassPtr = std::shared_ptr; } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissFlatPass.cpp b/core/src/scheduler/optimizer/FaissFlatPass.cpp index 53256f7790..f7377d22ef 100644 --- a/core/src/scheduler/optimizer/FaissFlatPass.cpp +++ b/core/src/scheduler/optimizer/FaissFlatPass.cpp @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - +#ifdef MILVUS_GPU_VERSION #include "scheduler/optimizer/FaissFlatPass.h" #include "cache/GpuCacheMgr.h" #include "scheduler/SchedInst.h" @@ -29,7 +29,6 @@ namespace scheduler { void FaissFlatPass::Init() { -#ifdef MILVUS_GPU_VERSION server::Config& config = server::Config::GetInstance(); Status s = config.GetEngineConfigGpuSearchThreshold(threshold_); if (!s.ok()) { @@ -39,7 +38,6 @@ FaissFlatPass::Init() { if (!s.ok()) { throw; } -#endif } bool @@ -62,7 +60,7 @@ FaissFlatPass::Run(const TaskPtr& task) { auto best_device_id = count_ % gpus.size(); SERVER_LOG_DEBUG << "FaissFlatPass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; count_++; - res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id); + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); } auto label = std::make_shared(res_ptr); task->label() = label; @@ -71,3 +69,4 @@ FaissFlatPass::Run(const TaskPtr& task) { } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissFlatPass.h b/core/src/scheduler/optimizer/FaissFlatPass.h index f219bebdf3..a6d7b7320a 100644 --- a/core/src/scheduler/optimizer/FaissFlatPass.h +++ b/core/src/scheduler/optimizer/FaissFlatPass.h @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#ifdef MILVUS_GPU_VERSION #pragma once #include @@ -54,3 +55,4 @@ using FaissFlatPassPtr = std::shared_ptr; } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFFlatPass.cpp b/core/src/scheduler/optimizer/FaissIVFFlatPass.cpp index 8ad51fb14a..b8fe778432 100644 --- a/core/src/scheduler/optimizer/FaissIVFFlatPass.cpp +++ b/core/src/scheduler/optimizer/FaissIVFFlatPass.cpp @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
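
To make the dispatch rule in FaissFlatPass::Run above (and repeated in the IVF passes that follow) easier to follow, here is a minimal standalone sketch, not part of the patch: queries below gpu_search_threshold stay on CPU, larger batches round-robin over the GPU ids read from gpu_resource_config.search_resources, and the counter indexes into that list rather than being used as a device id directly, which is the point of the gpus[best_device_id] change. PickSearchDevice and the sample values are illustrative only.

```cpp
// Sketch only, not part of the patch. `gpus` mirrors the ids configured in
// gpu_resource_config.search_resources; `counter` persists between calls,
// playing the role of the pass member count_.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns -1 for "search on CPU", otherwise the GPU device id to use.
int64_t PickSearchDevice(int64_t nq, int64_t gpu_search_threshold,
                         const std::vector<int64_t>& gpus, int64_t& counter) {
    if (nq < gpu_search_threshold || gpus.empty()) {
        return -1;  // small batch (or no GPU configured): CPU search
    }
    int64_t best_device_id = counter % static_cast<int64_t>(gpus.size());
    ++counter;
    return gpus[best_device_id];  // index into the configured id list
}

int main() {
    std::vector<int64_t> gpus = {0, 2};  // hypothetical search_resources: gpu0, gpu2
    int64_t counter = 0;
    int64_t batches[] = {10, 2000, 2000, 2000};
    for (int64_t nq : batches) {
        std::printf("nq=%lld -> device %lld\n", static_cast<long long>(nq),
                    static_cast<long long>(PickSearchDevice(nq, 1000, gpus, counter)));
    }
    return 0;
}
```

With two GPUs configured, large batches land on devices 0, 2, 0, ... instead of on whatever raw counter value happened to come up.
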
- +#ifdef MILVUS_GPU_VERSION #include "scheduler/optimizer/FaissIVFFlatPass.h" #include "cache/GpuCacheMgr.h" #include "scheduler/SchedInst.h" @@ -63,7 +63,7 @@ FaissIVFFlatPass::Run(const TaskPtr& task) { SERVER_LOG_DEBUG << "FaissIVFFlatPass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; count_++; - res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id); + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); } auto label = std::make_shared(res_ptr); task->label() = label; @@ -72,3 +72,4 @@ FaissIVFFlatPass::Run(const TaskPtr& task) { } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFFlatPass.h b/core/src/scheduler/optimizer/FaissIVFFlatPass.h index 2d15539014..7a8fca8ef8 100644 --- a/core/src/scheduler/optimizer/FaissIVFFlatPass.h +++ b/core/src/scheduler/optimizer/FaissIVFFlatPass.h @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#ifdef MILVUS_GPU_VERSION #pragma once #include @@ -54,3 +55,4 @@ using FaissIVFFlatPassPtr = std::shared_ptr; } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFPQPass.cpp b/core/src/scheduler/optimizer/FaissIVFPQPass.cpp new file mode 100644 index 0000000000..f97fec63b4 --- /dev/null +++ b/core/src/scheduler/optimizer/FaissIVFPQPass.cpp @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
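
The passes fill their gpus vector from GetGpuResourceConfigSearchResources, whose entries are written as gpu0, gpu1, ... in the gpu_resource_config sections used elsewhere in this patch. As a rough illustration of that mapping (the real parsing lives in the server config code and may differ), a hypothetical helper could convert the gpux strings into numeric device ids:

```cpp
// Illustrative helper only; ParseGpuResources is not a Milvus function.
// "gpu0" -> 0, "gpu2" -> 2; anything not in the gpux format is rejected.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int64_t> ParseGpuResources(const std::vector<std::string>& entries) {
    std::vector<int64_t> gpu_ids;
    for (const auto& entry : entries) {
        if (entry.compare(0, 3, "gpu") != 0 || entry.size() == 3) {
            throw std::invalid_argument("resource must be in format gpux: " + entry);
        }
        gpu_ids.push_back(std::stoll(entry.substr(3)));  // throws on a non-numeric suffix
    }
    return gpu_ids;
}

int main() {
    for (int64_t id : ParseGpuResources({"gpu0", "gpu2"})) {
        std::printf("search gpu id: %lld\n", static_cast<long long>(id));
    }
    return 0;
}
```
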
+#ifdef MILVUS_GPU_VERSION +#include "scheduler/optimizer/FaissIVFPQPass.h" +#include "cache/GpuCacheMgr.h" +#include "scheduler/SchedInst.h" +#include "scheduler/Utils.h" +#include "scheduler/task/SearchTask.h" +#include "scheduler/tasklabel/SpecResLabel.h" +#include "server/Config.h" +#include "utils/Log.h" + +namespace milvus { +namespace scheduler { + +void +FaissIVFPQPass::Init() { +#ifdef MILVUS_GPU_VERSION + server::Config& config = server::Config::GetInstance(); + Status s = config.GetEngineConfigGpuSearchThreshold(threshold_); + if (!s.ok()) { + threshold_ = std::numeric_limits::max(); + } + s = config.GetGpuResourceConfigSearchResources(gpus); + if (!s.ok()) { + throw; + } +#endif +} + +bool +FaissIVFPQPass::Run(const TaskPtr& task) { + if (task->Type() != TaskType::SearchTask) { + return false; + } + + auto search_task = std::static_pointer_cast(task); + if (search_task->file_->engine_type_ != (int)engine::EngineType::FAISS_PQ) { + return false; + } + + auto search_job = std::static_pointer_cast(search_task->job_.lock()); + ResourcePtr res_ptr; + if (search_job->nq() < threshold_) { + SERVER_LOG_DEBUG << "FaissIVFPQPass: nq < gpu_search_threshold, specify cpu to search!"; + res_ptr = ResMgrInst::GetInstance()->GetResource("cpu"); + } else { + auto best_device_id = count_ % gpus.size(); + SERVER_LOG_DEBUG << "FaissIVFPQPass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; + count_++; + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); + } + auto label = std::make_shared(res_ptr); + task->label() = label; + return true; +} + +} // namespace scheduler +} // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFPQPass.h b/core/src/scheduler/optimizer/FaissIVFPQPass.h new file mode 100644 index 0000000000..9225f84b7c --- /dev/null +++ b/core/src/scheduler/optimizer/FaissIVFPQPass.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
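
FaissIVFPQPass is registered in the OptimizerInst pass list (see the SchedInst.h hunk above) together with the other index-specific passes, with FallbackPass pushed last. Assuming the optimizer tries each pass in registration order and stops at the first Run() that returns true, which is what the fallback arrangement suggests, the dispatch pattern looks roughly like this sketch; the class names here are placeholders, not the scheduler's real types:

```cpp
// Minimal sketch of the pass-dispatch pattern (assumption for illustration):
// passes are consulted in registration order, the first Run() returning true
// claims the task, and a fallback registered last handles everything else.
#include <cstdio>
#include <memory>
#include <vector>

struct Task {};  // stand-in for a scheduler task

struct Pass {
    virtual ~Pass() = default;
    virtual bool Run(const std::shared_ptr<Task>& task) = 0;  // true = task handled
};

struct PqOnlyPass : Pass {  // a specialized pass may decline a task
    bool Run(const std::shared_ptr<Task>&) override {
        std::printf("PqOnlyPass: not my engine type, skipping\n");
        return false;
    }
};

struct FallbackLikePass : Pass {  // registered last, accepts whatever is left
    bool Run(const std::shared_ptr<Task>&) override {
        std::printf("FallbackLikePass: labeling task for CPU\n");
        return true;
    }
};

void ApplyPasses(const std::vector<std::shared_ptr<Pass>>& pass_list,
                 const std::shared_ptr<Task>& task) {
    for (const auto& pass : pass_list) {
        if (pass->Run(task)) {
            break;  // first pass that claims the task wins
        }
    }
}

int main() {
    std::vector<std::shared_ptr<Pass>> pass_list;
    pass_list.push_back(std::make_shared<PqOnlyPass>());
    pass_list.push_back(std::make_shared<FallbackLikePass>());
    ApplyPasses(pass_list, std::make_shared<Task>());
    return 0;
}
```
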
+#ifdef MILVUS_GPU_VERSION +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Pass.h" + +namespace milvus { +namespace scheduler { + +class FaissIVFPQPass : public Pass { + public: + FaissIVFPQPass() = default; + + public: + void + Init() override; + + bool + Run(const TaskPtr& task) override; + + private: + int64_t threshold_ = std::numeric_limits::max(); + int64_t count_ = 0; + std::vector gpus; +}; + +using FaissIVFPQPassPtr = std::shared_ptr; + +} // namespace scheduler +} // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFSQ8HPass.cpp b/core/src/scheduler/optimizer/FaissIVFSQ8HPass.cpp index cc04944fda..ad4184187d 100644 --- a/core/src/scheduler/optimizer/FaissIVFSQ8HPass.cpp +++ b/core/src/scheduler/optimizer/FaissIVFSQ8HPass.cpp @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - +#ifdef MILVUS_GPU_VERSION #include "scheduler/optimizer/FaissIVFSQ8HPass.h" #include "cache/GpuCacheMgr.h" #include "scheduler/SchedInst.h" @@ -29,7 +29,7 @@ namespace scheduler { void FaissIVFSQ8HPass::Init() { -#ifdef MILVUS_GPU_VERSION +#ifdef CUSTOMIZATION server::Config& config = server::Config::GetInstance(); Status s = config.GetEngineConfigGpuSearchThreshold(threshold_); if (!s.ok()) { @@ -41,6 +41,7 @@ FaissIVFSQ8HPass::Init() { bool FaissIVFSQ8HPass::Run(const TaskPtr& task) { +#ifdef CUSTOMIZATION if (task->Type() != TaskType::SearchTask) { return false; } @@ -60,12 +61,14 @@ FaissIVFSQ8HPass::Run(const TaskPtr& task) { SERVER_LOG_DEBUG << "FaissIVFSQ8HPass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; count_++; - res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id); + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); } auto label = std::make_shared(res_ptr); task->label() = label; return true; +#endif } } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFSQ8HPass.h b/core/src/scheduler/optimizer/FaissIVFSQ8HPass.h index 0d2892809f..4e1e37730e 100644 --- a/core/src/scheduler/optimizer/FaissIVFSQ8HPass.h +++ b/core/src/scheduler/optimizer/FaissIVFSQ8HPass.h @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#ifdef MILVUS_GPU_VERSION #pragma once #include @@ -54,3 +55,4 @@ using FaissIVFSQ8HPassPtr = std::shared_ptr; } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFSQ8Pass.cpp b/core/src/scheduler/optimizer/FaissIVFSQ8Pass.cpp index c2a1858966..280b024894 100644 --- a/core/src/scheduler/optimizer/FaissIVFSQ8Pass.cpp +++ b/core/src/scheduler/optimizer/FaissIVFSQ8Pass.cpp @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
- +#ifdef MILVUS_GPU_VERSION #include "scheduler/optimizer/FaissIVFSQ8Pass.h" #include "cache/GpuCacheMgr.h" #include "scheduler/SchedInst.h" @@ -63,7 +63,7 @@ FaissIVFSQ8Pass::Run(const TaskPtr& task) { SERVER_LOG_DEBUG << "FaissIVFSQ8Pass: nq > gpu_search_threshold, specify gpu" << best_device_id << " to search!"; count_++; - res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, best_device_id); + res_ptr = ResMgrInst::GetInstance()->GetResource(ResourceType::GPU, gpus[best_device_id]); } auto label = std::make_shared(res_ptr); task->label() = label; @@ -72,3 +72,4 @@ FaissIVFSQ8Pass::Run(const TaskPtr& task) { } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/scheduler/optimizer/FaissIVFSQ8Pass.h b/core/src/scheduler/optimizer/FaissIVFSQ8Pass.h index e92ea2fe4e..47033fc790 100644 --- a/core/src/scheduler/optimizer/FaissIVFSQ8Pass.h +++ b/core/src/scheduler/optimizer/FaissIVFSQ8Pass.h @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#ifdef MILVUS_GPU_VERSION #pragma once #include @@ -54,3 +55,4 @@ using FaissIVFSQ8PassPtr = std::shared_ptr; } // namespace scheduler } // namespace milvus +#endif diff --git a/core/src/sdk/include/MilvusApi.h b/core/src/sdk/include/MilvusApi.h index 5c7736d4e2..0937ce2f8e 100644 --- a/core/src/sdk/include/MilvusApi.h +++ b/core/src/sdk/include/MilvusApi.h @@ -85,18 +85,18 @@ struct RowRecord { * @brief TopK query result */ struct QueryResult { - std::vector ids; - std::vector distances; + std::vector ids; ///< Query ids result + std::vector distances; ///< Query distances result }; -using TopKQueryResult = std::vector; +using TopKQueryResult = std::vector; ///< Topk query result /** * @brief index parameters */ struct IndexParam { - std::string table_name; - IndexType index_type; - int32_t nlist; + std::string table_name; ///< Table name for create index + IndexType index_type; ///< Create index type + int32_t nlist; ///< Index nlist }; /** @@ -142,8 +142,8 @@ class Connection { /** * @brief Connect * - * Connect function should be called before any operations - * Server will be connected after Connect return OK + * This method is used to connect server. + * Connect function should be called before any operations. * * @param param, use to provide server information * @@ -156,10 +156,10 @@ class Connection { /** * @brief Connect * - * Connect function should be called before any operations - * Server will be connected after Connect return OK + * This method is used to connect server. + * Connect function should be called before any operations. * - * @param uri, use to provide server information, example: milvus://ipaddress:port + * @param uri, use to provide server uri, example: milvus://ipaddress:port * * @return Indicate if connect is successful */ @@ -169,7 +169,7 @@ class Connection { /** * @brief connected * - * Connection status. + * This method is used to test whether server is connected. * * @return Indicate if connection status */ @@ -179,7 +179,7 @@ class Connection { /** * @brief Disconnect * - * Server will be disconnected after Disconnect return OK + * This method is used to disconnect server. * * @return Indicate if disconnect is successful */ @@ -189,7 +189,7 @@ class Connection { /** * @brief Create table method * - * This method is used to create table + * This method is used to create table. * * @param param, use to provide table information to be created. 
* @@ -201,7 +201,7 @@ class Connection { /** * @brief Test table existence method * - * This method is used to create table + * This method is used to create table. * * @param table_name, target table's name. * @@ -211,13 +211,13 @@ class Connection { HasTable(const std::string& table_name) = 0; /** - * @brief Delete table method + * @brief Drop table method * - * This method is used to delete table(and its partitions). + * This method is used to drop table(and its partitions). * * @param table_name, target table's name. * - * @return Indicate if table is delete successfully. + * @return Indicate if table is drop successfully. */ virtual Status DropTable(const std::string& table_name) = 0; @@ -239,14 +239,17 @@ class Connection { CreateIndex(const IndexParam& index_param) = 0; /** - * @brief Add vector to table + * @brief Insert vector to table * - * This method is used to add vector array to table. + * This method is used to insert vector array to table. * * @param table_name, target table's name. * @param partition_tag, target partition's tag, keep empty if no partition. * @param record_array, vector array is inserted. - * @param id_array, after inserted every vector is given a id. + * @param id_array, + * specify id for each vector, + * if this array is empty, milvus will generate unique id for each vector, + * and return all ids by this parameter. * * @return Indicate if vector array are inserted successfully */ @@ -259,11 +262,12 @@ class Connection { * * This method is used to query vector in table. * - * @param table_name, target table's name, keep empty if no partition. - * @param partition_tags, target partitions. + * @param table_name, target table's name. + * @param partition_tags, target partitions, keep empty if no partition. * @param query_record_array, all vector are going to be queried. - * @param query_range_array, time ranges, if not specified, will search in whole table + * @param query_range_array, [deprecated] time ranges, if not specified, will search in whole table * @param topk, how many similarity vectors will be searched. + * @param nprobe, the number of centroids choose to search. * @param topk_query_result_array, result array. * * @return Indicate if query is successful. @@ -304,7 +308,7 @@ class Connection { * * This method is used to list all tables. * - * @param table_array, all tables are push into the array. + * @param table_array, all tables in database. * * @return Indicate if this operation is successful. */ @@ -346,12 +350,13 @@ class Connection { * * This method is internal used. * - * @return Server status. + * @return Task information in tasktables. */ virtual std::string DumpTaskTables() const = 0; /** + * [deprecated] * @brief delete tables by date range * * This method is used to delete table data by date range. 
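
To show how a client consumes the QueryResult / TopKQueryResult layout documented above, here is a small self-contained sketch; the element types int64_t and float are assumed from the SDK definitions, and PrintTopK is an illustrative helper, not part of MilvusApi.h:

```cpp
// Mirrors the SDK structs documented above (element types assumed int64_t/float).
#include <cstdint>
#include <cstdio>
#include <vector>

struct QueryResult {
    std::vector<int64_t> ids;      // ids of the topk matches for one query vector
    std::vector<float> distances;  // matching distances, same order as ids
};
using TopKQueryResult = std::vector<QueryResult>;  // one entry per query vector

void PrintTopK(const TopKQueryResult& result) {
    for (size_t q = 0; q < result.size(); ++q) {
        for (size_t j = 0; j < result[q].ids.size(); ++j) {
            std::printf("query %zu: id=%lld distance=%.4f\n", q,
                        static_cast<long long>(result[q].ids[j]),
                        result[q].distances[j]);
        }
    }
}

int main() {
    TopKQueryResult result = {{{42, 7}, {0.12f, 0.34f}}};  // one query, topk = 2
    PrintTopK(result);
    return 0;
}
```

Each QueryResult corresponds to one query vector, with ids and distances kept as parallel arrays of up to topk entries.
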
diff --git a/core/src/server/Config.cpp b/core/src/server/Config.cpp index 95bab84391..6643841fcc 100644 --- a/core/src/server/Config.cpp +++ b/core/src/server/Config.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "config/YamlConfigMgr.h" @@ -33,6 +34,8 @@ namespace server { constexpr uint64_t GB = 1UL << 30; +static const std::unordered_map milvus_config_version_map({{"0.6.0", "0.1"}}); + Config& Config::GetInstance() { static Config config_inst; @@ -69,6 +72,12 @@ Status Config::ValidateConfig() { Status s; + std::string config_version; + s = GetConfigVersion(config_version); + if (!s.ok()) { + return s; + } + /* server config */ std::string server_addr; s = GetServerConfigAddress(server_addr); @@ -196,6 +205,7 @@ Config::ValidateConfig() { return s; } + std::cout << "GPU resources " << (gpu_resource_enable ? "ENABLED !" : "DISABLED !") << std::endl; if (gpu_resource_enable) { int64_t resource_cache_capacity; s = GetGpuResourceConfigCacheCapacity(resource_cache_capacity); @@ -383,6 +393,16 @@ Config::PrintAll() { } //////////////////////////////////////////////////////////////////////////////// +Status +Config::CheckConfigVersion(const std::string& value) { + if (milvus_config_version_map.at(MILVUS_VERSION) != value) { + std::string msg = "Invalid config version: " + value + + ". Expected config version: " + milvus_config_version_map.at(MILVUS_VERSION); + return Status(SERVER_INVALID_ARGUMENT, msg); + } + return Status::OK(); +} + Status Config::CheckServerConfigAddress(const std::string& value) { if (!ValidationUtil::ValidateIpAddress(value).ok()) { @@ -766,10 +786,14 @@ Config::CheckGpuResourceConfigBuildIndexResources(const std::vector //////////////////////////////////////////////////////////////////////////////// ConfigNode& -Config::GetConfigNode(const std::string& name) { +Config::GetConfigRoot() { ConfigMgr* mgr = YamlConfigMgr::GetInstance(); - ConfigNode& root_node = mgr->GetRootNode(); - return root_node.GetChild(name); + return mgr->GetRootNode(); +} + +ConfigNode& +Config::GetConfigNode(const std::string& name) { + return GetConfigRoot().GetChild(name); } Status @@ -816,6 +840,12 @@ Config::GetConfigSequenceStr(const std::string& parent_key, const std::string& c return value; } +Status +Config::GetConfigVersion(std::string& value) { + value = GetConfigRoot().GetValue(CONFIG_VERSION); + return CheckConfigVersion(value); +} + Status Config::GetServerConfigAddress(std::string& value) { value = GetConfigStr(CONFIG_SERVER, CONFIG_SERVER_ADDRESS, CONFIG_SERVER_ADDRESS_DEFAULT); diff --git a/core/src/server/Config.h b/core/src/server/Config.h index 281a832d57..4361cd1e37 100644 --- a/core/src/server/Config.h +++ b/core/src/server/Config.h @@ -28,6 +28,8 @@ namespace milvus { namespace server { +static const char* CONFIG_VERSION = "version"; + /* server config */ static const char* CONFIG_SERVER = "server_config"; static const char* CONFIG_SERVER_ADDRESS = "address"; @@ -115,6 +117,8 @@ class Config { PrintAll(); private: + ConfigNode& + GetConfigRoot(); ConfigNode& GetConfigNode(const std::string& name); Status @@ -125,6 +129,9 @@ class Config { PrintConfigSection(const std::string& config_node_name); /////////////////////////////////////////////////////////////////////////// + Status + CheckConfigVersion(const std::string& value); + /* server config */ Status CheckServerConfigAddress(const std::string& value); @@ -193,6 +200,8 @@ class Config { std::string GetConfigSequenceStr(const std::string& parent_key, const std::string& child_key, const 
std::string& delim = ",", const std::string& default_value = ""); + Status + GetConfigVersion(std::string& value); public: /* server config */ diff --git a/core/src/server/Server.cpp b/core/src/server/Server.cpp index 169463080e..100db5d2ec 100644 --- a/core/src/server/Server.cpp +++ b/core/src/server/Server.cpp @@ -183,10 +183,10 @@ Server::Start() { // print version information SERVER_LOG_INFO << "Milvus " << BUILD_TYPE << " version: v" << MILVUS_VERSION << ", built at " << BUILD_TIME; -#ifdef MILVUS_CPU_VERSION - SERVER_LOG_INFO << "CPU edition"; -#else +#ifdef MILVUS_GPU_VERSION SERVER_LOG_INFO << "GPU edition"; +#else + SERVER_LOG_INFO << "CPU edition"; #endif server::Metrics::GetInstance().Init(); server::SystemInfo::GetInstance().Init(); diff --git a/core/src/utils/CommonUtil.cpp b/core/src/utils/CommonUtil.cpp index 26e43619fb..cfadb2fcc4 100644 --- a/core/src/utils/CommonUtil.cpp +++ b/core/src/utils/CommonUtil.cpp @@ -16,6 +16,9 @@ // under the License. #include "utils/CommonUtil.h" +#include "cache/CpuCacheMgr.h" +#include "cache/GpuCacheMgr.h" +#include "server/Config.h" #include "utils/Log.h" #include @@ -27,6 +30,7 @@ #include #include #include +#include #include "boost/filesystem.hpp" @@ -222,5 +226,24 @@ CommonUtil::ConvertTime(tm time_struct, time_t& time_integer) { time_integer = mktime(&time_struct); } +void +CommonUtil::EraseFromCache(const std::string& item_key) { + if (item_key.empty()) { + // SERVER_LOG_ERROR << "Empty key cannot be erased from cache"; + return; + } + + cache::CpuCacheMgr::GetInstance()->EraseItem(item_key); + +#ifdef MILVUS_GPU_VERSION + server::Config& config = server::Config::GetInstance(); + std::vector gpus; + Status s = config.GetGpuResourceConfigSearchResources(gpus); + for (auto& gpu : gpus) { + cache::GpuCacheMgr::GetInstance(gpu)->EraseItem(item_key); + } +#endif +} + } // namespace server } // namespace milvus diff --git a/core/src/utils/CommonUtil.h b/core/src/utils/CommonUtil.h index 121196986a..39b553d830 100644 --- a/core/src/utils/CommonUtil.h +++ b/core/src/utils/CommonUtil.h @@ -56,6 +56,9 @@ class CommonUtil { ConvertTime(time_t time_integer, tm& time_struct); static void ConvertTime(tm time_struct, time_t& time_integer); + + static void + EraseFromCache(const std::string& item_key); }; } // namespace server diff --git a/core/src/utils/TimeRecorder.cpp b/core/src/utils/TimeRecorder.cpp index f3061d9d2b..2072346942 100644 --- a/core/src/utils/TimeRecorder.cpp +++ b/core/src/utils/TimeRecorder.cpp @@ -96,4 +96,11 @@ TimeRecorder::ElapseFromBegin(const std::string& msg) { return span; } +TimeRecorderAuto::TimeRecorderAuto(const std::string& header, int64_t log_level) : TimeRecorder(header, log_level) { +} + +TimeRecorderAuto::~TimeRecorderAuto() { + ElapseFromBegin("totally cost"); +} + } // namespace milvus diff --git a/core/src/utils/TimeRecorder.h b/core/src/utils/TimeRecorder.h index cc0a86fbe0..8f6990f482 100644 --- a/core/src/utils/TimeRecorder.h +++ b/core/src/utils/TimeRecorder.h @@ -28,7 +28,7 @@ class TimeRecorder { public: explicit TimeRecorder(const std::string& header, int64_t log_level = 1); - ~TimeRecorder(); // trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5 + virtual ~TimeRecorder(); // trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5 double RecordSection(const std::string& msg); @@ -50,4 +50,11 @@ class TimeRecorder { int64_t log_level_; }; +class TimeRecorderAuto : public TimeRecorder { + public: + explicit TimeRecorderAuto(const std::string& header, int64_t log_level = 1); + + 
~TimeRecorderAuto(); +}; + } // namespace milvus diff --git a/core/src/wrapper/ConfAdapter.cpp b/core/src/wrapper/ConfAdapter.cpp index 7ad1b8b74b..b96f2c80e5 100644 --- a/core/src/wrapper/ConfAdapter.cpp +++ b/core/src/wrapper/ConfAdapter.cpp @@ -39,8 +39,6 @@ void ConfAdapter::MatchBase(knowhere::Config conf) { if (conf->metric_type == knowhere::DEFAULT_TYPE) conf->metric_type = knowhere::METRICTYPE::L2; - if (conf->gpu_id == knowhere::INVALID_VALUE) - conf->gpu_id = 0; } knowhere::Config @@ -134,7 +132,7 @@ IVFPQConfAdapter::Match(const TempMetaConf& metaconf) { /* * Faiss 1.6 - * Only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims per sub-quantizer are currently supporte with + * Only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims per sub-quantizer are currently supported with * no precomputed codes. Precomputed codes supports any number of dimensions, but will involve memory overheads. */ static std::vector support_dim_per_subquantizer{32, 28, 24, 20, 16, 12, 10, 8, 6, 4, 3, 2, 1}; @@ -152,7 +150,12 @@ IVFPQConfAdapter::Match(const TempMetaConf& metaconf) { if (resset.empty()) { // todo(linxj): throw exception here. - return nullptr; + WRAPPER_LOG_ERROR << "The dims of PQ is wrong : only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims per sub-" + "quantizer are currently supported with no precomputed codes."; + throw WrapperException( + "The dims of PQ is wrong : only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims " + "per sub-quantizer are currently supported with no precomputed codes."); + // return nullptr; } static int64_t compression_level = 1; // 1:low, 2:high if (compression_level == 1) { diff --git a/core/src/wrapper/VecImpl.cpp b/core/src/wrapper/VecImpl.cpp index 74e9e94a2f..dda7452cd0 100644 --- a/core/src/wrapper/VecImpl.cpp +++ b/core/src/wrapper/VecImpl.cpp @@ -180,13 +180,13 @@ VecIndexImpl::CopyToCpu(const Config& cfg) { #endif } -VecIndexPtr -VecIndexImpl::Clone() { - // TODO(linxj): exception handle - auto clone_index = std::make_shared(index_->Clone(), type); - clone_index->dim = dim; - return clone_index; -} +// VecIndexPtr +// VecIndexImpl::Clone() { +// // TODO(linxj): exception handle +// auto clone_index = std::make_shared(index_->Clone(), type); +// clone_index->dim = dim; +// return clone_index; +//} int64_t VecIndexImpl::GetDeviceId() { diff --git a/core/src/wrapper/VecImpl.h b/core/src/wrapper/VecImpl.h index 2b6f07827e..e8dee45d42 100644 --- a/core/src/wrapper/VecImpl.h +++ b/core/src/wrapper/VecImpl.h @@ -60,8 +60,8 @@ class VecIndexImpl : public VecIndex { Status Load(const knowhere::BinarySet& index_binary) override; - VecIndexPtr - Clone() override; + // VecIndexPtr + // Clone() override; int64_t GetDeviceId() override; diff --git a/core/src/wrapper/VecIndex.h b/core/src/wrapper/VecIndex.h index e69655b087..536ebb32f6 100644 --- a/core/src/wrapper/VecIndex.h +++ b/core/src/wrapper/VecIndex.h @@ -75,8 +75,8 @@ class VecIndex : public cache::DataObj { CopyToCpu(const Config& cfg = Config()) = 0; // TODO(linxj): Deprecated - virtual VecIndexPtr - Clone() = 0; + // virtual VecIndexPtr + // Clone() = 0; virtual int64_t GetDeviceId() = 0; diff --git a/core/unittest/db/test_db.cpp b/core/unittest/db/test_db.cpp index 217fbe429e..0a47ac1b9b 100644 --- a/core/unittest/db/test_db.cpp +++ b/core/unittest/db/test_db.cpp @@ -229,6 +229,7 @@ TEST_F(DBTest, DB_TEST) { } TEST_F(DBTest, SEARCH_TEST) { + milvus::scheduler::OptimizerInst::GetInstance()->Init(); std::string config_path(CONFIG_PATH); config_path += CONFIG_FILE; milvus::server::Config& config = 
milvus::server::Config::GetInstance(); @@ -290,8 +291,51 @@ TEST_F(DBTest, SEARCH_TEST) { milvus::engine::ResultDistances result_distances; stat = db_->Query(TABLE_NAME, tags, k, nq, 10, xq.data(), result_ids, result_distances); ASSERT_TRUE(stat.ok()); + stat = db_->Query(TABLE_NAME, tags, k, 1100, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); } + index.engine_type_ = (int)milvus::engine::EngineType::FAISS_IVFFLAT; + db_->CreateIndex(TABLE_NAME, index); // wait until build index finish + + { + std::vector tags; + milvus::engine::ResultIds result_ids; + milvus::engine::ResultDistances result_distances; + stat = db_->Query(TABLE_NAME, tags, k, nq, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + stat = db_->Query(TABLE_NAME, tags, k, 1100, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + } + + index.engine_type_ = (int)milvus::engine::EngineType::FAISS_IVFSQ8; + db_->CreateIndex(TABLE_NAME, index); // wait until build index finish + + { + std::vector tags; + milvus::engine::ResultIds result_ids; + milvus::engine::ResultDistances result_distances; + stat = db_->Query(TABLE_NAME, tags, k, nq, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + stat = db_->Query(TABLE_NAME, tags, k, 1100, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + } + +#ifdef CUSTOMIZATION + index.engine_type_ = (int)milvus::engine::EngineType::FAISS_IVFSQ8H; + db_->CreateIndex(TABLE_NAME, index); // wait until build index finish + + { + std::vector tags; + milvus::engine::ResultIds result_ids; + milvus::engine::ResultDistances result_distances; + stat = db_->Query(TABLE_NAME, tags, k, nq, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + stat = db_->Query(TABLE_NAME, tags, k, 1100, 10, xq.data(), result_ids, result_distances); + ASSERT_TRUE(stat.ok()); + } +#endif + { // search by specify index file milvus::engine::meta::DatesT dates; std::vector file_ids = {"1", "2", "3", "4", "5", "6"}; @@ -666,8 +710,8 @@ TEST_F(DBTest2, DELETE_BY_RANGE_TEST) { ASSERT_NE(size, 0UL); std::vector dates; - std::string start_value = CurrentTmDate(); - std::string end_value = CurrentTmDate(1); + std::string start_value = CurrentTmDate(-5); + std::string end_value = CurrentTmDate(5); ConvertTimeRangeToDBDates(start_value, end_value, dates); stat = db_->DropTable(TABLE_NAME, dates); diff --git a/core/unittest/db/test_engine.cpp b/core/unittest/db/test_engine.cpp index 5130a2c30d..e0edd05537 100644 --- a/core/unittest/db/test_engine.cpp +++ b/core/unittest/db/test_engine.cpp @@ -59,6 +59,29 @@ TEST_F(EngineTest, FACTORY_TEST) { ASSERT_TRUE(engine_ptr != nullptr); } + + { + auto engine_ptr = milvus::engine::EngineFactory::Build( + 512, "/tmp/milvus_index_1", milvus::engine::EngineType::FAISS_PQ, milvus::engine::MetricType::IP, 1024); + + ASSERT_TRUE(engine_ptr != nullptr); + } + + { + auto engine_ptr = milvus::engine::EngineFactory::Build( + 512, "/tmp/milvus_index_1", milvus::engine::EngineType::SPTAG_KDT, + milvus::engine::MetricType::L2, 1024); + + ASSERT_TRUE(engine_ptr != nullptr); + } + + { + auto engine_ptr = milvus::engine::EngineFactory::Build( + 512, "/tmp/milvus_index_1", milvus::engine::EngineType::SPTAG_KDT, + milvus::engine::MetricType::L2, 1024); + + ASSERT_TRUE(engine_ptr != nullptr); + } } TEST_F(EngineTest, ENGINE_IMPL_TEST) { @@ -69,7 +92,7 @@ TEST_F(EngineTest, ENGINE_IMPL_TEST) { std::vector data; std::vector ids; - const int row_count = 10000; + const int row_count = 500; 
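
Stepping back to the TimeRecorderAuto helper added in the utils/TimeRecorder.{h,cpp} hunks above: its destructor calls ElapseFromBegin("totally cost"), so a scoped instance times a whole block without an explicit final call. A simplified stand-in built on std::chrono (not the Milvus class itself) shows the same RAII pattern:

```cpp
// Simplified stand-in for the RAII timing idiom, not the Milvus class itself.
#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

class ScopedTimer {
 public:
    explicit ScopedTimer(std::string header)
        : header_(std::move(header)), start_(std::chrono::steady_clock::now()) {}

    ~ScopedTimer() {  // report total elapsed time when the scope ends
        double ms = std::chrono::duration<double, std::milli>(
                        std::chrono::steady_clock::now() - start_).count();
        std::printf("%s: totally cost %.3f ms\n", header_.c_str(), ms);
    }

 private:
    std::string header_;
    std::chrono::steady_clock::time_point start_;
};

int main() {
    ScopedTimer timer("build index");  // logs automatically when main returns
    // ... work to be timed goes here ...
    return 0;
}
```
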
data.reserve(row_count * dimension); ids.reserve(row_count); for (int64_t i = 0; i < row_count; i++) { @@ -88,12 +111,15 @@ TEST_F(EngineTest, ENGINE_IMPL_TEST) { status = engine_ptr->CopyToGpu(0, false); // ASSERT_TRUE(status.ok()); - auto new_engine = engine_ptr->Clone(); - ASSERT_EQ(new_engine->Dimension(), dimension); - ASSERT_EQ(new_engine->Count(), ids.size()); - status = new_engine->CopyToCpu(); +// auto new_engine = engine_ptr->Clone(); +// ASSERT_EQ(new_engine->Dimension(), dimension); +// ASSERT_EQ(new_engine->Count(), ids.size()); + status = engine_ptr->CopyToCpu(); // ASSERT_TRUE(status.ok()); - auto engine_build = new_engine->BuildIndex("/tmp/milvus_index_2", milvus::engine::EngineType::FAISS_IVFSQ8); + auto engine_build = engine_ptr->BuildIndex("/tmp/milvus_index_2", milvus::engine::EngineType::FAISS_IVFSQ8); + engine_build = engine_ptr->BuildIndex("/tmp/milvus_index_3", milvus::engine::EngineType::FAISS_PQ); + engine_build = engine_ptr->BuildIndex("/tmp/milvus_index_4", milvus::engine::EngineType::SPTAG_KDT); + engine_build = engine_ptr->BuildIndex("/tmp/milvus_index_5", milvus::engine::EngineType::SPTAG_BKT); // ASSERT_TRUE(status.ok()); } diff --git a/core/unittest/db/test_meta.cpp b/core/unittest/db/test_meta.cpp index 143bf39383..b89c73c296 100644 --- a/core/unittest/db/test_meta.cpp +++ b/core/unittest/db/test_meta.cpp @@ -329,7 +329,7 @@ TEST_F(MetaTest, TABLE_FILES_TEST) { status = impl_->CreateTableFile(table_file); table_file.file_type_ = milvus::engine::meta::TableFileSchema::NEW; status = impl_->UpdateTableFile(table_file); - status = impl_->CleanUp(); + status = impl_->CleanUpShadowFiles(); ASSERT_TRUE(status.ok()); status = impl_->DropTable(table_id); diff --git a/core/unittest/db/utils.cpp b/core/unittest/db/utils.cpp index 293eeccc69..a57bae79b5 100644 --- a/core/unittest/db/utils.cpp +++ b/core/unittest/db/utils.cpp @@ -68,17 +68,16 @@ static const char* CONFIG_STR = "engine_config:\n" " use_blas_threshold: 20\n" "\n" - "resource_config:\n" -#ifdef MILVUS_CPU_VERSION - " search_resources:\n" - " - cpu\n" - " index_build_device: cpu # CPU used for building index"; -#else - " search_resources:\n" - " - cpu\n" +#ifdef MILVUS_GPU_VERSION + "gpu_resource_config:\n" + " enable: true # whether to enable GPU resources\n" + " cache_capacity: 4 # GB, size of GPU memory per card used for cache, must be a positive integer\n" + " search_resources: # define the GPU devices used for search computation, must be in format gpux\n" + " - gpu0\n" + " build_index_resources: # define the GPU devices used for index building, must be in format gpux\n" " - gpu0\n" - " index_build_device: gpu0 # GPU used for building index"; #endif + "\n"; void WriteToFile(const std::string& file_path, const char* content) { diff --git a/core/unittest/scheduler/test_scheduler.cpp b/core/unittest/scheduler/test_scheduler.cpp index c839307958..dba8974698 100644 --- a/core/unittest/scheduler/test_scheduler.cpp +++ b/core/unittest/scheduler/test_scheduler.cpp @@ -37,10 +37,10 @@ class MockVecIndex : public engine::VecIndex { const float* xt = nullptr) { } - engine::VecIndexPtr - Clone() override { - return milvus::engine::VecIndexPtr(); - } +// engine::VecIndexPtr +// Clone() override { +// return milvus::engine::VecIndexPtr(); +// } int64_t GetDeviceId() override { diff --git a/core/unittest/server/test_cache.cpp b/core/unittest/server/test_cache.cpp index 92e09d4a26..ce8cfb304c 100644 --- a/core/unittest/server/test_cache.cpp +++ b/core/unittest/server/test_cache.cpp @@ -47,10 +47,10 @@ class 
MockVecIndex : public milvus::engine::VecIndex { return milvus::Status(); } - milvus::engine::VecIndexPtr - Clone() override { - return milvus::engine::VecIndexPtr(); - } +// milvus::engine::VecIndexPtr +// Clone() override { +// return milvus::engine::VecIndexPtr(); +// } int64_t GetDeviceId() override { @@ -120,7 +120,7 @@ TEST(CacheTest, DUMMY_TEST) { mock_index.Add(1, nullptr, nullptr); mock_index.BuildAll(1, nullptr, nullptr, cfg); mock_index.Search(1, nullptr, nullptr, nullptr, cfg); - mock_index.Clone(); +// mock_index.Clone(); mock_index.CopyToCpu(cfg); mock_index.CopyToGpu(1, cfg); mock_index.GetDeviceId(); diff --git a/core/unittest/server/utils.cpp b/core/unittest/server/utils.cpp index 0545d34575..c232b1185a 100644 --- a/core/unittest/server/utils.cpp +++ b/core/unittest/server/utils.cpp @@ -28,6 +28,8 @@ namespace { static const char* VALID_CONFIG_STR = "# Default values are used when you make no changes to the following parameters.\n" "\n" + "version: 0.1" + "\n" "server_config:\n" " address: 0.0.0.0 # milvus server ip address (IPv4)\n" " port: 19530 # port range: 1025 ~ 65534\n" @@ -52,24 +54,21 @@ static const char* VALID_CONFIG_STR = "cache_config:\n" " cpu_cache_capacity: 16 # GB, CPU memory used for cache\n" " cpu_cache_threshold: 0.85 \n" - " gpu_cache_capacity: 4 # GB, GPU memory used for cache\n" - " gpu_cache_threshold: 0.85 \n" " cache_insert_data: false # whether to load inserted data into cache\n" "\n" "engine_config:\n" " use_blas_threshold: 20 \n" "\n" - "resource_config:\n" -#ifdef MILVUS_CPU_VERSION - " search_resources:\n" - " - cpu\n" - " index_build_device: cpu # CPU used for building index"; -#else - " search_resources:\n" - " - cpu\n" +#ifdef MILVUS_GPU_VERSION + "gpu_resource_config:\n" + " enable: true # whether to enable GPU resources\n" + " cache_capacity: 4 # GB, size of GPU memory per card used for cache, must be a positive integer\n" + " search_resources: # define the GPU devices used for search computation, must be in format gpux\n" + " - gpu0\n" + " build_index_resources: # define the GPU devices used for index building, must be in format gpux\n" " - gpu0\n" - " index_build_device: gpu0 # GPU used for building index"; #endif + "\n"; static const char* INVALID_CONFIG_STR = "*INVALID*"; diff --git a/core/unittest/wrapper/test_wrapper.cpp b/core/unittest/wrapper/test_wrapper.cpp index 4019c0f63c..2653852494 100644 --- a/core/unittest/wrapper/test_wrapper.cpp +++ b/core/unittest/wrapper/test_wrapper.cpp @@ -181,12 +181,23 @@ TEST_P(KnowhereWrapperTest, SERIALIZE_TEST) { TEST(whatever, test_config) { milvus::engine::TempMetaConf conf; + conf.nprobe = 16; + conf.dim = 128; auto nsg_conf = std::make_shared(); nsg_conf->Match(conf); - nsg_conf->MatchSearch(conf, milvus::engine::IndexType::FAISS_IVFPQ_GPU); + nsg_conf->MatchSearch(conf, milvus::engine::IndexType::NSG_MIX); auto pq_conf = std::make_shared(); pq_conf->Match(conf); + pq_conf->MatchSearch(conf, milvus::engine::IndexType::FAISS_IVFPQ_MIX); + + auto kdt_conf = std::make_shared(); + kdt_conf->Match(conf); + kdt_conf->MatchSearch(conf, milvus::engine::IndexType::SPTAG_KDT_RNT_CPU); + + auto bkt_conf = std::make_shared(); + bkt_conf->Match(conf); + bkt_conf->MatchSearch(conf, milvus::engine::IndexType::SPTAG_BKT_RNT_CPU); } // #include "knowhere/index/vector_index/IndexIDMAP.h" diff --git a/core/unittest/wrapper/utils.cpp b/core/unittest/wrapper/utils.cpp index 96b9e643f5..a5f8e1b6b2 100644 --- a/core/unittest/wrapper/utils.cpp +++ b/core/unittest/wrapper/utils.cpp @@ -56,17 +56,16 @@ static const 
char* CONFIG_STR = "engine_config:\n" " blas_threshold: 20\n" "\n" - "resource_config:\n" -#ifdef MILVUS_CPU_VERSION - " search_resources:\n" - " - cpu\n" - " index_build_device: cpu # CPU used for building index"; -#else - " search_resources:\n" - " - cpu\n" +#ifdef MILVUS_GPU_VERSION + "gpu_resource_config:\n" + " enable: true # whether to enable GPU resources\n" + " cache_capacity: 4 # GB, size of GPU memory per card used for cache, must be a positive integer\n" + " search_resources: # define the GPU devices used for search computation, must be in format gpux\n" + " - gpu0\n" + " build_index_resources: # define the GPU devices used for index building, must be in format gpux\n" " - gpu0\n" - " index_build_device: gpu0 # GPU used for building index"; #endif + "\n"; void WriteToFile(const std::string& file_path, const char* content) { diff --git a/docs/test_report/ivfflat_test_report_cn.md b/docs/test_report/ivfflat_test_report_cn.md new file mode 100644 index 0000000000..b5e345cbce --- /dev/null +++ b/docs/test_report/ivfflat_test_report_cn.md @@ -0,0 +1,215 @@ +# ivfflat_test_report_cn + +## 概述 + +本文描述了ivfflat索引在milvus单机部署方式下的测试结果。 + + + +## 测试目标 + +参数不同情况下的查询时间和召回率。 + + + +## 测试方法 + +### 软硬件环境 + +操作系统:Ubuntu 18.04 + +CPU:Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz + +GPU0:GeForce RTX 2080Ti 11GB + +GPU1:GeForce RTX 2080Ti 11GB + +GPU2:GeForce RTX 2080Ti 11GB + +GPU3:GeForce RTX 2080Ti 11GB + +内存:768GB + +Docker版本:19.03 + +NVIDIA Driver版本:430.50 + +Milvus版本:0.5.3 + +SDK接口:Python 3.6.8 + +pymilvus版本:0.2.5 + + + +### 数据模型 + +本测试中用到的主要数据: + +- 数据来源:sift1b +- 数据类型:hdf5 + +关于该数据集的详细信息请参考 : http://corpus-texmex.irisa.fr/ 。 + + + +### 测试指标 + +- Query Elapsed Time:数据库查询所有向量的时间(以秒计)。影响Query Elapsed Time的变量: + + - nq (被查询向量的数量) + + > 备注:在向量查询测试中,我们会测试下面参数不同的取值来观察结果: + > + > 被查询向量的数量nq将按照 [1, 5, 10, 100, 200, 400, 600, 800, 1000]的数量分组。 + +- Recall: 实际返回的正确结果占总数之比。影响Recall的变量: + + - nq (被查询向量的数量) + - topk (单条查询中最相似的K个结果) + + > 备注:在向量准确性测试中,我们会测试下面参数不同的取值来观察结果: + > + > 被查询向量的数量nq将按照 [50, 200, 400, 600, 800, 1000]的数量分组, + > + > 单条查询中最相似的K个结果topk将按照[1, 10, 100]的数量分组。 + + + +## 测试报告 + +### 测试环境 + +数据集:sift1b-1,000,000,000向量,128维 + +表格属性: + +- nlist: 16384 +- metric_type: L2 + +查询设置: + +- nprobe: 32 + +Milvus设置: + +- cpu_cache_capacity: 600 +- gpu_cache_capacity: 6 +- use_blas_threshold: 2100 + +Milvus设置的详细定义可以参考 https://milvus.io/docs/en/reference/milvus_config/ 。 + +测试方法 + +通过一次仅改变一个参数的值,测试查询向量时间和召回率。 + +- 查询后是否重启Milvus:否 + + + +### 性能测试 + +#### 数据查询 + +测试结果 + +Query Elapsed Time + +topk : 100 + +search_resources: cpu, gpu0, gpu1, gpu2, gpu3 + +| nq/topk | topk=100 | +| :-----: | :------: | +| nq=1 | 0.649 | +| nq=5 | 0.911 | +| nq=10 | 1.393 | +| nq=100 | 2.189 | +| nq=200 | 6.134 | +| nq=400 | 9.480 | +| nq=600 | 16.616 | +| nq=800 | 22.225 | +| nq=1000 | 25.901 | + +当nq为1000时,在CPU模式下查询一条128维向量需要耗时约26毫秒。 + + + +topk : 100 + +search_resources: gpu0, gpu1, gpu2, gpu3 + +| nq/topk | topk=100 | +| :-----: | :------: | +| nq=1 | 14.348 | +| nq=5 | 14.326 | +| nq=10 | 14.387 | +| nq=100 | 14.684 | +| nq=200 | 14.665 | +| nq=400 | 14.750 | +| nq=600 | 15.009 | +| nq=800 | 15.350 | +| nq=1000 | 15.336 | + +当nq为1000时,在GPU模式下查询一条128维向量需要耗时约15毫秒。 + + + +**总结** + +在CPU模式下查询耗时随nq的增长快速增大,而在GPU模式下查询耗时的增大则缓慢许多。当nq较小时,CPU模式比GPU模式耗时更少。但当nq足够大时,GPU模式则更具有优势。 + +在GPU模式下的查询耗时由两部分组成:(1)索引从CPU到GPU的拷贝时间;(2)所有分桶的查询时间。当nq小于500时,索引从CPU到GPU 的拷贝时间无法被有效均摊,此时CPU模式时一个更优的选择;当nq大于500时,选择GPU模式更合理。 + +和CPU相比,GPU具有更多的核数和更强的算力。当nq较大时,GPU在计算上的优势能被更好地被体现。 + + + +### 召回率测试 + +**测试结果** + +topk = 1 : recall - recall@1 + +topk = 10 : recall - 
recall@10
+
+topk = 100 : recall - recall@100
+
+我们利用sift1b数据集中的ground_truth来计算查询结果的召回率。
+
+
+
+Recall of CPU Mode
+
+search_resources: cpu, gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=1 | topk=10 | topk=100 |
+| :-----: | :----: | :-----: | :------: |
+| nq=50 | 0.960 | 0.952 | 0.936 |
+| nq=200 | 0.975 | 0.964 | 0.939 |
+| nq=400 | 0.983 | 0.967 | 0.937 |
+| nq=600 | 0.970 | 0.964 | 0.939 |
+| nq=800 | 0.970 | 0.960 | 0.939 |
+| nq=1000 | 0.976 | 0.961 | 0.941 |
+
+
+
+Recall of GPU Mode
+
+search_resources: gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=1 | topk=10 | topk=100 |
+| :-----: | :----: | :-----: | :------: |
+| nq=50 | 0.980 | 0.952 | 0.946 |
+| nq=200 | 0.970 | 0.962 | 0.934 |
+| nq=400 | 0.975 | 0.953 | 0.939 |
+| nq=600 | 0.970 | 0.957 | 0.939 |
+| nq=800 | 0.981 | 0.963 | 0.941 |
+| nq=1000 | 0.979 | 0.964 | 0.938 |
+
+
+
+**总结**
+
+随着nq的增大,召回率逐渐稳定至93%以上。
+
diff --git a/docs/test_report/ivfflat_test_report_en.md b/docs/test_report/ivfflat_test_report_en.md
new file mode 100644
index 0000000000..f26db9c44b
--- /dev/null
+++ b/docs/test_report/ivfflat_test_report_en.md
@@ -0,0 +1,214 @@
+# ivfflat_test_report_en
+
+## Summary
+
+This document contains the test report of the IVF_FLAT index on a Milvus single-server deployment.
+
+
+
+## Test objectives
+
+Measure the query time cost and recall when searching with different parameters.
+
+
+
+## Test method
+
+### Hardware/Software requirements
+
+Operating System: Ubuntu 18.04
+
+CPU: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz
+
+GPU0: GeForce RTX 2080Ti 11GB
+
+GPU1: GeForce RTX 2080Ti 11GB
+
+GPU2: GeForce RTX 2080Ti 11GB
+
+GPU3: GeForce RTX 2080Ti 11GB
+
+Memory: 768GB
+
+Docker version: 19.03
+
+NVIDIA Driver version: 430.50
+
+Milvus version: 0.5.3
+
+SDK interface: Python 3.6.8
+
+pymilvus version: 0.2.5
+
+
+
+### Data model
+
+The data used in the tests are:
+
+- Data source: sift1b
+- Data type: hdf5
+
+For details on this dataset, see http://corpus-texmex.irisa.fr/ .
+
+
+
+### Measures
+
+- Query Elapsed Time: Time cost (in seconds) to run a query. Variables that affect Query Elapsed Time:
+
+  - nq (Number of queried vectors)
+
+  > Note: In the query elapsed time test, we test the following values:
+  >
+  > nq - grouped by: [1, 5, 10, 100, 200, 400, 600, 800, 1000]
+
+- Recall: The fraction of the relevant instances that were actually retrieved. Variables that affect Recall:
+
+  - nq (Number of queried vectors)
+  - topk (Top k results of a query)
+
+  > Note: In the recall test, we test the following values:
+  >
+  > nq - grouped by: [50, 200, 400, 600, 800, 1000],
+  >
+  > topk - grouped by: [1, 10, 100]
+
+
+
+## Test reports
+
+### Test environment
+
+Dataset: sift1b - 1,000,000,000 vectors, 128-dimension
+
+Table attributes:
+
+- nlist: 16384
+- metric_type: L2
+
+Query configuration:
+
+- nprobe: 32
+
+Milvus configuration:
+
+- cpu_cache_capacity: 600
+- gpu_cache_capacity: 6
+- use_blas_threshold: 2100
+
+The definitions of the Milvus configuration items are available at https://milvus.io/docs/en/reference/milvus_config/.
+
+Test method
+
+Test the query elapsed time and recall with several parameters, changing only one parameter at a time.
+
+- Whether to restart Milvus after each query: No
+
+
+
+### Performance test
+
+#### Data query
+
+**Test result**
+
+Query Elapsed Time
+
+topk : 100
+
+search_resources: cpu, gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=100 |
+| :-----: | :------: |
+| nq=1 | 0.649 |
+| nq=5 | 0.911 |
+| nq=10 | 1.393 |
+| nq=100 | 2.189 |
+| nq=200 | 6.134 |
+| nq=400 | 9.480 |
+| nq=600 | 16.616 |
+| nq=800 | 22.225 |
+| nq=1000 | 25.901 |
+
+When nq is 1000, the average query time per 128-dimension vector is around 26 ms in CPU Mode.
+
+
+
+topk : 100
+
+search_resources: gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=100 |
+| :-----: | :------: |
+| nq=1 | 14.348 |
+| nq=5 | 14.326 |
+| nq=10 | 14.387 |
+| nq=100 | 14.684 |
+| nq=200 | 14.665 |
+| nq=400 | 14.750 |
+| nq=600 | 15.009 |
+| nq=800 | 15.350 |
+| nq=1000 | 15.336 |
+
+When nq is 1000, the average query time per 128-dimension vector is around 15 ms in GPU Mode.
+
+
+
+**Conclusion**
+
+The query elapsed time in CPU Mode increases quickly with nq, while in GPU Mode it increases much more slowly. When nq is small, CPU Mode consumes less time than GPU Mode. However, as nq becomes larger, GPU Mode shows its advantage over CPU Mode.
+
+The query elapsed time in GPU Mode consists of two parts: (1) the index CPU-to-GPU copy time; (2) the search time over the nprobe buckets. When nq is smaller than 500, the index CPU-to-GPU copy time cannot be amortized efficiently, so CPU Mode is the better choice; when nq is larger than 500, GPU Mode is the better choice.
+
+Compared with the CPU, the GPU has many more cores and stronger computing capability. When nq is large, this computing advantage shows more clearly.
+
+
+
+### Recall test
+
+**Test result**
+
+topk = 1 : recall - recall@1
+
+topk = 10 : recall - recall@10
+
+topk = 100 : recall - recall@100
+
+We use the ground_truth in the sift1b dataset to calculate the recall of the query results (a minimal sketch of this computation is appended at the end of this report).
+
+
+
+Recall of CPU Mode
+
+search_resources: cpu, gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=1 | topk=10 | topk=100 |
+| :-----: | :----: | :-----: | :------: |
+| nq=50 | 0.960 | 0.952 | 0.936 |
+| nq=200 | 0.975 | 0.964 | 0.939 |
+| nq=400 | 0.983 | 0.967 | 0.937 |
+| nq=600 | 0.970 | 0.964 | 0.939 |
+| nq=800 | 0.970 | 0.960 | 0.939 |
+| nq=1000 | 0.976 | 0.961 | 0.941 |
+
+
+
+Recall of GPU Mode
+
+search_resources: gpu0, gpu1, gpu2, gpu3
+
+| nq/topk | topk=1 | topk=10 | topk=100 |
+| :-----: | :----: | :-----: | :------: |
+| nq=50 | 0.980 | 0.952 | 0.946 |
+| nq=200 | 0.970 | 0.962 | 0.934 |
+| nq=400 | 0.975 | 0.953 | 0.939 |
+| nq=600 | 0.970 | 0.957 | 0.939 |
+| nq=800 | 0.981 | 0.963 | 0.941 |
+| nq=1000 | 0.979 | 0.964 | 0.938 |
+
+
+
+**Conclusion**
+
+As nq increases, the recall gradually stabilizes to over 93%.
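+
+
+
+### Appendix: recall computation sketch
+
+The recall numbers above are computed offline against the sift1b ground truth. The snippet below is a minimal sketch of that computation, not the exact script used for this report. It assumes that recall@k is measured as the overlap between the returned top-k IDs and the ground-truth top-k IDs, that the ground-truth neighbor IDs are already loaded into a NumPy array (one row per query), and that the IDs returned by `search_vectors` have been collected into per-query lists; the function and variable names are illustrative only.
+
+```python
+import numpy as np
+
+def recall_at_k(result_ids, ground_truth, k):
+    """Average recall@k over all queries.
+
+    result_ids:   per-query lists of IDs returned by the search, best match first
+    ground_truth: array of shape (nq, >=k) with the true nearest-neighbor IDs per query
+    k:            cut-off, e.g. 1, 10 or 100 in this report
+    """
+    hits = 0
+    for ids, gt in zip(result_ids, ground_truth):
+        # count how many of the true top-k neighbors appear in the returned top-k
+        hits += len(set(ids[:k]) & set(gt[:k]))
+    return hits / (len(result_ids) * k)
+
+# toy usage with made-up IDs; the real test reads the sift1b ground truth instead
+gt = np.array([[1, 2, 3], [4, 5, 6]])
+returned = [[1, 3, 9], [4, 5, 6]]
+print(recall_at_k(returned, gt, k=3))  # 0.8333...
+```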
\ No newline at end of file diff --git a/shards/mishards/service_handler.py b/shards/mishards/service_handler.py index f192aff72c..1b56c864ed 100644 --- a/shards/mishards/service_handler.py +++ b/shards/mishards/service_handler.py @@ -61,6 +61,10 @@ class ServiceHandler(milvus_pb2_grpc.MilvusServiceServicer): return status, [], [] row_num = files_collection.row_num + # row_num is equal to 0, result is empty + if not row_num: + continue + ids = files_collection.ids diss = files_collection.distances # distance collections # TODO: batch_len is equal to topk, may need to compare with topk @@ -136,15 +140,12 @@ class ServiceHandler(milvus_pb2_grpc.MilvusServiceServicer): with self.tracer.start_span('search_{}'.format(addr), child_of=span): - ret = conn.search_vectors_in_files( - table_name=query_params['table_id'], - file_ids=query_params['file_ids'], - query_records=vectors, - top_k=topk, - nprobe=nprobe - ) + ret = conn.search_vectors_in_files(table_name=query_params['table_id'], + file_ids=query_params['file_ids'], + query_records=vectors, + top_k=topk, + nprobe=nprobe) end = time.time() - logger.info('search_vectors_in_files takes: {}'.format(end - start)) all_topk_results.append(ret) @@ -317,12 +318,12 @@ class ServiceHandler(milvus_pb2_grpc.MilvusServiceServicer): metadata=metadata) now = time.time() - logger.info('SearchVector takes: {}'.format(now - start)) + # logger.info('SearchVector takes: {}'.format(now - start)) topk_result_list = milvus_pb2.TopKQueryResult( status=status_pb2.Status(error_code=status.error_code, reason=status.reason), - row_num=len(query_record_array), + row_num=len(request.query_record_array) if len(id_results) else 0, ids=id_results, distances=dis_results) return topk_result_list diff --git a/shards/mishards/settings.py b/shards/mishards/settings.py index 832f1639ea..3ab4777369 100644 --- a/shards/mishards/settings.py +++ b/shards/mishards/settings.py @@ -50,7 +50,6 @@ class TracingConfig: } } - max_overflow=0 class DefaultConfig: SQLALCHEMY_DATABASE_URI = env.str('SQLALCHEMY_DATABASE_URI') diff --git a/shards/utils/colors.py b/shards/utils/colors.py new file mode 100644 index 0000000000..74ce614d5c --- /dev/null +++ b/shards/utils/colors.py @@ -0,0 +1,72 @@ +# Reset +Color_Off = '\033[0m' # Text Reset + +# Regular Colors +Black = '\033[0;30m' # Black +Red = '\033[0;31m' # Red +Green = '\033[0;32m' # Green +Yellow = '\033[0;33m' # Yellow +Blue = '\033[0;34m' # Blue +Purple = '\033[0;35m' # Purple +Cyan = '\033[0;36m' # Cyan +White = '\033[0;37m' # White + +# Bold +BBlack = '\033[1;30m' # Black +BRed = '\033[1;31m' # Red +BGreen = '\033[1;32m' # Green +BYellow = '\033[1;33m' # Yellow +BBlue = '\033[1;34m' # Blue +BPurple = '\033[1;35m' # Purple +BCyan = '\033[1;36m' # Cyan +BWhite = '\033[1;37m' # White + +# Underline +UBlack = '\033[4;30m' # Black +URed = '\033[4;31m' # Red +UGreen = '\033[4;32m' # Green +UYellow = '\033[4;33m' # Yellow +UBlue = '\033[4;34m' # Blue +UPurple = '\033[4;35m' # Purple +UCyan = '\033[4;36m' # Cyan +UWhite = '\033[4;37m' # White + +# Background +On_Black = '\033[40m' # Black +On_Red = '\033[41m' # Red +On_Green = '\033[42m' # Green +On_Yellow = '\033[43m' # Yellow +On_Blue = '\033[44m' # Blue +On_Purple = '\033[45m' # Purple +On_Cyan = '\033[46m' # Cyan +On_White = '\033[47m' # White + +# High Intensity +IBlack = '\033[0;90m' # Black +IRed = '\033[0;91m' # Red +IGreen = '\033[0;92m' # Green +IYellow = '\033[0;93m' # Yellow +IBlue = '\033[0;94m' # Blue +IPurple = '\033[0;95m' # Purple +ICyan = '\033[0;96m' # Cyan +IWhite = '\033[0;97m' 
# White + +# Bold High Intensity +BIBlack = '\033[1;90m' # Black +BIRed = '\033[1;91m' # Red +BIGreen = '\033[1;92m' # Green +BIYellow = '\033[1;93m' # Yellow +BIBlue = '\033[1;94m' # Blue +BIPurple = '\033[1;95m' # Purple +BICyan = '\033[1;96m' # Cyan +BIWhite = '\033[1;97m' # White + +# High Intensity backgrounds +On_IBlack = '\033[0;100m' # Black +On_IRed = '\033[0;101m' # Red +On_IGreen = '\033[0;102m' # Green +On_IYellow = '\033[0;103m' # Yellow +On_IBlue = '\033[0;104m' # Blue +On_IPurple = '\033[0;105m' # Purple +On_ICyan = '\033[0;106m' # Cyan +On_IWhite = '\033[0;107m' # White diff --git a/shards/utils/logger_helper.py b/shards/utils/logger_helper.py index b4e3b9c5b6..11558fd8fa 100644 --- a/shards/utils/logger_helper.py +++ b/shards/utils/logger_helper.py @@ -1,8 +1,10 @@ import os import datetime +import copy from pytz import timezone from logging import Filter import logging.config +from utils import colors class InfoFilter(logging.Filter): @@ -31,29 +33,60 @@ class CriticalFilter(logging.Filter): COLORS = { - 'HEADER': '\033[95m', - 'INFO': '\033[92m', - 'DEBUG': '\033[94m', - 'WARNING': '\033[93m', - 'ERROR': '\033[95m', - 'CRITICAL': '\033[91m', - 'ENDC': '\033[0m', + 'HEADER': colors.BWhite, + 'INFO': colors.On_IWhite + colors.BBlack, + 'INFOM': colors.White, + 'DEBUG': colors.On_IBlue + colors.BWhite, + 'DEBUGM': colors.BIBlue, + 'WARNING': colors.On_IYellow + colors.BWhite, + 'WARNINGM': colors.BIYellow, + 'ERROR': colors.On_IRed + colors.BWhite, + 'ERRORM': colors.BIRed, + 'CRITICAL': colors.On_Red + colors.BWhite, + 'CRITICALM': colors.BRed, + 'ASCTIME': colors.On_Cyan + colors.BIYellow, + 'MESSAGE': colors.IGreen, + 'FILENAME': colors.BCyan, + 'LINENO': colors.BCyan, + 'THREAD': colors.BCyan, + 'ENDC': colors.Color_Off, } class ColorFulFormatColMixin: def format_col(self, message_str, level_name): if level_name in COLORS.keys(): - message_str = COLORS.get(level_name) + message_str + COLORS.get( - 'ENDC') + message_str = COLORS[level_name] + message_str + COLORS['ENDC'] return message_str + def formatTime(self, record, datefmt=None): + ret = super().formatTime(record, datefmt) + ret = COLORS['ASCTIME'] + ret + COLORS['ENDC'] + return ret -class ColorfulFormatter(logging.Formatter, ColorFulFormatColMixin): + +class ColorfulLogRecordProxy(logging.LogRecord): + def __init__(self, record): + self._record = record + msg_level = record.levelname + 'M' + self.msg = '{}{}{}'.format(COLORS[msg_level], record.msg, COLORS['ENDC']) + self.filename = COLORS['FILENAME'] + record.filename + COLORS['ENDC'] + self.lineno = '{}{}{}'.format(COLORS['LINENO'], record.lineno, COLORS['ENDC']) + self.threadName = '{}{}{}'.format(COLORS['THREAD'], record.threadName, COLORS['ENDC']) + self.levelname = COLORS[record.levelname] + record.levelname + COLORS['ENDC'] + + def __getattr__(self, attr): + if attr not in self.__dict__: + return getattr(self._record, attr) + return getattr(self, attr) + + +class ColorfulFormatter(ColorFulFormatColMixin, logging.Formatter): def format(self, record): - message_str = super(ColorfulFormatter, self).format(record) + proxy = ColorfulLogRecordProxy(record) + message_str = super().format(proxy) - return self.format_col(message_str, level_name=record.levelname) + return message_str def config(log_level, log_path, name, tz='UTC'): @@ -76,7 +109,9 @@ def config(log_level, log_path, name, tz='UTC'): 'format': '%(asctime)s | %(levelname)s | %(name)s | %(threadName)s: %(message)s (%(filename)s:%(lineno)s)', }, 'colorful_console': { - 'format': '%(asctime)s | 
%(levelname)s | %(name)s | %(threadName)s: %(message)s (%(filename)s:%(lineno)s)', + 'format': '%(asctime)s | %(levelname)s: %(message)s (%(filename)s:%(lineno)s) (%(threadName)s)', + # 'format': '%(asctime)s | %(levelname)s | %(threadName)s: %(message)s (%(filename)s:%(lineno)s)', + # 'format': '%(asctime)s | %(levelname)s | %(name)s | %(threadName)s: %(message)s (%(filename)s:%(lineno)s)', '()': ColorfulFormatter, }, }, diff --git a/tests/milvus_python_test/test_add_vectors.py b/tests/milvus_python_test/test_add_vectors.py index 7245d51ea2..7c9d9e691c 100644 --- a/tests/milvus_python_test/test_add_vectors.py +++ b/tests/milvus_python_test/test_add_vectors.py @@ -1300,7 +1300,8 @@ class TestNameInvalid(object): assert not status.OK() @pytest.mark.level(2) - def test_add_vectors_with_invalid_tag_name(self, connect, get_tag_name): + def test_add_vectors_with_invalid_tag_name(self, connect, get_table_name, get_tag_name): + table_name = get_table_name tag_name = get_tag_name vectors = gen_vectors(1, dim) status, result = connect.add_vectors(table_name, vectors, partition_tag=tag_name) diff --git a/tests/milvus_python_test/test_index.py b/tests/milvus_python_test/test_index.py index b253cf02a3..8ce03b6a61 100644 --- a/tests/milvus_python_test/test_index.py +++ b/tests/milvus_python_test/test_index.py @@ -497,6 +497,7 @@ class TestIndexBase: status, ids = connect.add_vectors(table, vectors) for i in range(2): status = connect.create_index(table, index_params) + assert status.OK() status, result = connect.describe_index(table) logging.getLogger().info(result) @@ -569,7 +570,10 @@ class TestIndexIP: logging.getLogger().info(index_params) status, ids = connect.add_vectors(ip_table, vectors) status = connect.create_index(ip_table, index_params) - assert status.OK() + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() @pytest.mark.timeout(BUILD_TIMEOUT) def test_create_index_partition(self, connect, ip_table, get_index_params): @@ -584,7 +588,10 @@ class TestIndexIP: status = connect.create_partition(ip_table, partition_name, tag) status, ids = connect.add_vectors(ip_table, vectors, partition_tag=tag) status = connect.create_index(partition_name, index_params) - assert status.OK() + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() @pytest.mark.level(2) def test_create_index_without_connect(self, dis_connect, ip_table): @@ -609,14 +616,17 @@ class TestIndexIP: logging.getLogger().info(index_params) status, ids = connect.add_vectors(ip_table, vectors) status = connect.create_index(ip_table, index_params) - assert status.OK() - logging.getLogger().info(connect.describe_index(ip_table)) - query_vecs = [vectors[0], vectors[1], vectors[2]] - top_k = 5 - status, result = connect.search_vectors(ip_table, top_k, nprobe, query_vecs) - logging.getLogger().info(result) - assert status.OK() - assert len(result) == len(query_vecs) + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + logging.getLogger().info(connect.describe_index(ip_table)) + query_vecs = [vectors[0], vectors[1], vectors[2]] + top_k = 5 + status, result = connect.search_vectors(ip_table, top_k, nprobe, query_vecs) + logging.getLogger().info(result) + assert status.OK() + assert len(result) == len(query_vecs) # TODO: enable @pytest.mark.timeout(BUILD_TIMEOUT) @@ -943,16 +953,19 @@ class TestIndexIP: index_params = get_index_params status, ids = connect.add_vectors(ip_table, vectors) 
status = connect.create_index(ip_table, index_params) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - status = connect.drop_index(ip_table) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + status = connect.drop_index(ip_table) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT def test_drop_index_partition(self, connect, ip_table, get_simple_index_params): ''' @@ -965,16 +978,19 @@ class TestIndexIP: status = connect.create_partition(ip_table, partition_name, tag) status, ids = connect.add_vectors(ip_table, vectors, partition_tag=tag) status = connect.create_index(ip_table, index_params) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - status = connect.drop_index(ip_table) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + status = connect.drop_index(ip_table) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT def test_drop_index_partition_A(self, connect, ip_table, get_simple_index_params): ''' @@ -987,19 +1003,22 @@ class TestIndexIP: status = connect.create_partition(ip_table, partition_name, tag) status, ids = connect.add_vectors(ip_table, vectors, partition_tag=tag) status = connect.create_index(partition_name, index_params) - assert status.OK() - status = connect.drop_index(ip_table) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT - status, result = connect.describe_index(partition_name) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == partition_name - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status = connect.drop_index(ip_table) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT + status, result = connect.describe_index(partition_name) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == partition_name + assert result._index_type == IndexType.FLAT def test_drop_index_partition_B(self, connect, ip_table, get_simple_index_params): ''' @@ 
-1012,19 +1031,22 @@ class TestIndexIP: status = connect.create_partition(ip_table, partition_name, tag) status, ids = connect.add_vectors(ip_table, vectors, partition_tag=tag) status = connect.create_index(partition_name, index_params) - assert status.OK() - status = connect.drop_index(partition_name) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT - status, result = connect.describe_index(partition_name) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == partition_name - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status = connect.drop_index(partition_name) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT + status, result = connect.describe_index(partition_name) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == partition_name + assert result._index_type == IndexType.FLAT def test_drop_index_partition_C(self, connect, ip_table, get_simple_index_params): ''' @@ -1040,24 +1062,27 @@ class TestIndexIP: status = connect.create_partition(ip_table, new_partition_name, new_tag) status, ids = connect.add_vectors(ip_table, vectors) status = connect.create_index(ip_table, index_params) - assert status.OK() - status = connect.drop_index(new_partition_name) - assert status.OK() - status, result = connect.describe_index(new_partition_name) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == new_partition_name - assert result._index_type == IndexType.FLAT - status, result = connect.describe_index(partition_name) - logging.getLogger().info(result) - assert result._nlist == index_params["nlist"] - assert result._table_name == partition_name - assert result._index_type == index_params["index_type"] - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == index_params["nlist"] - assert result._table_name == ip_table - assert result._index_type == index_params["index_type"] + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status = connect.drop_index(new_partition_name) + assert status.OK() + status, result = connect.describe_index(new_partition_name) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == new_partition_name + assert result._index_type == IndexType.FLAT + status, result = connect.describe_index(partition_name) + logging.getLogger().info(result) + assert result._nlist == index_params["nlist"] + assert result._table_name == partition_name + assert result._index_type == index_params["index_type"] + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == index_params["nlist"] + assert result._table_name == ip_table + assert result._index_type == index_params["index_type"] def test_drop_index_repeatly(self, connect, ip_table, get_simple_index_params): ''' @@ -1068,18 +1093,21 @@ class TestIndexIP: index_params = get_simple_index_params status, ids = connect.add_vectors(ip_table, vectors) status 
= connect.create_index(ip_table, index_params) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - status = connect.drop_index(ip_table) - assert status.OK() - status = connect.drop_index(ip_table) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + status = connect.drop_index(ip_table) + assert status.OK() + status = connect.drop_index(ip_table) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT @pytest.mark.level(2) def test_drop_index_without_connect(self, dis_connect, ip_table): @@ -1120,16 +1148,19 @@ class TestIndexIP: status, ids = connect.add_vectors(ip_table, vectors) for i in range(2): status = connect.create_index(ip_table, index_params) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - status = connect.drop_index(ip_table) - assert status.OK() - status, result = connect.describe_index(ip_table) - logging.getLogger().info(result) - assert result._nlist == 16384 - assert result._table_name == ip_table - assert result._index_type == IndexType.FLAT + if index_params["index_type"] == IndexType.IVF_PQ: + assert not status.OK() + else: + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + status = connect.drop_index(ip_table) + assert status.OK() + status, result = connect.describe_index(ip_table) + logging.getLogger().info(result) + assert result._nlist == 16384 + assert result._table_name == ip_table + assert result._index_type == IndexType.FLAT def test_create_drop_index_repeatly_different_index_params(self, connect, ip_table): ''' diff --git a/tests/milvus_python_test/test_search_vectors.py b/tests/milvus_python_test/test_search_vectors.py index e0b1bc09ea..7aebc78e31 100644 --- a/tests/milvus_python_test/test_search_vectors.py +++ b/tests/milvus_python_test/test_search_vectors.py @@ -266,6 +266,37 @@ class TestSearchBase: assert check_result(result[1], new_ids[0]) assert result[1][0].distance <= epsilon + def test_search_l2_index_params_partition_F(self, connect, table, get_simple_index_params): + ''' + target: test basic search fuction, all the search params is corrent, test all index params, and build + method: search table with the given vectors and tags with "re" expr, check the result + expected: search status ok, and the length of the result is top_k + ''' + tag = "atag" + new_tag = "new_tag" + index_params = get_simple_index_params + logging.getLogger().info(index_params) + partition_name = gen_unique_str() + new_partition_name = gen_unique_str() + status = connect.create_partition(table, partition_name, tag) + status = connect.create_partition(table, new_partition_name, new_tag) + vectors, ids = self.init_data(connect, partition_name) + new_vectors, new_ids = self.init_data(connect, new_partition_name, nb=1000) + status = connect.create_index(table, index_params) + query_vec = [vectors[0], new_vectors[0]] + top_k = 10 + nprobe = 1 + status, result = 
connect.search_vectors(table, top_k, nprobe, query_vec, partition_tags=["new(.*)"]) + logging.getLogger().info(result) + assert status.OK() + assert result[0][0].distance > epsilon + assert result[1][0].distance <= epsilon + status, result = connect.search_vectors(table, top_k, nprobe, query_vec, partition_tags=["(.*)tag"]) + logging.getLogger().info(result) + assert status.OK() + assert result[0][0].distance <= epsilon + assert result[1][0].distance <= epsilon + def test_search_ip_index_params(self, connect, ip_table, get_index_params): ''' target: test basic search fuction, all the search params is corrent, test all index params, and build diff --git a/tests/milvus_python_test/utils.py b/tests/milvus_python_test/utils.py index 1686ad7129..e591521815 100644 --- a/tests/milvus_python_test/utils.py +++ b/tests/milvus_python_test/utils.py @@ -69,7 +69,7 @@ def gen_invalid_ips(): "\n", "\t", "中文", - "a".join("a" for i in range(256)) + "a".join("a" for _ in range(256)) ] return ips @@ -116,7 +116,7 @@ def gen_invalid_uris(): "tcp:// :%s" % port, # "tcp://123.0.0.1:%s" % port, "tcp://127.0.0:%s" % port, - "tcp://255.0.0.0:%s" % port, + # "tcp://255.0.0.0:%s" % port, # "tcp://255.255.0.0:%s" % port, # "tcp://255.255.255.0:%s" % port, # "tcp://255.255.255.255:%s" % port, @@ -437,7 +437,7 @@ def gen_invalid_index_params(): def gen_index_params(): index_params = [] - index_types = [IndexType.FLAT, IndexType.IVFLAT, IndexType.IVF_SQ8, IndexType.IVF_SQ8H] + index_types = [IndexType.FLAT, IndexType.IVFLAT, IndexType.IVF_SQ8, IndexType.IVF_SQ8H, IndexType.IVF_PQ] nlists = [1, 16384, 50000] def gen_params(index_types, nlists): @@ -450,7 +450,7 @@ def gen_index_params(): def gen_simple_index_params(): index_params = [] - index_types = [IndexType.FLAT, IndexType.IVFLAT, IndexType.IVF_SQ8, IndexType.IVF_SQ8H] + index_types = [IndexType.FLAT, IndexType.IVFLAT, IndexType.IVF_SQ8, IndexType.IVF_SQ8H, IndexType.IVF_PQ] nlists = [1024] def gen_params(index_types, nlists):