{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Autofaiss getting started"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ggE42ZFYxow6"
   },
   "source": [
    "## Information\n",
    "\n",
    "**This demo notebook automatically creates a Faiss KNN index with optimal similarity search parameters.**\n",
    "\n",
    "It selects the indexing parameters that achieve the highest recall under the given memory and query speed constraints.\n",
    "\n",
    "GitHub: https://github.com/criteo/autofaiss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install autofaiss into the environment of the running kernel.\n",
    "# -q keeps the log short while still showing installation errors.\n",
    "%pip install -q autofaiss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8g-fxpx7187-"
   },
   "source": [
    "## Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "sqvufPhwySsS"
   },
   "outputs": [],
   "source": [
    "#@title Index parameters\n",
    "\n",
    "# Bound on the query time of the index, in milliseconds.\n",
    "max_index_query_time_ms = 10 #@param {type: \"number\"}\n",
    "# Maximum size allowed for the index (e.g. \"10MB\").\n",
    "max_index_memory_usage = \"10MB\" #@param\n",
    "# Similarity function: \"ip\" (inner product) or \"l2\" (Euclidean distance).\n",
    "metric_type = \"l2\" #@param ['ip', 'l2']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "j6Rg7pm82Fml"
   },
   "source": [
    "## Embeddings creation (add your own embeddings here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vRVmva4nEOGg"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Create embeddings: a float32 array of shape (n_vectors, dimension).\n",
    "# A fixed seed keeps the demo reproducible from one run to the next.\n",
    "rng = np.random.default_rng(seed=0)\n",
    "embeddings = rng.random((4000, 100), dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2lbqqSxFtI6n"
   },
   "source": [
    "## Save your embeddings on the disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NUje1UcmF3-7"
   },
   "outputs": [],
   "source": [
    "# Create a new, empty folder (any previous content is deleted)\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "embeddings_dir = \"embeddings_folder\"\n",
    "if os.path.exists(embeddings_dir):\n",
    "    shutil.rmtree(embeddings_dir)\n",
    "os.makedirs(embeddings_dir)\n",
    "\n",
    "# Save your embeddings\n",
    "# You can split your embeddings into several parts if they are too big\n",
    "# The data will be read in the lexicographical order of the filenames\n",
    "half = len(embeddings) // 2  # split point adapts to the number of vectors\n",
    "np.save(f\"{embeddings_dir}/part1.npy\", embeddings[:half])\n",
    "np.save(f\"{embeddings_dir}/part2.npy\", embeddings[half:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "L7h7hl6VubLt"
   },
   "source": [
    "## Build the KNN index with Autofaiss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9q0LDFu0p2-g"
   },
   "outputs": [],
   "source": [
    "# Output locations, defined once and shared by the build and load cells below\n",
    "index_dir = \"my_index_folder\"\n",
    "index_path = f\"{index_dir}/knn.index\"\n",
    "index_infos_path = f\"{index_dir}/infos.json\"\n",
    "os.makedirs(index_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "_vIqMP8zHTHO",
    "outputId": "0d6a830e-a33b-4595-d4b1-d5a5048c408e"
   },
   "outputs": [],
   "source": [
    "# Build a KNN index with the command line interface.\n",
    "# Every {name} below is replaced by the value of the Python variable `name`.\n",
    "!autofaiss build_index --embeddings=\"{embeddings_dir}\" \\\n",
    "    --index_path=\"{index_path}\" \\\n",
    "    --index_infos_path=\"{index_infos_path}\" \\\n",
    "    --metric_type={metric_type} \\\n",
    "    --max_index_query_time_ms={max_index_query_time_ms} \\\n",
    "    --max_index_memory_usage={max_index_memory_usage}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Uk5PrzhRujjL"
   },
   "source": [
    "## Load the index and play with it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "qPJg6O6Fsq7R",
    "outputId": "42e46393-e505-46be-b015-51a1797d471e"
   },
   "outputs": [],
   "source": [
    "import faiss\n",
    "import numpy as np\n",
    "\n",
    "my_index = faiss.read_index(index_path)\n",
    "\n",
    "# The query must have the same dimension as the indexed vectors (my_index.d)\n",
    "query_rng = np.random.default_rng(seed=1)\n",
    "query_vector = query_rng.random((1, my_index.d), dtype=np.float32)\n",
    "k = 5\n",
    "distances, indices = my_index.search(query_vector, k)\n",
    "\n",
    "# Label the results according to the metric the index was built with\n",
    "if my_index.metric_type == faiss.METRIC_INNER_PRODUCT:\n",
    "    search_name, score_name = \"max inner product\", \"inner product\"\n",
    "else:\n",
    "    search_name, score_name = \"L2 nearest neighbor\", \"distance\"\n",
    "\n",
    "print(f\"Top {k} elements in the dataset for {search_name} search:\")\n",
    "for rank, (score, vector_id) in enumerate(zip(distances[0], indices[0]), start=1):\n",
    "    print(f\"{rank}: Vector number {vector_id:4} with {score_name} {score}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0cnt0_vAuw5z"
   },
   "source": [
    "## (Bonus) Python version of the CLI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 698
    },
    "id": "DXBVQpMXt3Y6",
    "outputId": "efb3326f-4b43-4e9f-8c25-31b5d1021fcd"
   },
   "outputs": [],
   "source": [
    "from autofaiss import build_index\n",
    "\n",
    "# Same build as the CLI cell above, driven by the same notebook variables\n",
    "build_index(embeddings=embeddings_dir,\n",
    "            index_path=index_path,\n",
    "            index_infos_path=index_infos_path,\n",
    "            max_index_query_time_ms=max_index_query_time_ms,\n",
    "            max_index_memory_usage=max_index_memory_usage,\n",
    "            metric_type=metric_type)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Autofaiss getting started.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3.8.11 64-bit ('3.8.11')",
   "name": "python381164bit381120cd66ba50c9492bb87593841dca4296"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.11-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}