Skip to content

Commit

Permalink
Merge #4633
Browse files Browse the repository at this point in the history
4633: Allow to mark vectors as "userProvided" r=Kerollmops a=dureuill

# Pull Request

## Related issue
Fixes #4606 

## What does this PR do?

[See usage in PRD](https://meilisearch.notion.site/v1-9-AI-search-changes-e90d6803eca8417aa70a1ac5d0225697#deb96fb0595947bda7d4a371100326eb)

- Extends the shape of the special `_vectors` field in documents.
    - previously, the `_vectors` field had to be an object, with each field the name of a configured embedder, and each value either `null`, an embedding (array of numbers), or an array of embeddings.
    - In this PR, the value of an embedder in the `_vectors` field can additionally be an object. The object has two fields:
      1. `embeddings`: `null`, an embedding (array of numbers), or an array of embeddings.
      2. `userProvided`: a boolean indicating if the vector was provided by the user.
    - The previous form `embedder_or_array_of_embedders` is semantically equivalent to:
    ```json
    {
        "embeddings": embedder_or_array_of_embedders,
        "userProvided": true
    }
    ```
- During the indexing step, the subfields and values of the `_vectors` field that have `userProvided` set to **false** are added in the vector DB, but not in the documents DB: that means that future modifications of the documents will trigger a regeneration of that particular vector using the document template.
- This allows **importing** embeddings as a one-shot process, while still retaining the ability to regenerate embeddings on document change.
- The dump process now uses this ability: it enriches the `_vectors` fields of documents with the embeddings that were autogenerated, marking them as not `userProvided`. This allows importing the vectors from a dump without regenerating them.

### Tests

This PR adds the following tests

- Long-needed hybrid search tests of a simple hf embedder
- Dump test that imports vectors. Due to the difficulty of actually importing a dump in tests, we just read the dump and check it contains the expected content.
- Tests in the index-scheduler: this tests that documents containing the same kind of instructions as in the dump indexes as expected


Co-authored-by: Louis Dureuil <[email protected]>
  • Loading branch information
meili-bors[bot] and dureuill committed May 22, 2024
2 parents abe2977 + 8a941c0 commit 6d2b3ca
Show file tree
Hide file tree
Showing 32 changed files with 4,579 additions and 295 deletions.
134 changes: 134 additions & 0 deletions dump/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,140 @@ pub(crate) mod test {
use super::*;
use crate::reader::v6::RuntimeTogglableFeatures;

#[test]
fn import_dump_v6_with_vectors() {
// dump containing two indexes
//
// "vector", configured with an embedder
// contains:
// - one document with an overriden vector,
// - one document with a natural vector
// - one document with a _vectors map containing one additional embedder name and a natural vector
// - one document with a _vectors map containing one additional embedder name and an overriden vector
//
// "novector", no embedder
// contains:
// - a document without vector
// - a document with a random _vectors field
let dump = File::open("tests/assets/v6-with-vectors.dump").unwrap();
let mut dump = DumpReader::open(dump).unwrap();

// top level infos
insta::assert_display_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00");
insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None");

// tasks
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
meili_snap::snapshot_hash!(meili_snap::json_string!(tasks), @"278f63325ef06ca04d01df98d8207b94");
assert_eq!(update_files.len(), 10);
assert!(update_files[0].is_none()); // the dump creation
assert!(update_files[1].is_none());
assert!(update_files[2].is_none());
assert!(update_files[3].is_none());
assert!(update_files[4].is_none());
assert!(update_files[5].is_none());
assert!(update_files[6].is_none());
assert!(update_files[7].is_none());
assert!(update_files[8].is_none());
assert!(update_files[9].is_none());

// indexes
let mut indexes = dump.indexes().unwrap().collect::<Result<Vec<_>>>().unwrap();
// the index are not ordered in any way by default
indexes.sort_by_key(|index| index.metadata().uid.to_string());

let mut vector_index = indexes.pop().unwrap();
let mut novector_index = indexes.pop().unwrap();
assert!(indexes.is_empty());

// vector

insta::assert_json_snapshot!(vector_index.metadata(), @r###"
{
"uid": "vector",
"primaryKey": "id",
"createdAt": "2024-05-16T15:33:17.240962Z",
"updatedAt": "2024-05-16T15:40:55.723052Z"
}
"###);

{
let documents: Result<Vec<_>> = vector_index.documents().unwrap().collect();
let mut documents = documents.unwrap();
assert_eq!(documents.len(), 4);

documents.sort_by_key(|doc| doc.get("id").unwrap().to_string());

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document);
}

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document);
}

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document);
}

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document);
}
}

// novector

insta::assert_json_snapshot!(novector_index.metadata(), @r###"
{
"uid": "novector",
"primaryKey": "id",
"createdAt": "2024-05-16T15:33:03.568055Z",
"updatedAt": "2024-05-16T15:33:07.530217Z"
}
"###);

insta::assert_json_snapshot!(novector_index.settings().unwrap().embedders, @"null");

{
let documents: Result<Vec<_>> = novector_index.documents().unwrap().collect();
let mut documents = documents.unwrap();
assert_eq!(documents.len(), 2);

documents.sort_by_key(|doc| doc.get("id").unwrap().to_string());

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document, @r###"
{
"id": "e1",
"other": "random1",
"_vectors": "toto"
}
"###);
}

{
let document = documents.pop().unwrap();
insta::assert_json_snapshot!(document, @r###"
{
"id": "e0",
"other": "random0"
}
"###);
}
}

assert_eq!(
dump.features().unwrap().unwrap(),
RuntimeTogglableFeatures { vector_store: true, ..Default::default() }
);
}

#[test]
fn import_dump_v6_experimental() {
let dump = File::open("tests/assets/v6-with-experimental.dump").unwrap();
Expand Down

0 comments on commit 6d2b3ca

Please sign in to comment.