[WIP] Compressed posting lists
xzfc committed May 16, 2024
1 parent ccf7f1d commit 3ae3667
Showing 15 changed files with 1,536 additions and 132 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 1 addition & 2 deletions lib/segment/Cargo.toml
@@ -12,13 +12,12 @@ edition = "2021"
multiling-chinese = ["charabia/chinese"]
multiling-japanese = ["charabia/japanese"]
multiling-korean = ["charabia/korean"]
testing = []
testing = ["common/testing"]

[build-dependencies]
cc = "1.0"

[dev-dependencies]
common = { path = "../common/common", features = ["testing"] }
criterion = "0.5"
dataset = { path = "../common/dataset" }
indicatif = { workspace = true }
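Note: in lib/segment, the `testing` feature now forwards to `common/testing`, so the dev-dependency entry on `common` (which appears to have existed only to enable that feature) is dropped.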
1 change: 1 addition & 0 deletions lib/sparse/Cargo.toml
@@ -13,6 +13,7 @@ testing = []

[dependencies]
atomicwrites = "0.4.3"
bitpacking = "0.9.2"
common = { path = "../common/common" }
io = { path = "../common/io" }
memory = { path = "../common/memory" }
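The new `bitpacking` dependency presumably supplies the compression for the new posting lists: block-wise bit packing of integer ids. The posting-list code itself is not part of this file, so the following is only a minimal sketch of the crate's `BitPacker4x` API on one sorted block of 128 ids (delta-encoded via the `*_sorted` methods); the function names are illustrative, not from this commit.

use bitpacking::{BitPacker, BitPacker4x};

// Sketch: compress one sorted block of BitPacker4x::BLOCK_LEN (= 128) posting ids.
// `initial` is the last id of the previous block (0 for the first block).
fn compress_sorted_block(initial: u32, ids: &[u32]) -> (u8, Vec<u8>) {
    assert_eq!(ids.len(), BitPacker4x::BLOCK_LEN);
    let packer = BitPacker4x::new();
    // Bit width needed per delta between consecutive ids within this block.
    let num_bits = packer.num_bits_sorted(initial, ids);
    // Worst case 4 bytes per id; `compress_sorted` returns the bytes actually written.
    let mut packed = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
    let written = packer.compress_sorted(initial, ids, &mut packed, num_bits);
    packed.truncate(written);
    (num_bits, packed)
}

// Sketch: the inverse operation for a single block.
fn decompress_sorted_block(initial: u32, num_bits: u8, packed: &[u8]) -> Vec<u32> {
    let packer = BitPacker4x::new();
    let mut ids = vec![0u32; BitPacker4x::BLOCK_LEN];
    packer.decompress_sorted(initial, packed, &mut ids, num_bits);
    ids
}

A posting list would then be stored as a sequence of such blocks plus, per block, its first id and bit width; how this commit actually lays that out lives in the lib/sparse sources, not in this benchmark file.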
108 changes: 66 additions & 42 deletions lib/sparse/benches/search.rs
@@ -12,8 +12,10 @@ use rand::SeedableRng as _;
use sparse::common::scores_memory_pool::ScoresMemoryPool;
use sparse::common::sparse_vector::SparseVector;
use sparse::common::sparse_vector_fixture::{random_positive_sparse_vector, random_sparse_vector};
use sparse::index::inverted_index::inverted_index_compressed_immutable_ram::InvertedIndexImmutableRam;
use sparse::index::inverted_index::inverted_index_ram::InvertedIndexRam;
use sparse::index::inverted_index::inverted_index_ram_builder::InvertedIndexBuilder;
use sparse::index::inverted_index::InvertedIndex as _;
use sparse::index::loaders::{self, Csr};
use sparse::index::search_context::SearchContext;
mod prof;
@@ -26,14 +28,17 @@ pub fn bench_search(c: &mut Criterion) {
bench_uniform_random(c, "random-50k", 50_000);
bench_uniform_random(c, "random-500k", 500_000);

let query_vectors =
loaders::load_csr_vecs(Dataset::NeurIps2023Queries.download().unwrap()).unwrap();
{
let query_vectors =
loaders::load_csr_vecs(Dataset::NeurIps2023Queries.download().unwrap()).unwrap();

let index_1m = load_csr_index(Dataset::NeurIps2023_1M.download().unwrap(), 1.0).unwrap();
run_bench(c, "neurips2023-1M", index_1m, query_vectors.clone());
let index_1m = load_csr_index(Dataset::NeurIps2023_1M.download().unwrap(), 1.0).unwrap();
run_bench(c, "neurips2023-1M", index_1m, &query_vectors);

let index_full = load_csr_index(Dataset::NeurIps2023Full.download().unwrap(), 0.25).unwrap();
run_bench(c, "neurips2023-full-25pct", index_full, query_vectors);
let index_full =
load_csr_index(Dataset::NeurIps2023Full.download().unwrap(), 0.25).unwrap();
run_bench(c, "neurips2023-full-25pct", index_full, &query_vectors);
}

bench_movies(c);
}
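The NeurIPS setup is now scoped in its own block so the downloaded query vectors and indices are dropped before `bench_movies` runs, and `run_bench` takes `&[SparseVector]` rather than an owned `Vec`, so the same query set can be passed to several benchmark runs without cloning it.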
@@ -52,7 +57,7 @@ fn bench_uniform_random(c: &mut Criterion, name: &str, num_vectors: usize) {
.map(|_| random_positive_sparse_vector(&mut rnd, MAX_SPARSE_DIM))
.collect::<Vec<_>>();

run_bench(c, name, index, query_vectors);
run_bench(c, name, index, &query_vectors);
}

pub fn bench_movies(c: &mut Criterion) {
@@ -70,29 +75,18 @@ pub fn bench_movies(c: &mut Criterion) {
.map(|(idx, vec)| (idx as PointOffsetType, vec.unwrap().into_remapped())),
);

run_bench(c, "movies", index, query_vectors);
run_bench(c, "movies", index, &query_vectors);
}

pub fn run_bench(
c: &mut Criterion,
name: &str,
index: InvertedIndexRam,
mut query_vectors: Vec<SparseVector>,
query_vectors: &[SparseVector],
) {
let pool = ScoresMemoryPool::new();
let stopped = AtomicBool::new(false);

let mut group = c.benchmark_group(format!("search/{}", name));

let mut it = query_vectors.iter().cycle();
group.bench_function("basic", |b| {
b.iter_batched(
|| it.next().unwrap().clone().into_remapped(),
|vec| SearchContext::new(vec, TOP, &index, pool.get(), &stopped).search(&|_| true),
criterion::BatchSize::SmallInput,
)
});

let hottest_id = index
.postings
.iter()
@@ -114,32 +108,62 @@ pub fn run_bench(
index.postings[hottest_id as usize].elements.len(),
);

for vec in &mut query_vectors {
vec.indices.truncate(4);
vec.values.truncate(4);
if let Err(idx) = vec.indices.binary_search(&hottest_id) {
if idx < vec.indices.len() {
vec.indices[idx] = hottest_id;
vec.values[idx] = 1.0;
} else {
vec.indices.push(hottest_id);
vec.values.push(1.0);
let hottest_query_vectors = query_vectors
.iter()
.cloned()
.map(|mut vec| {
vec.indices.truncate(4);
vec.values.truncate(4);
if let Err(idx) = vec.indices.binary_search(&hottest_id) {
if idx < vec.indices.len() {
vec.indices[idx] = hottest_id;
vec.values[idx] = 1.0;
} else {
vec.indices.push(hottest_id);
vec.values.push(1.0);
}
}
}
}
vec.into_remapped()
})
.collect::<Vec<_>>();

let mut group = c.benchmark_group(format!("search/{}", name));

let mut it = query_vectors.iter().cycle();
group.bench_function("basic", |b| {
b.iter_batched(
|| it.next().unwrap().clone().into_remapped(),
|vec| SearchContext::new(vec, TOP, &index, pool.get(), &stopped).search(&|_| true),
criterion::BatchSize::SmallInput,
)
});

let mut it = hottest_query_vectors.iter().cycle();
group.bench_function("hottest", |b| {
b.iter(|| {
SearchContext::new(
it.next().unwrap().clone().into_remapped(),
TOP,
&index,
pool.get(),
&stopped,
)
.search(&|_| true)
})
b.iter_batched(
|| it.next().unwrap().clone(),
|vec| SearchContext::new(vec, TOP, &index, pool.get(), &stopped).search(&|_| true),
criterion::BatchSize::SmallInput,
)
});

let index = InvertedIndexImmutableRam::from_ram_index(index, "nonexistent/path").unwrap();
let mut it = query_vectors.iter().cycle();
group.bench_function("basic_immutable", |b| {
b.iter_batched(
|| it.next().unwrap().clone().into_remapped(),
|vec| SearchContext::new(vec, TOP, &index, pool.get(), &stopped).search(&|_| true),
criterion::BatchSize::SmallInput,
)
});

let mut it = hottest_query_vectors.iter().cycle();
group.bench_function("hottest_immutable", |b| {
b.iter_batched(
|| it.next().unwrap().clone(),
|vec| SearchContext::new(vec, TOP, &index, pool.get(), &stopped).search(&|_| true),
criterion::BatchSize::SmallInput,
)
});
}
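In `run_bench`, the old in-place mutation of `query_vectors` is replaced by a derived `hottest_query_vectors` set (each query truncated to 4 dimensions and forced to contain `hottest_id`, the id with the longest posting list), so the original queries stay usable for the other cases. The "hottest" case also switches from `iter` to `iter_batched`, so cloning the next query happens in the setup closure and stays out of the measured routine, and the new `basic_immutable` / `hottest_immutable` cases repeat the same pattern against the compressed index built by `from_ram_index`. A standalone sketch of that Criterion `iter_batched` pattern, with purely illustrative names:

use criterion::{BatchSize, Criterion};

// Sketch of the iter_batched pattern used above: the setup closure (the clone) is
// not timed, only the routine closure is.
fn bench_example(c: &mut Criterion) {
    let queries: Vec<Vec<u32>> = vec![vec![1, 2, 3]; 16];
    let mut it = queries.iter().cycle();
    c.bench_function("example", |b| {
        b.iter_batched(
            // Setup: pick and clone the next query (excluded from the measurement).
            || it.next().unwrap().clone(),
            // Routine: the only part that is timed.
            |q| q.iter().sum::<u32>(),
            BatchSize::SmallInput,
        )
    });
}

criterion::criterion_group!(benches, bench_example);
criterion::criterion_main!(benches);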

