Skip to content

Commit

Permalink
[WIP] Compressed posting lists
Browse files Browse the repository at this point in the history
  • Loading branch information
xzfc committed May 7, 2024
1 parent 6f738ca commit 8e4c200
Show file tree
Hide file tree
Showing 15 changed files with 1,163 additions and 108 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ codegen-units = 256 # restore default value for faster compilation
inherits = "release"
lto = false
opt-level = 3
codegen-units = 16

[patch.crates-io]
# Temporary patch until <https://github.com/hyperium/tonic/pull/1401> is merged
Expand Down
2 changes: 2 additions & 0 deletions lib/segment/benches/sparse_index_build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ fn sparse_vector_index_build_benchmark(c: &mut Criterion) {
sparse_vector_index.build_index(permit, &stopped).unwrap();

// intent: measure mmap conversion time
/* XXX(xzfc): mmap disabled for now
group.bench_function("convert-mmap-index", |b| {
b.iter(|| {
let mmap_index_dir = Builder::new().prefix("mmap_index_dir").tempdir().unwrap();
Expand All @@ -116,6 +117,7 @@ fn sparse_vector_index_build_benchmark(c: &mut Criterion) {
assert_eq!(mmap_inverted_index.vector_count(), NUM_VECTORS);
})
});
*/

group.finish();
}
Expand Down
2 changes: 1 addition & 1 deletion lib/segment/src/index/sparse_index/sparse_vector_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ impl<TInvertedIndex: InvertedIndex> SparseVectorIndex<TInvertedIndex> {
for dim_id in query_vector.indices.iter() {
if let Some(dim_id) = self.indices_tracker.remap_index(*dim_id) {
if let Some(posting_list) = self.inverted_index.get(&dim_id) {
for element in posting_list.elements.iter() {
for element in posting_list {
unique_record_ids.insert(element.record_id);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::sync::Arc;

use common::cpu::CpuPermit;
use common::types::{PointOffsetType, TelemetryDetail};
use itertools::Itertools;
use rand::rngs::StdRng;
use rand::SeedableRng;
use segment::common::operation_error::OperationResult;
Expand Down Expand Up @@ -164,13 +165,12 @@ fn check_index_storage_consistency<T: InvertedIndex>(sparse_vector_index: &Spars
let posting_list = sparse_vector_index.inverted_index.get(dim_id).unwrap();
// assert posting list sorted by record id
assert!(posting_list
.elements
.windows(2)
.all(|w| w[0].record_id < w[1].record_id));
.clone()
.tuple_windows()
.all(|(w0, w1)| w0.record_id < w1.record_id));
// assert posting list contains record id
assert!(posting_list
.elements
.iter()
.clone()
.any(|e| e.record_id == id && e.weight == *dim_value));
}
// check the vector can be found via search using large top
Expand Down
2 changes: 2 additions & 0 deletions lib/sparse/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ edition = "2021"
testing = []

[dependencies]
bitpacking = "0.9.2"
common = { path = "../common/common" }
io = { path = "../common/io" }
memory = { path = "../common/memory" }
Expand All @@ -24,6 +25,7 @@ ordered-float = "4.2"
rand = "0.8.5"
validator = { workspace = true }
itertools = "0.12.1"
log = "0.4"
parking_lot = "0.12.2"

[dev-dependencies]
Expand Down
2 changes: 2 additions & 0 deletions lib/sparse/benches/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ pub fn run_bench(
)
});

/*
let hottest_id = index
.postings
.iter()
Expand Down Expand Up @@ -141,6 +142,7 @@ pub fn run_bench(
.search(&|_| true)
})
});
*/
}

fn load_csr_index(path: impl AsRef<Path>, ratio: f32) -> io::Result<InvertedIndexRam> {
Expand Down
45 changes: 30 additions & 15 deletions lib/sparse/src/index/inverted_index/inverted_index_mmap.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![allow(unused_imports, unused_variables)] // XXX(xzfc): mmap disabled for now

use std::mem::size_of;
use std::path::{Path, PathBuf};
use std::sync::Arc;
Expand All @@ -16,7 +18,7 @@ use crate::common::sparse_vector::RemappedSparseVector;
use crate::common::types::{DimId, DimOffset};
use crate::index::inverted_index::inverted_index_ram::InvertedIndexRam;
use crate::index::inverted_index::InvertedIndex;
use crate::index::posting_list::{PostingElement, PostingListIterator};
use crate::index::posting_list2::PostingListIterator;

const POSTING_HEADER_SIZE: usize = size_of::<PostingListFileHeader>();
const INDEX_FILE_NAME: &str = "inverted_index.data";
Expand All @@ -26,6 +28,8 @@ const INDEX_CONFIG_FILE_NAME: &str = "inverted_index_config.json";
/// Metadata header of an inverted index: counts of posting lists and indexed vectors, plus a version number.
pub struct InvertedIndexFileHeader {
pub posting_count: usize, // number of posting lists
pub vector_count: usize, // number of unique vectors indexed
// `serde(default)` fills in `usize::default()` (0) when the field is missing from the input.
// NOTE(review): presumably so headers written before this field existed still load — confirm.
#[serde(default)]
pub version: usize,
}

/// Inverted flatten index from dimension id to posting list
Expand All @@ -36,14 +40,15 @@ pub struct InvertedIndexMmap {
}

#[derive(Debug, Default, Clone)]
struct PostingListFileHeader {
pub struct PostingListFileHeader {
pub start_offset: u64,
pub end_offset: u64,
}

impl InvertedIndex for InvertedIndexMmap {
fn open(path: &Path) -> std::io::Result<Self> {
Self::load(path)
// Self::load(path)
unimplemented!()
}

fn save(&self, path: &Path) -> std::io::Result<()> {
Expand All @@ -52,15 +57,17 @@ impl InvertedIndex for InvertedIndexMmap {
}

fn get(&self, id: &DimId) -> Option<PostingListIterator> {
self.get(id).map(PostingListIterator::new)
// self.get(id).map(PostingListIterator::new)
unimplemented!()
}

fn len(&self) -> usize {
self.file_header.posting_count
}

fn posting_list_len(&self, id: &DimOffset) -> Option<usize> {
self.get(id).map(|posting_list| posting_list.len())
// self.get(id).map(|posting_list| posting_list.len())
unimplemented!()
}

fn files(path: &Path) -> Vec<PathBuf> {
Expand All @@ -78,18 +85,21 @@ impl InvertedIndex for InvertedIndexMmap {
ram_index: InvertedIndexRam,
path: P,
) -> std::io::Result<Self> {
Self::convert_and_save(&ram_index, path)
// Self::convert_and_save(&ram_index, path)
unimplemented!()
}

fn vector_count(&self) -> usize {
self.file_header.vector_count
// self.file_header.vector_count
unimplemented!()
}

fn max_index(&self) -> Option<DimId> {
match self.file_header.posting_count {
0 => None,
len => Some(len as DimId - 1),
}
// match self.file_header.posting_count {
// 0 => None,
// len => Some(len as DimId - 1),
// }
unimplemented!()
}
}

Expand All @@ -102,6 +112,7 @@ impl InvertedIndexMmap {
path.join(INDEX_CONFIG_FILE_NAME)
}

/*
pub fn get(&self, id: &DimId) -> Option<&[PostingElement]> {
// check that the id is not out of bounds (posting_count includes the empty zeroth entry)
if *id >= self.file_header.posting_count as DimId {
Expand Down Expand Up @@ -179,7 +190,7 @@ impl InvertedIndexMmap {
fn total_posting_elements_size(inverted_index_ram: &InvertedIndexRam) -> usize {
let mut total_posting_elements_size = 0;
for posting in &inverted_index_ram.postings {
total_posting_elements_size += posting.elements.len() * size_of::<PostingElement>();
total_posting_elements_size += posting.len() * size_of::<PostingElement>();
}
total_posting_elements_size
Expand All @@ -192,7 +203,7 @@ impl InvertedIndexMmap {
) {
let mut elements_offset: usize = total_posting_headers_size;
for (id, posting) in inverted_index_ram.postings.iter().enumerate() {
let posting_elements_size = posting.elements.len() * size_of::<PostingElement>();
let posting_elements_size = posting.len() * size_of::<PostingElement>();
let posting_header = PostingListFileHeader {
start_offset: elements_offset as u64,
end_offset: (elements_offset + posting_elements_size) as u64,
Expand All @@ -215,14 +226,17 @@ impl InvertedIndexMmap {
let mut offset = total_posting_headers_size;
for posting in &inverted_index_ram.postings {
// save posting element
let posting_elements_bytes = transmute_to_u8_slice(&posting.elements);
let elements = posting.to_vec(); // TODO(xzfc): avoid copy
let posting_elements_bytes = transmute_to_u8_slice(&elements);
mmap[offset..offset + posting_elements_bytes.len()]
.copy_from_slice(posting_elements_bytes);
offset += posting_elements_bytes.len();
}
}
*/
}

/*
#[cfg(test)]
mod tests {
use tempfile::Builder;
Expand All @@ -235,7 +249,7 @@ mod tests {
inverted_index_mmap: &InvertedIndexMmap,
) {
for id in 0..inverted_index_ram.postings.len() as DimId {
let posting_list_ram = inverted_index_ram.get(&id).unwrap().elements.as_slice();
let posting_list_ram = inverted_index_ram.get(&id).unwrap().to_vec();
let posting_list_mmap = inverted_index_mmap.get(&id).unwrap();
assert_eq!(posting_list_ram.len(), posting_list_mmap.len());
for i in 0..posting_list_ram.len() {
Expand Down Expand Up @@ -286,3 +300,4 @@ mod tests {
assert!(inverted_index_mmap.get(&100).is_none());
}
}
*/

0 comments on commit 8e4c200

Please sign in to comment.