-
Notifications
You must be signed in to change notification settings - Fork 180
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: row id index structures (experimental) (#2303)
These are experimental indices to map from stable row ids to row addresses. It's possible there are some improvements to serialization format or performance we will make before stabilizing, but I'd like to defer that work so we can unblock work with the stable row ids. These row id indices are optimized for storage size (in-memory and on-disk) and access speed. Closes: #2308
- Loading branch information
Showing
13 changed files
with
2,114 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
syntax = "proto3"; | ||
|
||
package lance.table; | ||
// TODO: what would it take to store this in a LanceV2 file? | ||
// Or would flatbuffers be better for this? | ||
|
||
/// A sequence of row IDs. This is split up into one or more segments, | ||
/// each of which can be encoded in different ways. The encodings are optimized | ||
/// for values that are sorted, which will often be the case with row ids. | ||
/// They also have optimized forms depending on how sparse the values are. | ||
message RowIdSequence { | ||
/// The segments making up this sequence. Each segment independently picks | ||
/// one of the encodings defined by U64Segment. | ||
/// NOTE(review): presumably the logical sequence is the concatenation of the | ||
/// segments in field order -- confirm against the reader implementation. | ||
repeated U64Segment segments = 1; | ||
} | ||
|
||
/// Different ways to encode a sequence of u64 values. | ||
/// Exactly which encoding is used is recorded by the `segment` oneof below. | ||
message U64Segment { | ||
/// A range of u64 values. | ||
message Range { | ||
/// The start of the range, inclusive. | ||
uint64 start = 1; | ||
/// The end of the range, exclusive. | ||
uint64 end = 2; | ||
} | ||
|
||
/// A range of u64 values with holes. | ||
message RangeWithHoles { | ||
/// The start of the range, inclusive. | ||
uint64 start = 1; | ||
/// The end of the range, exclusive. | ||
uint64 end = 2; | ||
/// The holes in the range, as a sorted array of values; | ||
/// Binary search can be used to check whether a value is a hole and should | ||
/// be skipped. This can also be used to count the number of holes before a | ||
/// given value, if you need to find the logical offset of a value in the | ||
/// segment. | ||
EncodedU64Array holes = 3; | ||
} | ||
|
||
/// A range of u64 values with a bitmap. | ||
message RangeWithBitmap { | ||
/// The start of the range, inclusive. | ||
uint64 start = 1; | ||
/// The end of the range, exclusive. | ||
uint64 end = 2; | ||
/// A bitmap of the values in the range. The bitmap is a sequence of bytes, | ||
/// where each byte represents 8 values. The first byte represents values | ||
/// start to start + 7, the second byte represents values start + 8 to | ||
/// start + 15, and so on. The most significant bit of each byte represents | ||
/// the first value in the range, and the least significant bit represents | ||
/// the last value in the range. If the bit is set, the value is in the | ||
/// range; if it is not set, the value is not in the range. | ||
/// NOTE(review): "first/last value in the range" presumably means the first | ||
/// and last of the 8 values covered by that byte -- confirm the bit order | ||
/// against the bitmap implementation. | ||
/// NOTE(review): the handling of trailing bits when (end - start) is not a | ||
/// multiple of 8 is not specified here -- TODO document. | ||
bytes bitmap = 3; | ||
} | ||
|
||
/// The encoding used by this segment. As a protobuf oneof, at most one of | ||
/// the following fields is set. | ||
oneof segment { | ||
/// When the values are sorted and contiguous. | ||
Range range = 1; | ||
/// When the values are sorted but have a few gaps. | ||
RangeWithHoles range_with_holes = 2; | ||
/// When the values are sorted but have many gaps. | ||
RangeWithBitmap range_with_bitmap = 3; | ||
/// When the values are sorted but are sparse. | ||
EncodedU64Array sorted_array = 4; | ||
/// A general array of values, which is not sorted. | ||
EncodedU64Array array = 5; | ||
} | ||
} | ||
|
||
/// A basic bitpacked array of u64 values. | ||
/// The element width (16, 32 or 64 bits) is selected by the `array` oneof. | ||
message EncodedU64Array { | ||
/// Values stored as a u64 base plus 16-bit entries packed into bytes. | ||
message U16Array { | ||
/// The base value for this array. | ||
/// NOTE(review): presumably each decoded value is base + offset -- confirm. | ||
uint64 base = 1; | ||
/// The deltas are stored as 16-bit unsigned integers. | ||
/// (protobuf doesn't support 16-bit integers, so we use bytes instead) | ||
/// NOTE(review): the comment says "deltas" while the field is named | ||
/// `offsets`; the byte order of the packed integers is also not specified | ||
/// here -- TODO confirm both against the encoder. | ||
bytes offsets = 2; | ||
} | ||
|
||
/// Values stored as a u64 base plus 32-bit entries packed into bytes. | ||
message U32Array { | ||
/// The base value for this array. | ||
/// NOTE(review): presumably each decoded value is base + offset -- confirm. | ||
uint64 base = 1; | ||
/// The deltas are stored as 32-bit unsigned integers. | ||
/// (we use bytes instead of uint32 to avoid overhead of varint encoding) | ||
/// NOTE(review): byte order of the packed integers is not specified here | ||
/// -- TODO confirm against the encoder. | ||
bytes offsets = 2; | ||
} | ||
|
||
/// Values stored directly as 64-bit entries packed into bytes (no base). | ||
/// Note that the only field uses number 2; field number 1 is unused in this | ||
/// message. | ||
message U64Array { | ||
/// (We use bytes instead of uint64 to avoid overhead of varint encoding) | ||
/// NOTE(review): byte order of the packed integers is not specified here | ||
/// -- TODO confirm against the encoder. | ||
bytes values = 2; | ||
} | ||
|
||
/// The concrete representation. As a protobuf oneof, at most one of the | ||
/// following fields is set. | ||
oneof array { | ||
U16Array u16_array = 1; | ||
U32Array u32_array = 2; | ||
U64Array u64_array = 3; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.