Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. #13327

Merged
merged 9 commits into from
May 13, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
Expand All @@ -35,10 +35,11 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* A block-based terms index and dictionary that assigns terms to variable length blocks according
Expand Down Expand Up @@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;

private final Map<String, FieldReader> fieldMap;
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldReader> fieldMap;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR proposes to leverage the existing field-name -> FieldInfo map in FieldInfos to not repeat the ref to the field name strings here. Instead use the field number (specific to the FieldInfos) as key, so that we can use a compact primitive map.

Then below, in terms(String fieldName), we can use FieldInfos.fieldInfo(String fieldName) as a first mapping to the field number, and then use this compact map to get the Terms.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is what Lucene90PointsReader does today as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. It could benefit from the IntObjectHashMap.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there are many usages of Map<Integer, Object>. I could open some PRs when memory (and perf) matters after IntObjectHashMap is in.

private final List<String> fieldList;

final String segment;
Expand Down Expand Up @@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// Read per-field details
String metaName =
IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
IntObjectHashMap<FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
Expand All @@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
}
fieldMap = CollectionUtil.newHashMap(numFields);
fieldMap = new IntObjectHashMap<>(numFields);
for (int i = 0; i < numFields; ++i) {
final int field = metaIn.readVInt();
final long numTerms = metaIn.readVLong();
Expand Down Expand Up @@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
final long indexStartFP = metaIn.readVLong();
FieldReader previous =
fieldMap.put(
fieldInfo.name,
fieldInfo.number,
new FieldReader(
this,
fieldInfo,
Expand Down Expand Up @@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
fieldInfos = state.fieldInfos;
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
success = true;
} finally {
if (!success) {
Expand All @@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
return bytes;
}

private static List<String> sortFieldNames(
IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
List<String> fieldNames = new ArrayList<>(fieldMap.size());
for (IntCursor fieldNumberCursor : fieldMap.keys()) {
fieldNames.add(fieldInfos.fieldInfo(fieldNumberCursor.value).name);
}
fieldNames.sort(null);
return Collections.unmodifiableList(fieldNames);
}

// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
Expand All @@ -301,7 +312,8 @@ public Iterator<String> iterator() {
@Override
public Terms terms(String field) throws IOException {
assert field != null;
return fieldMap.get(field);
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
return fieldInfo == null ? null : fieldMap.get(fieldInfo.number);
}

@Override
Expand Down
114 changes: 78 additions & 36 deletions lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
Expand All @@ -35,6 +35,7 @@
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* Collection of {@link FieldInfo}s (accessible by number or by name).
Expand All @@ -61,9 +62,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
private final String parentField;

// used only by fieldInfo(int)
private final FieldInfo[] byNumber;

private final HashMap<String, FieldInfo> byName = new HashMap<>();
private final FieldInfoByNumber byNumber;
private final HashMap<String, FieldInfo> byName;
private final Collection<FieldInfo> values; // for an unmodifiable iterator

/** Constructs a new FieldInfos from an array of FieldInfo objects */
Expand All @@ -81,30 +81,13 @@ public FieldInfos(FieldInfo[] infos) {
String softDeletesField = null;
String parentField = null;

int size = 0; // number of elements in byNumberTemp, number of used array slots
FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
byName = new HashMap<>((int) (infos.length / 0.75f) + 1);
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
for (FieldInfo info : infos) {
if (info.number < 0) {
throw new IllegalArgumentException(
"illegal field number: " + info.number + " for field " + info.name);
}
size = info.number >= size ? info.number + 1 : size;
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
if (info.number >= byNumberTemp.length) { // grow array
byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
}
FieldInfo previous = byNumberTemp[info.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ info.name
+ " have: "
+ info.number);
}
byNumberTemp[info.number] = info;

previous = byName.put(info.name, info);
FieldInfo previous = byName.put(info.name, info);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field names: "
Expand Down Expand Up @@ -156,15 +139,17 @@ public FieldInfos(FieldInfo[] infos) {
this.softDeletesField = softDeletesField;
this.parentField = parentField;

List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
byNumber = new FieldInfo[size];
for (int i = 0; i < size; i++) {
byNumber[i] = byNumberTemp[i];
if (byNumberTemp[i] != null) {
valuesTemp.add(byNumberTemp[i]);
}
}
values = Collections.unmodifiableCollection(valuesTemp);
FieldInfo[] sortedFieldInfos = ArrayUtil.copyOfSubArray(infos, 0, infos.length);
Arrays.sort(sortedFieldInfos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
int maxFieldNumber = infos.length == 0 ? -1 : sortedFieldInfos[infos.length - 1].number;
// If there are many fields and the max field number is greater than twice the number
// of fields, then a map structure is more compact to store the by-number mapping.
byNumber =
maxFieldNumber >= 2 * infos.length && maxFieldNumber >= 32
? new MapFieldInfoByNumber(infos)
: new ArrayFieldInfoByNumber(infos, maxFieldNumber);
// The iteration of FieldInfo is ordered by ascending field number.
values = Collections.unmodifiableCollection(Arrays.asList(sortedFieldInfos));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nowadays, do List.of(sortedFieldInfos) rather than this double-layer referring to two other classes

Copy link
Contributor Author

@bruno-roustant bruno-roustant Apr 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

List.of() makes a copy as it considers the input as an "untrusted array". Here we don't copy, just wrap. Actually we could keep just Arrays.asList(sortedFieldInfos) since we own it privately, so we know we don't modify it (only iterator(), which does not support removal for Arrays.asList).

}

/**
Expand Down Expand Up @@ -323,10 +308,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
if (fieldNumber < 0) {
throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
}
if (fieldNumber >= byNumber.length) {
return null;
}
return byNumber[fieldNumber];
return byNumber.get(fieldNumber);
}

static final class FieldDimensions {
Expand Down Expand Up @@ -821,4 +803,64 @@ FieldInfos finish() {
return new FieldInfos(byName.values().toArray(new FieldInfo[byName.size()]));
}
}

private interface FieldInfoByNumber {
dsmiley marked this conversation as resolved.
Show resolved Hide resolved

FieldInfo get(int fieldNumber);
}

private static class MapFieldInfoByNumber implements FieldInfoByNumber {

private final IntObjectHashMap<FieldInfo> map;

MapFieldInfoByNumber(FieldInfo[] fieldInfos) {
map = new IntObjectHashMap<>(fieldInfos.length);
for (FieldInfo fieldInfo : fieldInfos) {
FieldInfo previous = map.put(fieldInfo.number, fieldInfo);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
}
}

@Override
public FieldInfo get(int fieldNumber) {
return map.get(fieldNumber);
}
}

private static class ArrayFieldInfoByNumber implements FieldInfoByNumber {

private static final FieldInfo[] EMPTY = new FieldInfo[0];

private final FieldInfo[] array;

ArrayFieldInfoByNumber(FieldInfo[] fieldInfos, int maxFieldNumber) {
array = fieldInfos.length == 0 ? EMPTY : new FieldInfo[maxFieldNumber + 1];
for (FieldInfo fieldInfo : fieldInfos) {
FieldInfo previous = array[fieldInfo.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
array[fieldInfo.number] = fieldInfo;
}
}

@Override
public FieldInfo get(int fieldNumber) {
return fieldNumber >= array.length ? null : array[fieldNumber];
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.util.Arrays;
import org.apache.lucene.util.hppc.BitMixer;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntIntHashMap;

/**
Expand Down Expand Up @@ -94,7 +95,7 @@ int[] getArray() {
}
arrayCache = new int[inner.size()];
int i = 0;
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
arrayCache[i++] = cursor.value;
}
// we need to sort this array since "equals" method depend on this
Expand All @@ -114,7 +115,7 @@ long longHashCode() {
return hashCode;
}
hashCode = inner.size();
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
hashCode += BitMixer.mix(cursor.value);
}
hashUpdated = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hppc;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
* Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
*
* <p>Forked from com.carrotsearch.hppc.AbstractIterator
*/
public abstract class AbstractIterator<E> implements Iterator<E> {
private static final int NOT_CACHED = 0;
private static final int CACHED = 1;
private static final int AT_END = 2;

/** Current iterator state. */
private int state = NOT_CACHED;

/** The next element to be returned from {@link #next()} if fetched. */
private E nextElement;

@Override
public boolean hasNext() {
if (state == NOT_CACHED) {
state = CACHED;
nextElement = fetch();
}
return state == CACHED;
}

@Override
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}

state = NOT_CACHED;
return nextElement;
}

/** Default implementation throws {@link UnsupportedOperationException}. */
@Override
public void remove() {
throw new UnsupportedOperationException();
}

/**
* Fetch next element. The implementation must return {@link #done()} when all elements have been
* fetched.
*
* @return Returns the next value for the iterator or chain-calls {@link #done()}.
*/
protected abstract E fetch();

/**
* Call when done.
*
* @return Returns a unique sentinel value to indicate end-of-iteration.
*/
protected final E done() {
state = AT_END;
return null;
}
}
Loading