Skip to content

Commit

Permalink
Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (#13327)
Browse files Browse the repository at this point in the history
  • Loading branch information
bruno-roustant committed May 13, 2024
1 parent 25f1efd commit 8c738ba
Show file tree
Hide file tree
Showing 11 changed files with 1,765 additions and 156 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ Optimizations
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
to count. (Greg Miller)

* GITHUB#13327: Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (Bruno Roustant, David Smiley)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
Expand All @@ -35,10 +35,11 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* A block-based terms index and dictionary that assigns terms to variable length blocks according
Expand Down Expand Up @@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;

private final Map<String, FieldReader> fieldMap;
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldReader> fieldMap;
private final List<String> fieldList;

final String segment;
Expand Down Expand Up @@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// Read per-field details
String metaName =
IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
IntObjectHashMap<FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
Expand All @@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
}
fieldMap = CollectionUtil.newHashMap(numFields);
fieldMap = new IntObjectHashMap<>(numFields);
for (int i = 0; i < numFields; ++i) {
final int field = metaIn.readVInt();
final long numTerms = metaIn.readVLong();
Expand Down Expand Up @@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
final long indexStartFP = metaIn.readVLong();
FieldReader previous =
fieldMap.put(
fieldInfo.name,
fieldInfo.number,
new FieldReader(
this,
fieldInfo,
Expand Down Expand Up @@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
fieldInfos = state.fieldInfos;
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
success = true;
} finally {
if (!success) {
Expand All @@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
return bytes;
}

/**
 * Collects the names of all fields present in the terms dictionary, resolving each field number
 * through the given {@link FieldInfos}, and returns them sorted in natural (lexicographic) order.
 *
 * @param fieldMap map from field number to its {@link FieldReader}
 * @param fieldInfos used to resolve a field number to its {@link FieldInfo#name}
 * @return an unmodifiable, sorted list of field names
 */
private static List<String> sortFieldNames(
    IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
  List<String> names = new ArrayList<>(fieldMap.size());
  for (IntCursor cursor : fieldMap.keys()) {
    names.add(fieldInfos.fieldInfo(cursor.value).name);
  }
  Collections.sort(names);
  return Collections.unmodifiableList(names);
}

// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
Expand All @@ -301,7 +312,8 @@ public Iterator<String> iterator() {
@Override
public Terms terms(String field) throws IOException {
  assert field != null;
  // The per-field readers are keyed by field number, so translate the name first; an unknown
  // field name yields null, matching the Terms#terms(String) contract.
  final FieldInfo info = fieldInfos.fieldInfo(field);
  if (info == null) {
    return null;
  }
  return fieldMap.get(info.number);
}

@Override
Expand Down
88 changes: 52 additions & 36 deletions lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand All @@ -34,7 +33,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;

/**
* Collection of {@link FieldInfo}s (accessible by number or by name).
Expand Down Expand Up @@ -62,11 +61,15 @@ public class FieldInfos implements Iterable<FieldInfo> {

// used only by fieldInfo(int)
private final FieldInfo[] byNumber;
private final HashMap<String, FieldInfo> byName;

private final HashMap<String, FieldInfo> byName = new HashMap<>();
private final Collection<FieldInfo> values; // for an unmodifiable iterator
/** Iterator in ascending order of field number. */
private final Collection<FieldInfo> values;

/** Constructs a new FieldInfos from an array of FieldInfo objects */
/**
* Constructs a new FieldInfos from an array of FieldInfo objects. The array can be used directly
* as the backing structure.
*/
public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false;
boolean hasPostings = false;
Expand All @@ -81,30 +84,21 @@ public FieldInfos(FieldInfo[] infos) {
String softDeletesField = null;
String parentField = null;

int size = 0; // number of elements in byNumberTemp, number of used array slots
FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
byName = CollectionUtil.newHashMap(infos.length);
int maxFieldNumber = -1;
boolean fieldNumberStrictlyAscending = true;
for (FieldInfo info : infos) {
if (info.number < 0) {
int fieldNumber = info.number;
if (fieldNumber < 0) {
throw new IllegalArgumentException(
"illegal field number: " + info.number + " for field " + info.name);
}
size = info.number >= size ? info.number + 1 : size;
if (info.number >= byNumberTemp.length) { // grow array
byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
}
FieldInfo previous = byNumberTemp[info.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ info.name
+ " have: "
+ info.number);
if (maxFieldNumber < fieldNumber) {
maxFieldNumber = fieldNumber;
} else {
fieldNumberStrictlyAscending = false;
}
byNumberTemp[info.number] = info;

previous = byName.put(info.name, info);
FieldInfo previous = byName.put(info.name, info);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field names: "
Expand Down Expand Up @@ -156,15 +150,40 @@ public FieldInfos(FieldInfo[] infos) {
this.softDeletesField = softDeletesField;
this.parentField = parentField;

List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
byNumber = new FieldInfo[size];
for (int i = 0; i < size; i++) {
byNumber[i] = byNumberTemp[i];
if (byNumberTemp[i] != null) {
valuesTemp.add(byNumberTemp[i]);
if (fieldNumberStrictlyAscending && maxFieldNumber == infos.length - 1) {
// The input FieldInfo[] contains all fields numbered from 0 to infos.length - 1, and they are
// sorted, use it directly. This is an optimization when reading a segment with all fields
// since the FieldInfo[] is sorted.
byNumber = infos;
values = Arrays.asList(byNumber);
} else {
byNumber = new FieldInfo[maxFieldNumber + 1];
for (FieldInfo fieldInfo : infos) {
FieldInfo existing = byNumber[fieldInfo.number];
if (existing != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ existing.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
byNumber[fieldInfo.number] = fieldInfo;
}
if (maxFieldNumber == infos.length - 1) {
// No fields are missing, use byNumber.
values = Arrays.asList(byNumber);
} else {
if (!fieldNumberStrictlyAscending) {
// The below code is faster than
// Arrays.stream(byNumber).filter(Objects::nonNull).toList(),
// mainly when the input FieldInfo[] is small compared to maxFieldNumber.
Arrays.sort(infos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
}
values = Arrays.asList(infos);
}
}
values = Collections.unmodifiableCollection(valuesTemp);
}

/**
Expand Down Expand Up @@ -323,10 +342,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
if (fieldNumber < 0) {
throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
}
if (fieldNumber >= byNumber.length) {
return null;
}
return byNumber[fieldNumber];
return fieldNumber >= byNumber.length ? null : byNumber[fieldNumber];
}

static final class FieldDimensions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.util.Arrays;
import org.apache.lucene.util.hppc.BitMixer;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntIntHashMap;

/**
Expand Down Expand Up @@ -94,7 +95,7 @@ int[] getArray() {
}
arrayCache = new int[inner.size()];
int i = 0;
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
arrayCache[i++] = cursor.value;
}
// we need to sort this array since "equals" method depend on this
Expand All @@ -114,7 +115,7 @@ long longHashCode() {
return hashCode;
}
hashCode = inner.size();
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
hashCode += BitMixer.mix(cursor.value);
}
hashUpdated = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hppc;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
 *
 * <p>Forked from com.carrotsearch.hppc.AbstractIterator
 */
public abstract class AbstractIterator<E> implements Iterator<E> {

  /** Lifecycle of the prefetched element. */
  private enum State {
    /** No element prefetched yet; {@link #fetch()} must be called. */
    NOT_CACHED,
    /** An element is prefetched and waiting to be returned by {@link #next()}. */
    CACHED,
    /** The underlying source is exhausted; iteration is over. */
    AT_END
  }

  /** Current iterator state. */
  private State state = State.NOT_CACHED;

  /** The element handed out by the next call to {@link #next()}, if prefetched. */
  private E prefetched;

  @Override
  public boolean hasNext() {
    if (state == State.NOT_CACHED) {
      // Optimistically mark as cached; fetch() may override this via done().
      state = State.CACHED;
      prefetched = fetch();
    }
    return state == State.CACHED;
  }

  @Override
  public E next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    state = State.NOT_CACHED;
    return prefetched;
  }

  /** Default implementation throws {@link UnsupportedOperationException}. */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Fetch next element. The implementation must return {@link #done()} when all elements have been
   * fetched.
   *
   * @return Returns the next value for the iterator or chain-calls {@link #done()}.
   */
  protected abstract E fetch();

  /**
   * Call when done.
   *
   * @return Returns a unique sentinel value to indicate end-of-iteration.
   */
  protected final E done() {
    state = State.AT_END;
    return null;
  }
}
Loading

0 comments on commit 8c738ba

Please sign in to comment.