Skip to content

Commit

Permalink
Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (#13327)
Browse files Browse the repository at this point in the history
  • Loading branch information
bruno-roustant committed May 13, 2024
1 parent 25f1efd commit 8c738ba
Show file tree
Hide file tree
Showing 11 changed files with 1,765 additions and 156 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ Optimizations
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
to count. (Greg Miller)

* GITHUB#13327: Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (Bruno Roustant, David Smiley)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
Expand All @@ -35,10 +35,11 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* A block-based terms index and dictionary that assigns terms to variable length blocks according
Expand Down Expand Up @@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;

private final Map<String, FieldReader> fieldMap;
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldReader> fieldMap;
private final List<String> fieldList;

final String segment;
Expand Down Expand Up @@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// Read per-field details
String metaName =
IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
IntObjectHashMap<FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
Expand All @@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
}
fieldMap = CollectionUtil.newHashMap(numFields);
fieldMap = new IntObjectHashMap<>(numFields);
for (int i = 0; i < numFields; ++i) {
final int field = metaIn.readVInt();
final long numTerms = metaIn.readVLong();
Expand Down Expand Up @@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
final long indexStartFP = metaIn.readVLong();
FieldReader previous =
fieldMap.put(
fieldInfo.name,
fieldInfo.number,
new FieldReader(
this,
fieldInfo,
Expand Down Expand Up @@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
fieldInfos = state.fieldInfos;
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
success = true;
} finally {
if (!success) {
Expand All @@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
return bytes;
}

/**
 * Collects the names of all fields present in the terms dictionary, resolving each field number
 * through the given {@link FieldInfos}, and returns them sorted in natural (lexicographic) order.
 *
 * @param fieldMap map from field number to its {@link FieldReader}
 * @param fieldInfos used to resolve a field number to its {@link FieldInfo#name}
 * @return an unmodifiable, sorted list of field names
 */
private static List<String> sortFieldNames(
    IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
  List<String> names = new ArrayList<>(fieldMap.size());
  for (IntCursor cursor : fieldMap.keys()) {
    names.add(fieldInfos.fieldInfo(cursor.value).name);
  }
  Collections.sort(names);
  return Collections.unmodifiableList(names);
}

// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
Expand All @@ -301,7 +312,8 @@ public Iterator<String> iterator() {
@Override
public Terms terms(String field) throws IOException {
  assert field != null;
  // The per-field readers are keyed by field number, so translate the name first; an unknown
  // field name yields null, matching the Terms#terms(String) contract.
  final FieldInfo info = fieldInfos.fieldInfo(field);
  if (info == null) {
    return null;
  }
  return fieldMap.get(info.number);
}

@Override
Expand Down
88 changes: 52 additions & 36 deletions lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand All @@ -34,7 +33,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;

/**
* Collection of {@link FieldInfo}s (accessible by number or by name).
Expand Down Expand Up @@ -62,11 +61,15 @@ public class FieldInfos implements Iterable<FieldInfo> {

// used only by fieldInfo(int)
private final FieldInfo[] byNumber;
private final HashMap<String, FieldInfo> byName;

private final HashMap<String, FieldInfo> byName = new HashMap<>();
private final Collection<FieldInfo> values; // for an unmodifiable iterator
/** Iterator in ascending order of field number. */
private final Collection<FieldInfo> values;

/** Constructs a new FieldInfos from an array of FieldInfo objects */
/**
* Constructs a new FieldInfos from an array of FieldInfo objects. The array can be used directly
* as the backing structure.
*/
public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false;
boolean hasPostings = false;
Expand All @@ -81,30 +84,21 @@ public FieldInfos(FieldInfo[] infos) {
String softDeletesField = null;
String parentField = null;

int size = 0; // number of elements in byNumberTemp, number of used array slots
FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
byName = CollectionUtil.newHashMap(infos.length);
int maxFieldNumber = -1;
boolean fieldNumberStrictlyAscending = true;
for (FieldInfo info : infos) {
if (info.number < 0) {
int fieldNumber = info.number;
if (fieldNumber < 0) {
throw new IllegalArgumentException(
"illegal field number: " + info.number + " for field " + info.name);
}
size = info.number >= size ? info.number + 1 : size;
if (info.number >= byNumberTemp.length) { // grow array
byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
}
FieldInfo previous = byNumberTemp[info.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ info.name
+ " have: "
+ info.number);
if (maxFieldNumber < fieldNumber) {
maxFieldNumber = fieldNumber;
} else {
fieldNumberStrictlyAscending = false;
}
byNumberTemp[info.number] = info;

previous = byName.put(info.name, info);
FieldInfo previous = byName.put(info.name, info);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field names: "
Expand Down Expand Up @@ -156,15 +150,40 @@ public FieldInfos(FieldInfo[] infos) {
this.softDeletesField = softDeletesField;
this.parentField = parentField;

List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
byNumber = new FieldInfo[size];
for (int i = 0; i < size; i++) {
byNumber[i] = byNumberTemp[i];
if (byNumberTemp[i] != null) {
valuesTemp.add(byNumberTemp[i]);
if (fieldNumberStrictlyAscending && maxFieldNumber == infos.length - 1) {
// The input FieldInfo[] contains all fields numbered from 0 to infos.length - 1, and they are
// sorted, use it directly. This is an optimization when reading a segment with all fields
// since the FieldInfo[] is sorted.
byNumber = infos;
values = Arrays.asList(byNumber);
} else {
byNumber = new FieldInfo[maxFieldNumber + 1];
for (FieldInfo fieldInfo : infos) {
FieldInfo existing = byNumber[fieldInfo.number];
if (existing != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ existing.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
byNumber[fieldInfo.number] = fieldInfo;
}
if (maxFieldNumber == infos.length - 1) {
// No fields are missing, use byNumber.
values = Arrays.asList(byNumber);
} else {
if (!fieldNumberStrictlyAscending) {
// The below code is faster than
// Arrays.stream(byNumber).filter(Objects::nonNull).toList(),
// mainly when the input FieldInfo[] is small compared to maxFieldNumber.
Arrays.sort(infos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
}
values = Arrays.asList(infos);
}
}
values = Collections.unmodifiableCollection(valuesTemp);
}

/**
Expand Down Expand Up @@ -323,10 +342,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
if (fieldNumber < 0) {
throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
}
if (fieldNumber >= byNumber.length) {
return null;
}
return byNumber[fieldNumber];
return fieldNumber >= byNumber.length ? null : byNumber[fieldNumber];
}

static final class FieldDimensions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.util.Arrays;
import org.apache.lucene.util.hppc.BitMixer;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntIntHashMap;

/**
Expand Down Expand Up @@ -94,7 +95,7 @@ int[] getArray() {
}
arrayCache = new int[inner.size()];
int i = 0;
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
arrayCache[i++] = cursor.value;
}
// we need to sort this array since "equals" method depend on this
Expand All @@ -114,7 +115,7 @@ long longHashCode() {
return hashCode;
}
hashCode = inner.size();
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
hashCode += BitMixer.mix(cursor.value);
}
hashUpdated = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hppc;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
 *
 * <p>Forked from com.carrotsearch.hppc.AbstractIterator
 */
public abstract class AbstractIterator<E> implements Iterator<E> {

  /** Lifecycle of the prefetched element. */
  private enum State {
    /** No element prefetched yet; {@link #fetch()} must be called. */
    NOT_CACHED,
    /** An element is prefetched and waiting to be returned by {@link #next()}. */
    CACHED,
    /** The underlying source is exhausted; iteration is over. */
    AT_END
  }

  /** Current iterator state. */
  private State state = State.NOT_CACHED;

  /** The element handed out by the next call to {@link #next()}, if prefetched. */
  private E prefetched;

  @Override
  public boolean hasNext() {
    if (state == State.NOT_CACHED) {
      // Optimistically mark as cached; fetch() may override this via done().
      state = State.CACHED;
      prefetched = fetch();
    }
    return state == State.CACHED;
  }

  @Override
  public E next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    state = State.NOT_CACHED;
    return prefetched;
  }

  /** Default implementation throws {@link UnsupportedOperationException}. */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Fetch next element. The implementation must return {@link #done()} when all elements have been
   * fetched.
   *
   * @return Returns the next value for the iterator or chain-calls {@link #done()}.
   */
  protected abstract E fetch();

  /**
   * Call when done.
   *
   * @return Returns a unique sentinel value to indicate end-of-iteration.
   */
  protected final E done() {
    state = State.AT_END;
    return null;
  }
}
Loading

0 comments on commit 8c738ba

Please sign in to comment.