apache · bruno-roustant · May 13, 2024 · Apr 29, 2024 · Apr 30, 2024 · May 2, 2024
diff --git a/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
@@ -21,12 +21,12 @@
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.SegmentReadState;
@@ -35,10 +35,11 @@
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CollectionUtil;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.hppc.IntCursor;
+import org.apache.lucene.util.hppc.IntObjectHashMap;
 
 /**
  * A block-based terms index and dictionary that assigns terms to variable length blocks according
@@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
  // produce DocsEnum on demand
  final PostingsReaderBase postingsReader;
 
- private final Map<String, FieldReader> fieldMap;
+ private final FieldInfos fieldInfos;
+ private final IntObjectHashMap<FieldReader> fieldMap;
  private final List<String> fieldList;
 
  final String segment;
@@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
  // Read per-field details
  String metaName =
  IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
- Map<String, FieldReader> fieldMap = null;
+ IntObjectHashMap<FieldReader> fieldMap = null;
  Throwable priorE = null;
  long indexLength = -1, termsLength = -1;
  try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
@@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
  if (numFields < 0) {
  throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
  }
- fieldMap = CollectionUtil.newHashMap(numFields);
+ fieldMap = new IntObjectHashMap<>(numFields);
  for (int i = 0; i < numFields; ++i) {
  final int field = metaIn.readVInt();
  final long numTerms = metaIn.readVLong();
@@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
  final long indexStartFP = metaIn.readVLong();
  FieldReader previous =
  fieldMap.put(
- fieldInfo.name,
+ fieldInfo.number,
  new FieldReader(
  this,
  fieldInfo,
@@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
  // correct
  CodecUtil.retrieveChecksum(indexIn, indexLength);
  CodecUtil.retrieveChecksum(termsIn, termsLength);
- List<String> fieldList = new ArrayList<>(fieldMap.keySet());
- fieldList.sort(null);
+ fieldInfos = state.fieldInfos;
  this.fieldMap = fieldMap;
- this.fieldList = Collections.unmodifiableList(fieldList);
+ this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
  success = true;
  } finally {
  if (!success) {
@@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
  return bytes;
  }
 
+ private static List<String> sortFieldNames(
+ IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
+ List<String> fieldNames = new ArrayList<>(fieldMap.size());
+ for (IntCursor fieldNumberCursor : fieldMap.keys()) {
+ fieldNames.add(fieldInfos.fieldInfo(fieldNumberCursor.value).name);
+ }
+ fieldNames.sort(null);
+ return Collections.unmodifiableList(fieldNames);
+ }
+
  // for debugging
  // private static String toHex(int v) {
  // return "0x" + Integer.toHexString(v);
@@ -301,7 +312,8 @@ public Iterator<String> iterator() {
  @Override
  public Terms terms(String field) throws IOException {
  assert field != null;
- return fieldMap.get(field);
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ return fieldInfo == null ? null : fieldMap.get(fieldInfo.number);
  }
 
  @Override

diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -23,7 +23,7 @@
 import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
 import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;
 
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@@ -35,6 +35,7 @@
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.IntObjectHashMap;
 
 /**
  * Collection of {@link FieldInfo}s (accessible by number or by name).
@@ -61,9 +62,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
  private final String parentField;
 
  // used only by fieldInfo(int)
- private final FieldInfo[] byNumber;
-
- private final HashMap<String, FieldInfo> byName = new HashMap<>();
+ private final FieldInfoByNumber byNumber;
+ private final HashMap<String, FieldInfo> byName;
  private final Collection<FieldInfo> values; // for an unmodifiable iterator
 
  /** Constructs a new FieldInfos from an array of FieldInfo objects */
@@ -81,30 +81,13 @@ public FieldInfos(FieldInfo[] infos) {
  String softDeletesField = null;
  String parentField = null;
 
- int size = 0; // number of elements in byNumberTemp, number of used array slots
- FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
+ byName = new HashMap<>((int) (infos.length / 0.75f) + 1);
  for (FieldInfo info : infos) {
  if (info.number < 0) {
  throw new IllegalArgumentException(
  "illegal field number: " + info.number + " for field " + info.name);
  }
- size = info.number >= size ? info.number + 1 : size;
- if (info.number >= byNumberTemp.length) { // grow array
- byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
- }
- FieldInfo previous = byNumberTemp[info.number];
- if (previous != null) {
- throw new IllegalArgumentException(
- "duplicate field numbers: "
- + previous.name
- + " and "
- + info.name
- + " have: "
- + info.number);
- }
- byNumberTemp[info.number] = info;
-
- previous = byName.put(info.name, info);
+ FieldInfo previous = byName.put(info.name, info);
  if (previous != null) {
  throw new IllegalArgumentException(
  "duplicate field names: "
@@ -156,15 +139,17 @@ public FieldInfos(FieldInfo[] infos) {
  this.softDeletesField = softDeletesField;
  this.parentField = parentField;
 
- List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
- byNumber = new FieldInfo[size];
- for (int i = 0; i < size; i++) {
- byNumber[i] = byNumberTemp[i];
- if (byNumberTemp[i] != null) {
- valuesTemp.add(byNumberTemp[i]);
- }
- }
- values = Collections.unmodifiableCollection(valuesTemp);
+ FieldInfo[] sortedFieldInfos = ArrayUtil.copyOfSubArray(infos, 0, infos.length);
+ Arrays.sort(sortedFieldInfos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
+ int maxFieldNumber = infos.length == 0 ? -1 : sortedFieldInfos[infos.length - 1].number;
+ // If there are many fields and the max field number is greater than twice the number
+ // of fields, then a map structure is more compact to store the by-number mapping.
+ byNumber =
+ maxFieldNumber >= 2 * infos.length && maxFieldNumber >= 32
+ ? new MapFieldInfoByNumber(infos)
+ : new ArrayFieldInfoByNumber(infos, maxFieldNumber);
+ // The iteration of FieldInfo is ordered by ascending field number.
+ values = Collections.unmodifiableCollection(Arrays.asList(sortedFieldInfos));
  }
 
  /**
@@ -323,10 +308,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
  if (fieldNumber < 0) {
  throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
  }
- if (fieldNumber >= byNumber.length) {
- return null;
- }
- return byNumber[fieldNumber];
+ return byNumber.get(fieldNumber);
  }
 
  static final class FieldDimensions {
@@ -821,4 +803,64 @@ FieldInfos finish() {
  return new FieldInfos(byName.values().toArray(new FieldInfo[byName.size()]));
  }
  }
+
+ private interface FieldInfoByNumber {
+
+ FieldInfo get(int fieldNumber);
+ }
+
+ private static class MapFieldInfoByNumber implements FieldInfoByNumber {
+
+ private final IntObjectHashMap<FieldInfo> map;
+
+ MapFieldInfoByNumber(FieldInfo[] fieldInfos) {
+ map = new IntObjectHashMap<>(fieldInfos.length);
+ for (FieldInfo fieldInfo : fieldInfos) {
+ FieldInfo previous = map.put(fieldInfo.number, fieldInfo);
+ if (previous != null) {
+ throw new IllegalArgumentException(
+ "duplicate field numbers: "
+ + previous.name
+ + " and "
+ + fieldInfo.name
+ + " have: "
+ + fieldInfo.number);
+ }
+ }
+ }
+
+ @Override
+ public FieldInfo get(int fieldNumber) {
+ return map.get(fieldNumber);
+ }
+ }
+
+ private static class ArrayFieldInfoByNumber implements FieldInfoByNumber {
+
+ private static final FieldInfo[] EMPTY = new FieldInfo[0];
+
+ private final FieldInfo[] array;
+
+ ArrayFieldInfoByNumber(FieldInfo[] fieldInfos, int maxFieldNumber) {
+ array = fieldInfos.length == 0 ? EMPTY : new FieldInfo[maxFieldNumber + 1];
+ for (FieldInfo fieldInfo : fieldInfos) {
+ FieldInfo previous = array[fieldInfo.number];
+ if (previous != null) {
+ throw new IllegalArgumentException(
+ "duplicate field numbers: "
+ + previous.name
+ + " and "
+ + fieldInfo.name
+ + " have: "
+ + fieldInfo.number);
+ }
+ array[fieldInfo.number] = fieldInfo;
+ }
+ }
+
+ @Override
+ public FieldInfo get(int fieldNumber) {
+ return fieldNumber >= array.length ? null : array[fieldNumber];
+ }
+ }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java
@@ -19,6 +19,7 @@
 
 import java.util.Arrays;
 import org.apache.lucene.util.hppc.BitMixer;
+import org.apache.lucene.util.hppc.IntCursor;
 import org.apache.lucene.util.hppc.IntIntHashMap;
 
 /**
@@ -94,7 +95,7 @@ int[] getArray() {
  }
  arrayCache = new int[inner.size()];
  int i = 0;
- for (IntIntHashMap.IntCursor cursor : inner.keys()) {
+ for (IntCursor cursor : inner.keys()) {
  arrayCache[i++] = cursor.value;
  }
  // we need to sort this array since "equals" method depend on this
@@ -114,7 +115,7 @@ long longHashCode() {
  return hashCode;
  }
  hashCode = inner.size();
- for (IntIntHashMap.IntCursor cursor : inner.keys()) {
+ for (IntCursor cursor : inner.keys()) {
  hashCode += BitMixer.mix(cursor.value);
  }
  hashUpdated = true;

diff --git a/lucene/core/src/java/org/apache/lucene/util/hppc/AbstractIterator.java b/lucene/core/src/java/org/apache/lucene/util/hppc/AbstractIterator.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.hppc;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
+ *
+ * <p>Forked from com.carrotsearch.hppc.AbstractIterator
+ */
+public abstract class AbstractIterator<E> implements Iterator<E> {
+ private static final int NOT_CACHED = 0;
+ private static final int CACHED = 1;
+ private static final int AT_END = 2;
+
+ /** Current iterator state. */
+ private int state = NOT_CACHED;
+
+ /** The next element to be returned from {@link #next()} if fetched. */
+ private E nextElement;
+
+ @Override
+ public boolean hasNext() {
+ if (state == NOT_CACHED) {
+ state = CACHED;
+ nextElement = fetch();
+ }
+ return state == CACHED;
+ }
+
+ @Override
+ public E next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+
+ state = NOT_CACHED;
+ return nextElement;
+ }
+
+ /** Default implementation throws {@link UnsupportedOperationException}. */
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Fetch next element. The implementation must return {@link #done()} when all elements have been
+ * fetched.
+ *
+ * @return Returns the next value for the iterator or chain-calls {@link #done()}.
+ */
+ protected abstract E fetch();
+
+ /**
+ * Call when done.
+ *
+ * @return Returns a unique sentinel value to indicate end-of-iteration.
+ */
+ protected final E done() {
+ state = AT_END;
+ return null;
+ }
+}