From c2a2674ada148b6f7b1ee7a45fff3e04420b078f Mon Sep 17 00:00:00 2001 From: PSeitz Date: Sat, 8 Jun 2024 14:40:03 +0900 Subject: [PATCH] use JSON for concat field (#4937) use JSON for concat fields instead of concatenating text fields. With https://github.com/quickwit-oss/tantivy/pull/2383 we support now non-object values on the root in the JSON field. As a nice side-effect, this will make regular JSON fields more powerful. Instead only for nested types, JSON is now also useable for flat mixed-type fields. Closes: https://github.com/quickwit-oss/quickwit/issues/4924 --- .../src/default_doc_mapper/default_mapper.rs | 9 ++--- .../default_doc_mapper/field_mapping_entry.rs | 4 +- .../src/default_doc_mapper/mapping_tree.rs | 37 ++++++------------- .../quickwit-query/src/query_ast/utils.rs | 8 +--- 4 files changed, 18 insertions(+), 40 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 537ba1f460..f0c940ebaf 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -730,7 +730,6 @@ mod tests { use super::DefaultDocMapper; use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; - use crate::default_doc_mapper::mapping_tree::value_to_pretokenized; use crate::{ DefaultDocMapperBuilder, DocMapper, DocParsingError, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, @@ -1815,7 +1814,7 @@ mod tests { }"#, "concat", r#"{"some_int": 25}"#, - vec![value_to_pretokenized(25).into()], + vec![25_u64.into()], ); test_doc_from_json_test_aux( r#"{ @@ -1830,7 +1829,7 @@ mod tests { }"#, "concat", r#"{"some_int": 25}"#, - vec![value_to_pretokenized(25).into()], + vec![25_u64.into()], ); } @@ -1853,7 +1852,7 @@ mod tests { }"#, "concat", r#"{"some_bool": false}"#, - vec![value_to_pretokenized(false).into()], + vec![false.into()], ); test_doc_from_json_test_aux( r#"{ @@ -1868,7 +1867,7 @@ mod tests { }"#, "concat", r#"{"some_bool": true}"#, - vec![value_to_pretokenized(true).into()], + vec![true.into()], ); } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index 450cd02f0c..9f1f1e6caa 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -702,9 +702,9 @@ impl Default for QuickwitConcatenateOptions { } } -impl From for TextOptions { +impl From for JsonObjectOptions { fn from(quickwit_text_options: QuickwitConcatenateOptions) -> Self { - let mut text_options = TextOptions::default(); + let mut text_options = JsonObjectOptions::default(); let text_field_indexing = TextFieldIndexing::default() .set_index_option(quickwit_text_options.indexing_options.record) .set_fieldnorms(quickwit_text_options.indexing_options.fieldnorms) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 702ece804c..a1e41141fb 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -29,7 +29,6 @@ use tantivy::schema::{ BytesOptions, DateOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions, NumericOptions, OwnedValue as TantivyValue, SchemaBuilder, TextOptions, }; -use tantivy::tokenizer::{PreTokenizedString, Token}; use tantivy::TantivyDocument as Document; use super::date_time_type::QuickwitDateTimeOptions; @@ -54,20 +53,6 @@ pub enum LeafType { Text(QuickwitTextOptions), } -pub(crate) fn value_to_pretokenized(val: T) -> PreTokenizedString { - let text = val.to_string(); - PreTokenizedString { - text: text.clone(), - tokens: vec![Token { - offset_from: 0, - offset_to: 1, - position: 0, - text, - position_length: 1, - }], - } -} - enum MapOrArrayIter { Array(std::vec::IntoIter), Map(serde_json::map::IntoIter), @@ -161,12 +146,12 @@ pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option None, JsonValue::String(text) => Some(TantivyValue::Str(text)), - JsonValue::Bool(val) => Some(value_to_pretokenized(val).into()), + JsonValue::Bool(val) => Some((val).into()), JsonValue::Number(number) => { if let Some(val) = u64::from_json_number(&number) { - Some(value_to_pretokenized(val).into()) + Some((val).into()) } else { - i64::from_json_number(&number).map(|val| value_to_pretokenized(val).into()) + i64::from_json_number(&number).map(|val| (val).into()) } } } @@ -219,7 +204,7 @@ impl LeafType { } } - fn tantivy_string_value_from_json( + fn tantivy_value_from_json( &self, json_val: JsonValue, ) -> Result, String> { @@ -233,16 +218,16 @@ impl LeafType { } LeafType::I64(numeric_options) => { let val = i64::from_json_to_self(json_val, numeric_options.coerce)?; - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } LeafType::U64(numeric_options) => { let val = u64::from_json_to_self(json_val, numeric_options.coerce)?; - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } LeafType::F64(_) => Err("unsuported concat type: f64".to_string()), LeafType::Bool(_) => { if let JsonValue::Bool(val) = json_val { - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } else { Err(format!("expected boolean, got `{json_val}`")) } @@ -313,7 +298,7 @@ impl MappingLeaf { if !self.concatenate.is_empty() { let concat_values = self .typ - .tantivy_string_value_from_json(el_json_val.clone()) + .tantivy_value_from_json(el_json_val.clone()) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { @@ -333,7 +318,7 @@ impl MappingLeaf { if !self.concatenate.is_empty() { let concat_values = self .typ - .tantivy_string_value_from_json(json_val.clone()) + .tantivy_value_from_json(json_val.clone()) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { @@ -982,8 +967,8 @@ fn build_mapping_tree_from_entries<'a>( if mapping_node.branches.contains_key(name) { bail!("duplicated field definition `{}`", name); } - let text_options: TextOptions = options.clone().into(); - let field = schema.add_text_field(name, text_options); + let text_options: JsonObjectOptions = options.clone().into(); + let field = schema.add_json_field(name, text_options); for sub_field in &options.concatenate_fields { for matched_field in mapping_node diff --git a/quickwit/quickwit-query/src/query_ast/utils.rs b/quickwit/quickwit-query/src/query_ast/utils.rs index 3b189fd125..593ac978ef 100644 --- a/quickwit/quickwit-query/src/query_ast/utils.rs +++ b/quickwit/quickwit-query/src/query_ast/utils.rs @@ -54,13 +54,7 @@ pub fn find_field_or_hit_dynamic<'a>( }; let field_entry = schema.get_field_entry(field); let typ = field_entry.field_type().value_type(); - if path.is_empty() { - if typ == Type::Json { - return Err(InvalidQuery::JsonFieldRootNotSearchable { - full_path: full_path.to_string(), - }); - } - } else if typ != Type::Json { + if !path.is_empty() && typ != Type::Json { return Err(InvalidQuery::FieldDoesNotExist { full_path: full_path.to_string(), });