diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fb3666..105db9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # Changelog +## [0.2.2](https://github.com/Blobfolio/trimothy/releases/tag/v0.2.2) - 2023-10-04 + +### New + +* `NormalizeWhitespace::normalized_control_and_whitespace` member method + + + ## [0.2.1](https://github.com/Blobfolio/trimothy/releases/tag/v0.2.1) - 2023-10-04 ### Changed diff --git a/CREDITS.md b/CREDITS.md index 1c8827d..ce9cab6 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -1,6 +1,6 @@ # Project Dependencies Package: trimothy - Version: 0.2.1 - Generated: 2023-10-05 01:56:28 UTC + Version: 0.2.2 + Generated: 2023-10-05 03:27:02 UTC This package has no dependencies. diff --git a/Cargo.toml b/Cargo.toml index 8371a94..12bc8cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "trimothy" -version = "0.2.1" +version = "0.2.2" authors = ["Blobfolio, LLC. "] edition = "2021" rust-version = "1.56" diff --git a/README.md b/README.md index f834fa1..36f7d72 100644 --- a/README.md +++ b/README.md @@ -62,11 +62,14 @@ This trait brings _mutable_ match-based trimming `String`, `Vec`, and `Box<[ ### NormalizeWhitespace -This trait exposes an iterator over byte/string slice contents with the edges trimmed, and all contiguous inner whitespace converted to a single horizontal space. This trait is also implemented for existing `u8`/`char` iterators. +This trait exposes an iterator over byte/string slice contents that trims the edges and compacts/converts all inner, contiguous spans of whitespace to a single horizontal space. + +This trait is implemented for `&[u8]`, `&str`, and `Iterator`s with `u8`/`char` items. | Method | Description | | ------ | ----------- | -| `normalized_whitespace` | Return said iterator. | +| `normalized_whitespace` | Return a whitespace-normalizing iterator. | +| `normalized_control_and_whitespace` | Return a control- and whitespace-normalizing iterator. 
| diff --git a/src/iter.rs b/src/iter.rs index 46ed86b..d41a610 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -20,14 +20,41 @@ use core::{ /// trimming the edges and compacting any inner whitespace spans, converting /// them to single horizontal spaces (one per span). /// +/// To trim/compact control characters too, use the +/// `normalized_control_and_whitespace` method instead. +/// /// This can be called on an `&[u8]` or `&str` directly, or any iterator /// yielding owned `u8` or `char` items. +/// +/// Normalization can optionally be extended to cover control characters too, +/// trimming and compacting them as if they were whitespace (along with any +/// actual whitespace). +/// +/// ``` +/// use trimothy::NormalizeWhitespace; +/// +/// let abnormal = " \0Hello\0\t\0Dolly\0\0"; +/// +/// // By default, control characters like \0 are left as-is. +/// let normal: String = abnormal.normalized_whitespace().collect(); +/// assert_eq!(normal, "\0Hello\0 \0Dolly\0\0"); +/// +/// // But it can be. +/// let normal: String = abnormal.normalized_control_and_whitespace().collect(); +/// assert_eq!(normal, "Hello Dolly"); +/// ``` pub trait NormalizeWhitespace> { /// # Normalized Whitespace Iterator. /// /// Modify a byte or char iterator to trim the ends, and convert all /// contiguous inner whitespace to a single horizontal space. fn normalized_whitespace(self) -> NormalizeWhiteSpaceIter; + + /// # Normalized Control/Whitespace Iterator. + /// + /// Same as `normalized_whitespace`, but also trim/normalize control + /// characters. + fn normalized_control_and_whitespace(self) -> NormalizeWhiteSpaceIter; } impl<'a> NormalizeWhitespace>> for &'a [u8] { @@ -64,6 +91,25 @@ impl<'a> NormalizeWhitespace>> for &'a [u8] { fn normalized_whitespace(self) -> NormalizeWhiteSpaceIter>> { self.iter().copied().normalized_whitespace() } + + /// # Normalized Control/Whitespace Iterator. + /// + /// Same as `normalized_whitespace`, but also trim/normalize control + /// characters.
+ /// + /// ## Examples + /// + /// ``` + /// use trimothy::NormalizeWhitespace; + /// + /// let abnormal: &[u8] = b" \0Hello\x1b\0World!\0"; + /// let normal: Vec = abnormal.normalized_control_and_whitespace().collect(); + /// assert_eq!(normal, b"Hello World!"); + /// ``` + fn normalized_control_and_whitespace(self) + -> NormalizeWhiteSpaceIter>> { + self.iter().copied().normalized_control_and_whitespace() + } } impl<'a> NormalizeWhitespace> for &'a str { @@ -99,6 +145,25 @@ impl<'a> NormalizeWhitespace> for &'a str { fn normalized_whitespace(self) -> NormalizeWhiteSpaceIter> { self.chars().normalized_whitespace() } + + /// # Normalized Control/Whitespace Iterator. + /// + /// Same as `normalized_whitespace`, but also trim/normalize control + /// characters. + /// + /// ## Examples + /// + /// ``` + /// use trimothy::NormalizeWhitespace; + /// + /// let abnormal: &str = " \0Hello\x1b\0World!\0"; + /// let normal: String = abnormal.normalized_control_and_whitespace().collect(); + /// assert_eq!(normal, "Hello World!"); + /// ``` + fn normalized_control_and_whitespace(self) + -> NormalizeWhiteSpaceIter> { + self.chars().normalized_control_and_whitespace() + } } @@ -110,23 +175,37 @@ impl<'a> NormalizeWhitespace> for &'a str { /// `NormalizeWhitespace::normalized_whitespace` implementation. pub struct NormalizeWhiteSpaceIter> { iter: I, + normalize_control: bool, next: Option, } -/// # Helper: Implementations +/// # Implementation Helper /// /// Implement our custom `NormalizeWhitespace` trait for existing iterators, /// and implement `Iterator` for the corresponding `NormalizeWhiteSpaceIter` /// struct. macro_rules! iter { - ($ty:ty, $is:ident, $ws:literal) => ( + ($ty:ty, $is_ws:ident, $is_ctrl:ident, $ws:literal) => ( impl> NormalizeWhitespace<$ty, I> for I { fn normalized_whitespace(mut self) -> NormalizeWhiteSpaceIter<$ty, I> { // Return the iterator, starting with the first non-whitespace // character. - let next = self.by_ref().find(|n| ! 
n.$is()); + let next = self.by_ref().find(|n| ! n.$is_ws()); NormalizeWhiteSpaceIter { iter: self, + normalize_control: false, + next, + } + } + + fn normalized_control_and_whitespace(mut self) + -> NormalizeWhiteSpaceIter<$ty, I> { + // Return the iterator, starting with the first non-whitespace, + // non-control character. + let next = self.by_ref().find(|n| ! n.$is_ws() && ! n.$is_ctrl()); + NormalizeWhiteSpaceIter { + iter: self, + normalize_control: true, next, } } @@ -136,29 +215,75 @@ macro_rules! iter { type Item = $ty; fn next(&mut self) -> Option { - // Anything in the buffer? + // Anything in the buffer from last time? Return it! if let Some(next) = self.next.take() { return Some(next); } - // Pull the next thing. + // Pull the next thing! let next = self.iter.next()?; - if next.$is() { - // If there's something other than whitespace later on, return a - // single horizontal space. Otherwise we're done. - self.next = self.by_ref().find(|n| ! n.$is()); + + // Normalization required. + if next.$is_ws() || (self.normalize_control && next.$is_ctrl()) { + // Make sure there's something _after_ this that won't get + // normalized away, otherwise we've reached the end. + let ctrl = self.normalize_control; + self.next = self.by_ref().find(|n| ! n.$is_ws() && (! ctrl || ! n.$is_ctrl())); if self.next.is_some() { Some($ws) } else { None } } - // Passthrough any non-whitespace bits. + // It's fine as-is. else { Some(next) } } fn size_hint(&self) -> (usize, Option) { + // Because we're potentially dropping things, the lower limit + // is at most one. 
+ let lower = usize::from(self.next.is_some()); let (_, upper) = self.iter.size_hint(); - (0, upper) + (lower, upper.map(|n| n + lower)) } } ); } -iter!(char, is_whitespace, ' '); -iter!(u8, is_ascii_whitespace, b' '); +iter!(char, is_whitespace, is_control, ' '); +iter!(u8, is_ascii_whitespace, is_ascii_control, b' '); + + + +#[cfg(test)] +mod test { + use super::*; + use alloc::{ + string::String, + vec::Vec, + }; + + #[test] + fn t_normalized_control() { + let example = " \0 Hello\0 Dolly. \x1b"; + // Control is control. + assert_eq!( + example.normalized_whitespace().collect::(), + "\0 Hello\0 Dolly. \x1b", + ); + + // Control is whitespace. + assert_eq!( + example.normalized_control_and_whitespace().collect::(), + "Hello Dolly.", + ); + + let example = example.as_bytes(); + // Control is control. + assert_eq!( + example.normalized_whitespace().collect::>(), + b"\0 Hello\0 Dolly. \x1b", + ); + + // Control is whitespace. + assert_eq!( + example.normalized_control_and_whitespace().collect::>(), + b"Hello Dolly.", + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6fc28bc..dcb9083 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,11 +64,14 @@ This trait brings _mutable_ match-based trimming `String`, `Vec`, and `Box<[ ### [`NormalizeWhitespace`] -This trait exposes an iterator over byte/string slice contents with the edges trimmed, and all contiguous inner whitespace converted to a single horizontal space. This trait is also implemented for existing `u8`/`char` iterators. +This trait exposes an iterator over byte/string slice contents that trims the edges and compacts/converts all inner, contiguous spans of whitespace to a single horizontal space. + +This trait is implemented for `&[u8]`, `&str`, and `Iterator`s with `u8`/`char` items. | Method | Description | | ------ | ----------- | -| `normalized_whitespace` | Return said iterator. | +| `normalized_whitespace` | Return a whitespace-normalizing iterator. 
| +| `normalized_control_and_whitespace` | Return a control- and whitespace-normalizing iterator. | */ #![forbid(unsafe_code)]