cargo / compact_str / audit
cargo : compact_str @ 0.9.1
PE Patrick Elsen signed 2026-06-01 published 2026-06-01

src/repr/bytes.rs

182 lines · rust · 1 line annotation

use core::str::Utf8Error;use bytes::Buf;use super::{Repr, MAX_SIZE};use crate::UnwrapWithMsg;impl Repr {    /// Converts a [`Buf`] of bytes to a [`Repr`], checking that the provided bytes are valid UTF-8    pub(crate) fn from_utf8_buf<B: Buf>(buf: &mut B) -> Result<Self, Utf8Error> {        // SAFETY: We check below to make sure the provided buffer is valid UTF-8        let (repr, bytes_written) = unsafe { Self::collect_buf(buf) };        // Check to make sure the provided bytes are valid UTF-8, return the Repr if they are!        match core::str::from_utf8(&repr.as_slice()[..bytes_written]) {            Ok(_) => Ok(repr),            Err(e) => Err(e),        }    }    /// Converts a [`Buf`] of bytes to a [`Repr`], without checking for valid UTF-8    ///    /// # Safety    /// * The provided buffer must be valid UTF-8    pub(crate) unsafe fn from_utf8_buf_unchecked<B: Buf>(buf: &mut B) -> Self {        let (repr, _bytes_written) = Self::collect_buf(buf);        repr    }    /// Collects the bytes from a [`Buf`] into a [`Repr`]    ///    /// # Safety    /// * The caller must guarantee that `buf` is valid UTF-8    unsafe fn collect_buf<B: Buf>(buf: &mut B) -> (Self, usize) {        // Get an empty Repr we can write into        let mut repr = super::EMPTY;        let mut bytes_written = 0;        debug_assert_eq!(repr.len(), bytes_written);
        while buf.has_remaining() {            let chunk = buf.chunk();            let chunk_len = chunk.len();            // There's an edge case where the final byte of this buffer == `HEAP_MASK`, which is            // invalid UTF-8, but would result in us creating an inline variant, that identifies as            // a heap variant. If a user ever tried to reference the data at all, we'd incorrectly            // try and read data from an invalid memory address, causing undefined behavior.            if bytes_written < MAX_SIZE && bytes_written + chunk_len == MAX_SIZE {                let last_byte = chunk[chunk_len - 1];                // If we hit the edge case, reserve additional space to make the repr becomes heap                // allocated, which prevents us from writing this last byte inline                if last_byte >= 0b11000000 {                    repr.reserve(MAX_SIZE + 1).unwrap_with_msg();                }            }            // reserve at least enough space to fit this chunk            repr.reserve(chunk_len).unwrap_with_msg();            // SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This            // invariant is documented in the public API            let slice = repr.as_mut_buf();            // write the chunk into the Repr            slice[bytes_written..bytes_written + chunk_len].copy_from_slice(chunk);            // Set the length of the Repr            // SAFETY: We just wrote an additional `chunk_len` bytes into the Repr            bytes_written += chunk_len;            repr.set_len(bytes_written);            // advance the pointer of the buffer            buf.advance(chunk_len);        }        (repr, bytes_written)    }
Line 41–77

collect_buf handles a subtle inline/heap edge case: if the input has exactly MAX_SIZE bytes and the final byte is >= 0b11000000 (i.e. would be interpreted as a length-tag or heap-discriminant rather than UTF-8 content), the buffer is forced onto the heap via reserve(MAX_SIZE + 1). Without this, the inline path would produce a value that lies about its variant — invalid UTF-8 would still be caught by the UTF-8 check, but reads during that check would already touch the wrong memory.

}#[cfg(test)]mod test {    #[cfg(feature = "std")]    use std::io::Cursor;    use test_case::test_case;    use super::Repr;    #[test_case(""; "empty")]    #[test_case("hello world"; "short")]    #[test_case("hello, this is a long string which should be heap allocated"; "long")]    fn test_from_utf8_buf(word: &'static str) {        let mut buf = Cursor::new(word.as_bytes());        let repr = Repr::from_utf8_buf(&mut buf).unwrap();        assert_eq!(repr.as_str(), word);        assert_eq!(repr.len(), word.len());    }    #[test]    fn test_from_utf8_packed() {        cfg_if::cfg_if! {            if #[cfg(target_pointer_width = "64")] {                let packed = "this string is 24 chars!";            } else if #[cfg(target_pointer_width = "32")] {                let packed = "i am 12 char";            } else {                compile_error!("unsupported architecture!")            }        }        let mut buf = Cursor::new(packed.as_bytes());        let repr = Repr::from_utf8_buf(&mut buf).unwrap();        assert_eq!(repr.as_str(), packed);        // This repr should __not__ be heap allocated        assert!(!repr.is_heap_allocated());    }    #[test]    fn test_fuzz_panic() {        let bytes = &[            255, 255, 255, 255, 255, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 12, 0, 0, 96,        ];        let mut buf: Cursor<&[u8]> = Cursor::new(bytes);        assert!(Repr::from_utf8_buf(&mut buf).is_err());    }    #[test]    fn test_valid_repr_but_invalid_utf8() {        let bytes = &[            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192,        ];        let mut buf: Cursor<&[u8]> = Cursor::new(bytes);        assert!(Repr::from_utf8_buf(&mut buf).is_err());    }    #[test]    fn test_fake_heap_variant() {        let bytes = &[            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255,        ];        let mut buf: Cursor<&[u8]> = Cursor::new(bytes);        assert!(Repr::from_utf8_buf(&mut buf).is_err());    }    #[test]    fn test_from_non_contiguous() {        let data = [            211, 247, 211, 247, 121, 135, 151, 255, 126, 205, 255, 204, 211, 51, 51, 0, 52, 55,            247, 204, 45, 37, 44, 210, 132, 50, 206, 121, 135, 151, 255, 126, 205, 255, 204, 211,            51, 51, 0, 52, 55, 247, 204, 45, 44, 210, 132, 50, 206, 51,        ];        let (front, back) = data.split_at(data.len() / 2 + 1);        let mut queue = alloc::collections::VecDeque::with_capacity(data.len());        // create a non-contiguous slice of memory in queue        front.iter().copied().for_each(|x| queue.push_back(x));        back.iter().copied().for_each(|x| queue.push_front(x));        // make sure it's non-contiguous        let (a, b) = queue.as_slices();        assert!(data.is_empty() || !a.is_empty());        assert!(data.is_empty() || !b.is_empty());        assert_eq!(data.len(), queue.len());        assert!(Repr::from_utf8_buf(&mut queue).is_err());    }    #[test]    #[should_panic(expected = "Utf8Error")]    fn test_invalid_utf8() {        let invalid = &[0, 159];        let mut buf: Cursor<&[u8]> = Cursor::new(invalid);        Repr::from_utf8_buf(&mut buf).unwrap();    }}