src/dfa/dense.rs
src/dfa/dense.rs, line 2340-2447
pub fn from_bytes(
slice: &'a [u8],
) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
// SAFETY: This is safe because we validate the transition table, start
// table, match states and accelerators below. If any validation fails,
// then we return an error.
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
// Note that validation order is important here:
//
// * `MatchState::validate` can be called with an untrusted DFA.
// * `TransistionTable::validate` uses `dfa.ms` through `match_len`.
// * `StartTable::validate` needs a valid transition table.
//
// So... validate the match states first.
dfa.accels.validate()?;
dfa.ms.validate(&dfa)?;
dfa.tt.validate(&dfa)?;
dfa.st.validate(&dfa)?;
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
// so it has already been validated.
for state in dfa.states() {
// If the state is an accel state, then it must have a non-empty
// accelerator.
if dfa.is_accel_state(state.id()) {
let index = dfa.accelerator_index(state.id());
if index >= dfa.accels.len() {
return Err(DeserializeError::generic(
"found DFA state with invalid accelerator index",
));
}
let needles = dfa.accels.needles(index);
if !(1 <= needles.len() && needles.len() <= 3) {
return Err(DeserializeError::generic(
"accelerator needles has invalid length",
));
}
}
}
Ok((dfa, nread))
}
/// Deserialize a DFA with a specific state identifier representation in
/// constant time by omitting the verification of the validity of the
/// transition table and other data inside the DFA.
///
/// This is just like [`DFA::from_bytes`], except it can potentially return
/// a DFA that exhibits undefined behavior if its transition table contains
/// invalid state identifiers.
///
/// This routine is useful if you need to deserialize a DFA cheaply
/// and cannot afford the transition table validation performed by
/// `from_bytes`.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// let (bytes, _) = initial.to_bytes_native_endian();
/// // SAFETY: This is guaranteed to be safe since the bytes given come
/// // directly from a compatible serialization routine.
/// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub unsafe fn from_bytes_unchecked(
slice: &'a [u8],
) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
let mut nr = 0;
nr += wire::skip_initial_padding(slice);
wire::check_alignment::<StateID>(&slice[nr..])?;
nr += wire::read_label(&slice[nr..], LABEL)?;
nr += wire::read_endianness_check(&slice[nr..])?;
nr += wire::read_version(&slice[nr..], VERSION)?;
let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
nr += size_of::<u32>();
let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
nr += nread;
let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
let (special, nread) = Special::from_bytes(&slice[nr..])?;
nr += nread;
special.validate_state_len(tt.len(), tt.stride2)?;
let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
nr += nread;
// Prefilters don't support serialization, so they're always absent.
let pre = None;
Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr))
}
DFA::from_bytes is the highest-risk surface: it deserializes an untrusted serialized automaton. It calls the unsafe fn from_bytes_unchecked (which performs only the cheap, constant-time decode) and then runs a full validation chain whose order is documented and load-bearing: accels.validate, ms.validate, tt.validate, st.validate, special.validate_state_len, and a per-state accelerator-index/needle-length bounds check. tt.validate (dense.rs:3620) walks every state and every transition, rejecting any transition target that is not a valid state-ID boundary and any special-tagged state that is not actually special. This establishes the "all transition targets are valid state IDs" invariant that the search hot path relies on. The from_bytes_unchecked decode itself validates stride2 in 1..=9, rejects alphabet_len > stride, and uses the checked-arithmetic helpers wire::shl/wire::mul/wire::add plus check_slice_len and check_alignment::<StateID> before the single core::slice::from_raw_parts cast at dense.rs:3442 (the comment notes it is the only unsafe code in that function). Justifies impl-parser, parser-impl-safe, uses-unsafe, and unsafe-safe.