Make the reflect path parser utf-8-unaware (#9371)
# Objective All delimiter symbols used by the path parser are ASCII, this means we can entirely ignore UTF8 handling. This may improve performance. ## Solution Instead of storing the path as an `&str` + the parser offset, and reading the path using `&self.path[self.offset..]`, we store the parser state in a `&[u8]`. This allows two optimizations: 1. Avoid UTF8 checking on `&self.path[self.offset..]` 2. Avoid any kind of bound checking, since the length of what is left to read is stored in the `&[u8]`'s reference metadata, and is assumed valid by the compiler. This is a major improvement when comparing to the previous parser. 1. `access_following` and `next_token` now inline in `PathParser::next` 2. Benchmarking show a 20% performance increase (#9364) Please note that while we ignore UTF-8 handling, **utf-8 is still supported**. This is because we only handle "at the edges" what happens exactly before and after a recognized `SYMBOL`. utf-8 is handled transparently beyond that.
This commit is contained in:
parent
7083f0d593
commit
365cf3114a
@ -400,12 +400,12 @@ mod tests {
|
|||||||
#[derive(Reflect)]
|
#[derive(Reflect)]
|
||||||
struct B {
|
struct B {
|
||||||
foo: usize,
|
foo: usize,
|
||||||
bar: C,
|
łørđ: C,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Reflect)]
|
#[derive(Reflect)]
|
||||||
struct C {
|
struct C {
|
||||||
baz: f32,
|
mосква: f32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Reflect)]
|
#[derive(Reflect)]
|
||||||
@ -418,7 +418,7 @@ mod tests {
|
|||||||
enum F {
|
enum F {
|
||||||
Unit,
|
Unit,
|
||||||
Tuple(u32, u32),
|
Tuple(u32, u32),
|
||||||
Struct { value: char },
|
Şķràźÿ { 東京: char },
|
||||||
}
|
}
|
||||||
|
|
||||||
fn a_sample() -> A {
|
fn a_sample() -> A {
|
||||||
@ -426,13 +426,13 @@ mod tests {
|
|||||||
w: 1,
|
w: 1,
|
||||||
x: B {
|
x: B {
|
||||||
foo: 10,
|
foo: 10,
|
||||||
bar: C { baz: 3.14 },
|
łørđ: C { mосква: 3.14 },
|
||||||
},
|
},
|
||||||
y: vec![C { baz: 1.0 }, C { baz: 2.0 }],
|
y: vec![C { mосква: 1.0 }, C { mосква: 2.0 }],
|
||||||
z: D(E(10.0, 42)),
|
z: D(E(10.0, 42)),
|
||||||
unit_variant: F::Unit,
|
unit_variant: F::Unit,
|
||||||
tuple_variant: F::Tuple(123, 321),
|
tuple_variant: F::Tuple(123, 321),
|
||||||
struct_variant: F::Struct { value: 'm' },
|
struct_variant: F::Şķràźÿ { 東京: 'm' },
|
||||||
array: [86, 75, 309],
|
array: [86, 75, 309],
|
||||||
tuple: (true, 1.23),
|
tuple: (true, 1.23),
|
||||||
}
|
}
|
||||||
@ -460,19 +460,19 @@ mod tests {
|
|||||||
&[(access_field("x"), 1), (access_field("foo"), 2)]
|
&[(access_field("x"), 1), (access_field("foo"), 2)]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&*ParsedPath::parse("x.bar.baz").unwrap().0,
|
&*ParsedPath::parse("x.łørđ.mосква").unwrap().0,
|
||||||
&[
|
&[
|
||||||
(access_field("x"), 1),
|
(access_field("x"), 1),
|
||||||
(access_field("bar"), 2),
|
(access_field("łørđ"), 2),
|
||||||
(access_field("baz"), 6)
|
(access_field("mосква"), 10)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&*ParsedPath::parse("y[1].baz").unwrap().0,
|
&*ParsedPath::parse("y[1].mосква").unwrap().0,
|
||||||
&[
|
&[
|
||||||
(access_field("y"), 1),
|
(access_field("y"), 1),
|
||||||
(Access::ListIndex(1), 2),
|
(Access::ListIndex(1), 2),
|
||||||
(access_field("baz"), 5)
|
(access_field("mосква"), 5)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@ -503,14 +503,14 @@ mod tests {
|
|||||||
|
|
||||||
let b = ParsedPath::parse("w").unwrap();
|
let b = ParsedPath::parse("w").unwrap();
|
||||||
let c = ParsedPath::parse("x.foo").unwrap();
|
let c = ParsedPath::parse("x.foo").unwrap();
|
||||||
let d = ParsedPath::parse("x.bar.baz").unwrap();
|
let d = ParsedPath::parse("x.łørđ.mосква").unwrap();
|
||||||
let e = ParsedPath::parse("y[1].baz").unwrap();
|
let e = ParsedPath::parse("y[1].mосква").unwrap();
|
||||||
let f = ParsedPath::parse("z.0.1").unwrap();
|
let f = ParsedPath::parse("z.0.1").unwrap();
|
||||||
let g = ParsedPath::parse("x#0").unwrap();
|
let g = ParsedPath::parse("x#0").unwrap();
|
||||||
let h = ParsedPath::parse("x#1#0").unwrap();
|
let h = ParsedPath::parse("x#1#0").unwrap();
|
||||||
let i = ParsedPath::parse("unit_variant").unwrap();
|
let i = ParsedPath::parse("unit_variant").unwrap();
|
||||||
let j = ParsedPath::parse("tuple_variant.1").unwrap();
|
let j = ParsedPath::parse("tuple_variant.1").unwrap();
|
||||||
let k = ParsedPath::parse("struct_variant.value").unwrap();
|
let k = ParsedPath::parse("struct_variant.東京").unwrap();
|
||||||
let l = ParsedPath::parse("struct_variant#0").unwrap();
|
let l = ParsedPath::parse("struct_variant#0").unwrap();
|
||||||
let m = ParsedPath::parse("array[2]").unwrap();
|
let m = ParsedPath::parse("array[2]").unwrap();
|
||||||
let n = ParsedPath::parse("tuple.1").unwrap();
|
let n = ParsedPath::parse("tuple.1").unwrap();
|
||||||
@ -580,15 +580,15 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(*a.path::<usize>("w").unwrap(), 1);
|
assert_eq!(*a.path::<usize>("w").unwrap(), 1);
|
||||||
assert_eq!(*a.path::<usize>("x.foo").unwrap(), 10);
|
assert_eq!(*a.path::<usize>("x.foo").unwrap(), 10);
|
||||||
assert_eq!(*a.path::<f32>("x.bar.baz").unwrap(), 3.14);
|
assert_eq!(*a.path::<f32>("x.łørđ.mосква").unwrap(), 3.14);
|
||||||
assert_eq!(*a.path::<f32>("y[1].baz").unwrap(), 2.0);
|
assert_eq!(*a.path::<f32>("y[1].mосква").unwrap(), 2.0);
|
||||||
assert_eq!(*a.path::<usize>("z.0.1").unwrap(), 42);
|
assert_eq!(*a.path::<usize>("z.0.1").unwrap(), 42);
|
||||||
assert_eq!(*a.path::<usize>("x#0").unwrap(), 10);
|
assert_eq!(*a.path::<usize>("x#0").unwrap(), 10);
|
||||||
assert_eq!(*a.path::<f32>("x#1#0").unwrap(), 3.14);
|
assert_eq!(*a.path::<f32>("x#1#0").unwrap(), 3.14);
|
||||||
|
|
||||||
assert_eq!(*a.path::<F>("unit_variant").unwrap(), F::Unit);
|
assert_eq!(*a.path::<F>("unit_variant").unwrap(), F::Unit);
|
||||||
assert_eq!(*a.path::<u32>("tuple_variant.1").unwrap(), 321);
|
assert_eq!(*a.path::<u32>("tuple_variant.1").unwrap(), 321);
|
||||||
assert_eq!(*a.path::<char>("struct_variant.value").unwrap(), 'm');
|
assert_eq!(*a.path::<char>("struct_variant.東京").unwrap(), 'm');
|
||||||
assert_eq!(*a.path::<char>("struct_variant#0").unwrap(), 'm');
|
assert_eq!(*a.path::<char>("struct_variant#0").unwrap(), 'm');
|
||||||
|
|
||||||
assert_eq!(*a.path::<i32>("array[2]").unwrap(), 309);
|
assert_eq!(*a.path::<i32>("array[2]").unwrap(), 309);
|
||||||
@ -597,8 +597,8 @@ mod tests {
|
|||||||
*a.path_mut::<f32>("tuple.1").unwrap() = 3.21;
|
*a.path_mut::<f32>("tuple.1").unwrap() = 3.21;
|
||||||
assert_eq!(*a.path::<f32>("tuple.1").unwrap(), 3.21);
|
assert_eq!(*a.path::<f32>("tuple.1").unwrap(), 3.21);
|
||||||
|
|
||||||
*a.path_mut::<f32>("y[1].baz").unwrap() = 3.0;
|
*a.path_mut::<f32>("y[1].mосква").unwrap() = 3.0;
|
||||||
assert_eq!(a.y[1].baz, 3.0);
|
assert_eq!(a.y[1].mосква, 3.0);
|
||||||
|
|
||||||
*a.path_mut::<u32>("tuple_variant.0").unwrap() = 1337;
|
*a.path_mut::<u32>("tuple_variant.0").unwrap() = 1337;
|
||||||
assert_eq!(a.tuple_variant, F::Tuple(1337, 321));
|
assert_eq!(a.tuple_variant, F::Tuple(1337, 321));
|
||||||
@ -649,8 +649,8 @@ mod tests {
|
|||||||
&[(Access::TupleIndex(5), 1)]
|
&[(Access::TupleIndex(5), 1)]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&*ParsedPath::parse("[0].bar").unwrap().0,
|
&*ParsedPath::parse("[0].łørđ").unwrap().0,
|
||||||
&[(Access::ListIndex(0), 1), (access_field("bar"), 4)]
|
&[(Access::ListIndex(0), 1), (access_field("łørđ"), 4)]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::{fmt, num::ParseIntError};
|
use std::{fmt, num::ParseIntError, str::from_utf8_unchecked};
|
||||||
|
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
@ -33,28 +33,37 @@ enum Error<'a> {
|
|||||||
|
|
||||||
pub(super) struct PathParser<'a> {
|
pub(super) struct PathParser<'a> {
|
||||||
path: &'a str,
|
path: &'a str,
|
||||||
offset: usize,
|
remaining: &'a [u8],
|
||||||
}
|
}
|
||||||
impl<'a> PathParser<'a> {
|
impl<'a> PathParser<'a> {
|
||||||
pub(super) fn new(path: &'a str) -> Self {
|
pub(super) fn new(path: &'a str) -> Self {
|
||||||
PathParser { path, offset: 0 }
|
let remaining = path.as_bytes();
|
||||||
|
PathParser { path, remaining }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_token(&mut self) -> Option<Token<'a>> {
|
fn next_token(&mut self) -> Option<Token<'a>> {
|
||||||
let input = &self.path[self.offset..];
|
let to_parse = self.remaining;
|
||||||
|
|
||||||
// Return with `None` if empty.
|
// Return with `None` if empty.
|
||||||
let first_char = input.chars().next()?;
|
let (first_byte, remaining) = to_parse.split_first()?;
|
||||||
|
|
||||||
if let Some(token) = Token::symbol_from_char(first_char) {
|
if let Some(token) = Token::symbol_from_byte(*first_byte) {
|
||||||
self.offset += 1; // NOTE: we assume all symbols are ASCII
|
self.remaining = remaining; // NOTE: all symbols are ASCII
|
||||||
return Some(token);
|
return Some(token);
|
||||||
}
|
}
|
||||||
// We are parsing either `0123` or `field`.
|
// We are parsing either `0123` or `field`.
|
||||||
// If we do not find a subsequent token, we are at the end of the parse string.
|
// If we do not find a subsequent token, we are at the end of the parse string.
|
||||||
let ident = input.split_once(Token::SYMBOLS).map_or(input, |t| t.0);
|
let ident_len = to_parse.iter().position(|t| Token::SYMBOLS.contains(t));
|
||||||
|
let (ident, remaining) = to_parse.split_at(ident_len.unwrap_or(to_parse.len()));
|
||||||
|
// SAFETY: This relies on `self.remaining` always remaining valid UTF8:
|
||||||
|
// - self.remaining is a slice derived from self.path (valid &str)
|
||||||
|
// - The slice's end is either the same as the valid &str or
|
||||||
|
// the last byte before an ASCII utf-8 character (ie: it is a char
|
||||||
|
// boundary).
|
||||||
|
// - The slice always starts after a symbol ie: an ASCII character's boundary.
|
||||||
|
let ident = unsafe { from_utf8_unchecked(ident) };
|
||||||
|
|
||||||
self.offset += ident.len();
|
self.remaining = remaining;
|
||||||
Some(Token::Ident(Ident(ident)))
|
Some(Token::Ident(Ident(ident)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -82,13 +91,17 @@ impl<'a> PathParser<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn offset(&self) -> usize {
|
||||||
|
self.path.len() - self.remaining.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
impl<'a> Iterator for PathParser<'a> {
|
impl<'a> Iterator for PathParser<'a> {
|
||||||
type Item = (Result<Access<'a>, ReflectPathError<'a>>, usize);
|
type Item = (Result<Access<'a>, ReflectPathError<'a>>, usize);
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
let token = self.next_token()?;
|
let token = self.next_token()?;
|
||||||
let offset = self.offset;
|
let offset = self.offset();
|
||||||
let err = |error| ReflectPathError::ParseError {
|
let err = |error| ReflectPathError::ParseError {
|
||||||
offset,
|
offset,
|
||||||
path: self.path,
|
path: self.path,
|
||||||
@ -114,33 +127,38 @@ impl<'a> Ident<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: We use repr(u8) so that the `match byte` in `Token::symbol_from_byte`
|
||||||
|
// becomes a "check `byte` is one of SYMBOLS and forward its value" this makes
|
||||||
|
// the optimizer happy, and shaves off a few cycles.
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
|
#[repr(u8)]
|
||||||
enum Token<'a> {
|
enum Token<'a> {
|
||||||
Dot,
|
Dot = b'.',
|
||||||
Pound,
|
Pound = b'#',
|
||||||
OpenBracket,
|
OpenBracket = b'[',
|
||||||
CloseBracket,
|
CloseBracket = b']',
|
||||||
Ident(Ident<'a>),
|
Ident(Ident<'a>),
|
||||||
}
|
}
|
||||||
impl fmt::Display for Token<'_> {
|
impl fmt::Display for Token<'_> {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
match self {
|
let text = match self {
|
||||||
Token::Dot => f.write_str("."),
|
Token::Dot => ".",
|
||||||
Token::Pound => f.write_str("#"),
|
Token::Pound => "#",
|
||||||
Token::OpenBracket => f.write_str("["),
|
Token::OpenBracket => "[",
|
||||||
Token::CloseBracket => f.write_str("]"),
|
Token::CloseBracket => "]",
|
||||||
Token::Ident(ident) => f.write_str(ident.0),
|
Token::Ident(ident) => ident.0,
|
||||||
}
|
};
|
||||||
|
f.write_str(text)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<'a> Token<'a> {
|
impl<'a> Token<'a> {
|
||||||
const SYMBOLS: &[char] = &['.', '#', '[', ']'];
|
const SYMBOLS: &[u8] = b".#[]";
|
||||||
fn symbol_from_char(char: char) -> Option<Self> {
|
fn symbol_from_byte(byte: u8) -> Option<Self> {
|
||||||
match char {
|
match byte {
|
||||||
'.' => Some(Self::Dot),
|
b'.' => Some(Self::Dot),
|
||||||
'#' => Some(Self::Pound),
|
b'#' => Some(Self::Pound),
|
||||||
'[' => Some(Self::OpenBracket),
|
b'[' => Some(Self::OpenBracket),
|
||||||
']' => Some(Self::CloseBracket),
|
b']' => Some(Self::CloseBracket),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user