🦀 Functional Rust

163: Whitespace Parser

Difficulty: 3 | Level: Advanced

Skip, require, and wrap whitespace — the invisible plumbing every real parser needs.

The Problem This Solves

Almost every language has whitespace between tokens: spaces between keywords, newlines between statements, indentation before blocks. A parser that doesn't handle whitespace will fail on `if x` because after parsing `"if"` it sees `" x"` — not `"x"`. The naive fix is to manually call a whitespace-skipper before every token. That gets tedious fast and clutters your parser logic with noise. What you want is a small set of primitives: `ws0` (skip any amount), `ws1` (require at least one whitespace character), and `ws_wrap` (sandwich a parser in optional whitespace). Add `line_comment` and you can skip `// ...` lines too. These four combinators are so fundamental that virtually every real parser written in this style starts here. Once you have them, you can compose them with any other parser and whitespace becomes invisible.

The Intuition

`ws0` consumes zero or more whitespace characters and returns the trimmed input — it never fails. `ws1` does the same but fails when there is no leading whitespace at all. `ws_wrap(p)` runs `ws0`, then `p`, then `ws0` again, so `ws_wrap(number)` parses `" 42 "` just fine.
input: "   hello world"
ws0  → consumes "   " → remaining: "hello world"

input: "( 1 + 2 )"
ws_wrap(tag("+")) → matches "+" even with spaces around it

How It Works in Rust

// ws0: drop any leading whitespace; this parser cannot fail
fn ws0(input: &str) -> ParseResult<()> {
    // trim_start() hands back a sub-slice of `input`, so nothing is copied
    Ok(((), input.trim_start()))
}

// ws1: at least one whitespace character required.
// The first-character test uses the same Unicode notion of whitespace as
// trim_start(), so ws1 succeeds exactly when there is something to trim.
fn ws1(input: &str) -> ParseResult<()> {
    if input.starts_with(char::is_whitespace) {
        Ok(((), input.trim_start()))
    } else {
        // Build the preview from characters, not bytes: `&input[..4]` panics
        // when byte 4 falls in the middle of a multi-byte character.
        let preview: String = input.chars().take(4).collect();
        Err(format!("expected whitespace, got {:?}", preview))
    }
}

// ws_wrap: optional whitespace, then the inner parser, then optional whitespace
fn ws_wrap<'a, T>(
    parser: impl Fn(&'a str) -> ParseResult<T>,
) -> impl Fn(&'a str) -> ParseResult<T> {
    move |input| {
        // Leading whitespace is discarded before the wrapped parser runs.
        let (_, body) = ws0(input)?;
        // Keep the parsed value; strip trailing whitespace from what is left.
        parser(body).and_then(|(value, tail)| ws0(tail).map(|(_, rest)| (value, rest)))
    }
}

// line_comment: skip "// ..." or "# ..." to end of line (newline included).
// Fails when the input does not start with a comment marker, so ordinary
// code is never swallowed by mistake.
fn line_comment(input: &str) -> ParseResult<()> {
    if !(input.starts_with("//") || input.starts_with('#')) {
        return Err("expected line comment (\"//\" or \"#\")".to_string());
    }
    // find() scans to the newline — no char-by-char loop needed
    let rest = input
        .find('\n')
        .map_or("", |i| &input[i + 1..]); // comment at EOF → consume everything
    Ok(((), rest))
}

What This Unlocks

Key Differences

| Concept | OCaml | Rust |
|---|---|---|
| Whitespace skip | `many0 (satisfy is_ws ...)` builds a char list | `trim_start()` — direct slice, zero allocation |
| At least one | `many1 (satisfy is_ws ...)` | Check first char, then `trim_start()` |
| Comment scan | Index arithmetic | `str::find('\n')` |
| Unicode whitespace | Manual char list | `char::is_whitespace()` or `is_ascii_whitespace()` |

// Example 163: Whitespace Parser
// Parse and skip whitespace: ws0, ws1, ws_wrap

/// Outcome of running a parser: on success the parsed value paired with the
/// unconsumed remainder of the input, on failure an error message.
type ParseResult<'a, T> = Result<(T, &'a str), String>;
/// A boxed parser closure: takes an input slice and returns a `ParseResult`.
type Parser<'a, T> = Box<dyn Fn(&'a str) -> ParseResult<'a, T> + 'a>;

// ============================================================
// Approach 1: ws0 — skip zero or more whitespace (always succeeds)
// ============================================================

/// Builds a parser that discards leading whitespace.
///
/// The returned parser never fails: when there is no leading whitespace the
/// input is handed back unchanged.
fn ws0<'a>() -> Parser<'a, ()> {
    // `trim_start` returns a sub-slice of `input`; nothing is copied.
    Box::new(|input: &'a str| Ok(((), input.trim_start())))
}

// ============================================================
// Approach 2: ws1 — require at least one whitespace
// ============================================================

/// Builds a parser that requires at least one leading whitespace character
/// and then skips the whole whitespace run.
///
/// The first character is tested with `char::is_whitespace`, the same Unicode
/// definition `trim_start` uses, so the parser succeeds exactly when there is
/// something to trim.
///
/// # Errors
///
/// The parser returns `Err("Expected whitespace")` when the input is empty or
/// starts with a non-whitespace character.
fn ws1<'a>() -> Parser<'a, ()> {
    Box::new(|input: &'a str| {
        if input.starts_with(char::is_whitespace) {
            Ok(((), input.trim_start()))
        } else {
            Err("Expected whitespace".to_string())
        }
    })
}

// ============================================================
// Approach 3: ws_wrap — parse with surrounding whitespace
// ============================================================

/// Wraps `parser` so that whitespace before and after its match is skipped.
///
/// An error from the wrapped parser is returned unchanged.
fn ws_wrap<'a, T: 'a>(parser: Parser<'a, T>) -> Parser<'a, T> {
    Box::new(move |input: &'a str| {
        // Trim in front, run the inner parser, then trim what it left behind.
        parser(input.trim_start()).map(|(value, rest)| (value, rest.trim_start()))
    })
}

/// Line comment: consume a `#` comment up to, but not including, the newline.
///
/// The parser fails with `Expected '#'` when the input does not begin with
/// `#`. A comment that runs to the end of the input leaves an empty remainder.
fn line_comment<'a>() -> Parser<'a, ()> {
    Box::new(|input: &'a str| {
        if !input.starts_with('#') {
            return Err("Expected '#'".to_string());
        }
        // Resume at the newline itself; it stays in the remainder.
        let rest = input.find('\n').map_or("", |pos| &input[pos..]);
        Ok(((), rest))
    })
}

/// Builds a parser that matches the literal `expected` at the start of the input.
///
/// On success the value is the matched prefix, borrowed from the input, and the
/// remainder is everything after it. On failure the error is
/// `Expected "<literal>"`.
fn tag<'a>(expected: &str) -> Parser<'a, &'a str> {
    // The closure owns its copy of the literal because the borrow of `expected`
    // is not tied to `'a`.
    let literal = String::from(expected);
    Box::new(move |input: &'a str| match input.strip_prefix(literal.as_str()) {
        Some(rest) => Ok((&input[..literal.len()], rest)),
        None => Err(format!("Expected \"{}\"", literal)),
    })
}

/// Demo driver: runs each combinator on a sample input and prints the result.
/// The trailing comment on each line shows the value that gets printed.
fn main() {
    println!("=== ws0 ===");
    println!("{:?}", ws0()("  hello")); // Ok(((), "hello"))
    println!("{:?}", ws0()("hello"));   // Ok(((), "hello"))

    println!("\n=== ws1 ===");
    println!("{:?}", ws1()("  hello")); // Ok(((), "hello"))
    println!("{:?}", ws1()("hello"));   // Err

    println!("\n=== ws_wrap ===");
    let p = ws_wrap(tag("hello"));
    println!("{:?}", p("  hello  rest")); // Ok(("hello", "rest"))

    println!("\n=== line_comment ===");
    println!("{:?}", line_comment()("# comment\ncode")); // Ok(((), "\ncode"))

    // The check mark was stored as mis-decoded bytes; print the real glyph.
    println!("\n✓ All examples completed");
}

#[cfg(test)]
mod tests {
    use super::*;

    // --- ws0: always succeeds and strips whatever whitespace is present ---

    #[test]
    fn test_ws0_spaces() {
        let outcome = ws0()("  hello");
        assert_eq!(outcome, Ok(((), "hello")));
    }

    #[test]
    fn test_ws0_no_spaces() {
        let outcome = ws0()("hello");
        assert_eq!(outcome, Ok(((), "hello")));
    }

    #[test]
    fn test_ws0_empty() {
        let outcome = ws0()("");
        assert_eq!(outcome, Ok(((), "")));
    }

    #[test]
    fn test_ws0_tabs_newlines() {
        let outcome = ws0()("\t\n  x");
        assert_eq!(outcome, Ok(((), "x")));
    }

    // --- ws1: same skipping, but whitespace is mandatory ---

    #[test]
    fn test_ws1_success() {
        let outcome = ws1()("  hello");
        assert_eq!(outcome, Ok(((), "hello")));
    }

    #[test]
    fn test_ws1_fail() {
        let outcome = ws1()("hello");
        assert!(outcome.is_err());
    }

    // --- ws_wrap: whitespace on either side of the inner parser is optional ---

    #[test]
    fn test_ws_wrap() {
        let padded_hello = ws_wrap(tag("hello"));
        let outcome = padded_hello("  hello  rest");
        assert_eq!(outcome, Ok(("hello", "rest")));
    }

    #[test]
    fn test_ws_wrap_no_spaces() {
        let padded_hello = ws_wrap(tag("hello"));
        let outcome = padded_hello("hello");
        assert_eq!(outcome, Ok(("hello", "")));
    }

    // --- line_comment: '#' through end of line, newline left in place ---

    #[test]
    fn test_line_comment() {
        let outcome = line_comment()("# comment\ncode");
        assert_eq!(outcome, Ok(((), "\ncode")));
    }

    #[test]
    fn test_line_comment_eof() {
        let outcome = line_comment()("# comment");
        assert_eq!(outcome, Ok(((), "")));
    }

    #[test]
    fn test_line_comment_not_hash() {
        let outcome = line_comment()("code");
        assert!(outcome.is_err());
    }
}
(* Example 163: Whitespace Parser *)
(* Parse and skip whitespace: ws, ws0, ws1 *)

(* Result of a parse: the value paired with the remaining input, or an error message. *)
type 'a parse_result = ('a * string, string) result
(* A parser is a function from an input string to a parse result. *)
type 'a parser = string -> 'a parse_result

(* satisfy pred desc: consume the first character when [pred] accepts it;
   otherwise (including on empty input) fail with "Expected <desc>". *)
let satisfy pred desc : char parser = fun input ->
  let len = String.length input in
  if len = 0 || not (pred input.[0]) then
    Error (Printf.sprintf "Expected %s" desc)
  else
    Ok (input.[0], String.sub input 1 (len - 1))

(* many0 p: apply [p] repeatedly until it fails, collecting the results in
   order. Never fails; zero matches gives the empty list and the input as is. *)
let many0 p : 'a list parser = fun input ->
  let rec collect seen remaining =
    match p remaining with
    | Error _ -> Ok (List.rev seen, remaining)
    | Ok (item, next) -> collect (item :: seen) next
  in
  collect [] input

(* many1 p: like many0, but the first application of [p] must succeed;
   its error is returned otherwise. *)
let many1 p : 'a list parser = fun input ->
  Result.bind (p input) (fun (first, after_first) ->
    Result.map
      (fun (others, rest) -> (first :: others, rest))
      (many0 p after_first))

(* is_ws c: true for space, tab, line feed and carriage return. *)
let is_ws = function
  | ' ' | '\t' | '\n' | '\r' -> true
  | _ -> false

(* Approach 1: ws0 — skip zero or more whitespace.
   The collected characters are thrown away; only the remainder is kept. *)
let ws0 : unit parser = fun input ->
  Result.map
    (fun (_, rest) -> ((), rest))
    (many0 (satisfy is_ws "whitespace") input)

(* Approach 2: ws1 — require at least one whitespace.
   Fails with the error from [satisfy] when the input has no leading whitespace. *)
let ws1 : unit parser = fun input ->
  input
  |> many1 (satisfy is_ws "whitespace")
  |> Result.map (fun (_, rest) -> ((), rest))

(* Approach 3: ws_wrap — parse p surrounded by optional whitespace.
   Runs ws0, then p, then ws0; the first error encountered is returned. *)
let ws_wrap (p : 'a parser) : 'a parser = fun input ->
  let ( >>= ) = Result.bind in
  ws0 input >>= fun ((), after_lead) ->
  p after_lead >>= fun (v, after_value) ->
  ws0 after_value >>= fun ((), rest) ->
  Ok (v, rest)

(* line comment: skip from # up to, but not including, the newline.
   Fails with "Expected '#'" when the input is empty or starts otherwise. *)
let line_comment : unit parser = fun input ->
  let len = String.length input in
  if len = 0 || input.[0] <> '#' then Error "Expected '#'"
  else
    (* Position of the first newline after the '#', or end of input. *)
    let stop =
      match String.index_from_opt input 1 '\n' with
      | Some pos -> pos
      | None -> len
    in
    Ok ((), String.sub input stop (len - stop))

(* Tests: each assert checks one combinator against a literal expected result. *)
let () =
  assert (ws0 "  hello" = Ok ((), "hello"));
  assert (ws0 "hello" = Ok ((), "hello"));
  assert (ws0 "" = Ok ((), ""));

  assert (ws1 "  hello" = Ok ((), "hello"));
  assert (Result.is_error (ws1 "hello"));

  (* Local literal matcher used only to exercise ws_wrap. *)
  let tag s : string parser = fun input ->
    let len = String.length s in
    if String.length input >= len && String.sub input 0 len = s then
      Ok (s, String.sub input len (String.length input - len))
    else Error (Printf.sprintf "Expected \"%s\"" s) in

  assert (ws_wrap (tag "hello") "  hello  rest" = Ok ("hello", "rest"));
  assert (ws_wrap (tag "hello") "hello" = Ok ("hello", ""));

  (* line_comment stops AT the newline, so the "\n" stays in the remainder. *)
  assert (line_comment "# comment\ncode" = Ok ((), "\ncode"));

  print_endline "✓ All tests passed"

📊 Detailed Comparison

Comparison: Example 163 โ€” Whitespace Parser

ws0

OCaml:

๐Ÿช Show OCaml equivalent
(* ws0: run many0 over whitespace characters, discard them, keep the remainder. *)
let ws0 : unit parser = fun input ->
match many0 (satisfy is_ws "whitespace") input with
| Ok (_, rest) -> Ok ((), rest)
| Error e -> Error e

Rust:

// ws0: returns a boxed parser that trims leading whitespace and always succeeds.
fn ws0<'a>() -> Parser<'a, ()> {
 Box::new(|input: &'a str| {
     let trimmed = input.trim_start();
     Ok(((), trimmed))
 })
}

ws_wrap

OCaml:

๐Ÿช Show OCaml equivalent
(* ws_wrap: ws0, then p, then ws0; each step's error is returned as is. *)
let ws_wrap (p : 'a parser) : 'a parser = fun input ->
match ws0 input with
| Ok ((), r1) ->
 (match p r1 with
  | Ok (v, r2) ->
    (match ws0 r2 with
     | Ok ((), r3) -> Ok (v, r3)
     | Error e -> Error e)
  | Error e -> Error e)
| Error e -> Error e

Rust:

// ws_wrap: trim leading whitespace, run the wrapped parser, trim what it leaves;
// `?` returns the wrapped parser's error unchanged.
fn ws_wrap<'a, T: 'a>(parser: Parser<'a, T>) -> Parser<'a, T> {
 Box::new(move |input: &'a str| {
     let trimmed = input.trim_start();
     let (value, rest) = parser(trimmed)?;
     let trimmed_rest = rest.trim_start();
     Ok((value, trimmed_rest))
 })
}