๐Ÿฆ€ Functional Rust

165: Keyword Parser

Difficulty: 3 Level: Advanced Parse reserved words with word boundary checking โ€” so `"if"` matches in `"if x"` but not in `"iffy"`.

The Problem This Solves

Every language has keywords: `if`, `else`, `fn`, `return`, `let`. They look just like identifiers โ€” but they're reserved. A naive approach parses `"if"` with `tag("if")`, which matches correctly in `"if x = 1"`. But it also matches the start of `"iffy"`, treating the variable `iffy` as a keyword followed by garbage. The fix is word boundary detection: after matching the keyword string, check that the next character is not a letter, digit, or underscore. If it is, the match is inside a longer identifier and should fail. Add token mapping (parse `"if"` โ†’ `Token::If`) and multi-keyword choice (try `"else if"` before `"else"`) and you have the complete keyword layer that sits between your scanner and your grammar.

The Intuition

A keyword parser = `tag(keyword_str)` + boundary check. The boundary check peeks at the next character: if it could continue an identifier, reject the match.
input: "if x > 0"
tag("if") โ†’ ok, remaining: " x > 0"
next char: ' ' โ€” not an ident char โ†’ keyword match โœ“

input: "iffy_name"
tag("if") โ†’ ok, remaining: "fy_name"  
next char: 'f' โ€” IS an ident char โ†’ reject โœ—

How It Works in Rust

#[derive(Clone, Debug, PartialEq)]
enum Token { If, Else, Let, Fn, Return }

fn keyword<'a>(word: &'static str, tok: Token) -> impl Fn(&'a str) -> ParseResult<Token> {
 move |input| {
     if !input.starts_with(word) {
         return Err(format!("expected '{}'", word));
     }
     let rest = &input[word.len()..];

     // Word boundary: next char must NOT be an identifier character
     if rest.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
         return Err(format!("'{}' is a prefix, not a keyword", word));
     }

     Ok((tok.clone(), rest))
 }
}

// Try multiple keywords, longest first to avoid prefix ambiguity
fn any_keyword(input: &str) -> ParseResult<Token> {
 // Sort by length descending: try "else if" before "else"
 let mut candidates = vec![
     ("if",     Token::If),
     ("else",   Token::Else),
     ("let",    Token::Let),
     ("fn",     Token::Fn),
     ("return", Token::Return),
 ];
 candidates.sort_by(|a, b| b.0.len().cmp(&a.0.len()));

 for (word, tok) in candidates {
     if let Ok(result) = keyword(word, tok)(input) {
         return Ok(result);
     }
 }
 Err("no keyword matched".to_string())
}

What This Unlocks

Key Differences

ConceptOCamlRust
Token type`type token = If \Then \Else``enum Token { If, Then, Else }`
Clone for reuseAutomatic (structural equality)`#[derive(Clone)]` required
Boundary check`String.get rest 0``rest.chars().next()`
Longest-first sort`List.sort (fun a b -> compare (len b) (len a))``Vec::sort_by(&#124;a, b&#124; b.0.len().cmp(&a.0.len()))`
// Example 165: Keyword Parser
// Keywords with word boundary checking

type ParseResult<'a, T> = Result<(T, &'a str), String>;
type Parser<'a, T> = Box<dyn Fn(&'a str) -> ParseResult<'a, T> + 'a>;

fn is_ident_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
}

// ============================================================
// Approach 1: keyword โ€” match string + word boundary
// ============================================================

fn keyword<'a>(kw: &str) -> Parser<'a, &'a str> {
    let kw_owned = kw.to_string();
    Box::new(move |input: &'a str| {
        if !input.starts_with(&kw_owned) {
            return Err(format!("Expected \"{}\"", kw_owned));
        }
        let rest = &input[kw_owned.len()..];
        match rest.chars().next() {
            Some(c) if is_ident_char(c) => {
                Err(format!("\"{}\" not a complete keyword (followed by '{}')", kw_owned, c))
            }
            _ => Ok((&input[..kw_owned.len()], rest)),
        }
    })
}

// ============================================================
// Approach 2: keyword mapped to a token type
// ============================================================

#[derive(Debug, Clone, PartialEq)]
enum Token {
    If, Then, Else, Let, In, Fn,
}

fn keyword_token<'a>(kw: &str, tok: Token) -> Parser<'a, Token> {
    let kw_owned = kw.to_string();
    Box::new(move |input: &'a str| {
        if !input.starts_with(&kw_owned) {
            return Err(format!("Expected \"{}\"", kw_owned));
        }
        let rest = &input[kw_owned.len()..];
        match rest.chars().next() {
            Some(c) if is_ident_char(c) => {
                Err(format!("\"{}\" not a complete keyword", kw_owned))
            }
            _ => Ok((tok.clone(), rest)),
        }
    })
}

// ============================================================
// Approach 3: any_keyword โ€” try multiple, longest first
// ============================================================

fn any_keyword<'a>(keywords: Vec<(&str, Token)>) -> Parser<'a, Token> {
    let mut kws: Vec<(String, Token)> = keywords
        .into_iter()
        .map(|(s, t)| (s.to_string(), t))
        .collect();
    // Sort longest first to avoid prefix ambiguity
    kws.sort_by(|a, b| b.0.len().cmp(&a.0.len()));

    Box::new(move |input: &'a str| {
        for (kw, tok) in &kws {
            if input.starts_with(kw.as_str()) {
                let rest = &input[kw.len()..];
                match rest.chars().next() {
                    Some(c) if is_ident_char(c) => continue,
                    _ => return Ok((tok.clone(), rest)),
                }
            }
        }
        Err("Expected keyword".to_string())
    })
}

fn main() {
    println!("=== keyword ===");
    let p = keyword("if");
    println!("{:?}", p("if x"));   // Ok(("if", " x"))
    println!("{:?}", p("iffy"));   // Err
    println!("{:?}", p("if("));    // Ok(("if", "("))

    println!("\n=== keyword_token ===");
    let p = keyword_token("let", Token::Let);
    println!("{:?}", p("let x")); // Ok((Let, " x"))

    println!("\n=== any_keyword ===");
    let p = any_keyword(vec![
        ("if", Token::If), ("then", Token::Then), ("else", Token::Else),
        ("let", Token::Let), ("in", Token::In), ("fn", Token::Fn),
    ]);
    println!("{:?}", p("let x"));  // Ok((Let, " x"))
    println!("{:?}", p("fn x"));   // Ok((Fn, " x"))
    println!("{:?}", p("hello"));  // Err

    println!("\nโœ“ All examples completed");
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_keyword_match() {
        assert_eq!(keyword("if")("if x"), Ok(("if", " x")));
    }

    #[test]
    fn test_keyword_boundary() {
        assert!(keyword("if")("iffy").is_err());
    }

    #[test]
    fn test_keyword_paren() {
        assert_eq!(keyword("if")("if("), Ok(("if", "(")));
    }

    #[test]
    fn test_keyword_eof() {
        assert_eq!(keyword("if")("if"), Ok(("if", "")));
    }

    #[test]
    fn test_keyword_token() {
        assert_eq!(keyword_token("let", Token::Let)("let x"), Ok((Token::Let, " x")));
    }

    #[test]
    fn test_any_keyword() {
        let p = any_keyword(vec![
            ("if", Token::If), ("in", Token::In), ("let", Token::Let),
        ]);
        assert_eq!(p("let x"), Ok((Token::Let, " x")));
        assert_eq!(p("in "), Ok((Token::In, " ")));
    }

    #[test]
    fn test_any_keyword_none() {
        let p = any_keyword(vec![("if", Token::If)]);
        assert!(p("hello").is_err());
    }

    #[test]
    fn test_keyword_no_match() {
        assert!(keyword("else")("elseif").is_err());
    }
}
(* Example 165: Keyword Parser *)
(* Keywords with word boundary checking *)

type 'a parse_result = ('a * string, string) result
type 'a parser = string -> 'a parse_result

let tag expected : string parser = fun input ->
  let len = String.length expected in
  if String.length input >= len && String.sub input 0 len = expected then
    Ok (expected, String.sub input len (String.length input - len))
  else Error (Printf.sprintf "Expected \"%s\"" expected)

let is_ident_char c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
  (c >= '0' && c <= '9') || c = '_'

(* Approach 1: keyword โ€” match string + check word boundary *)
let keyword (kw : string) : string parser = fun input ->
  match tag kw input with
  | Error e -> Error e
  | Ok (matched, rest) ->
    if String.length rest > 0 && is_ident_char rest.[0] then
      Error (Printf.sprintf "\"%s\" is not a complete keyword (followed by '%c')" kw rest.[0])
    else
      Ok (matched, rest)

(* Approach 2: keyword mapping to enum-like value *)
type token = If | Then | Else | Let | In | Fun

let keyword_token (kw : string) (tok : token) : token parser = fun input ->
  match keyword kw input with
  | Ok (_, rest) -> Ok (tok, rest)
  | Error e -> Error e

(* Approach 3: any_keyword โ€” try multiple keywords *)
let any_keyword (keywords : (string * 'a) list) : 'a parser = fun input ->
  let rec try_kws = function
    | [] -> Error "Expected keyword"
    | (kw, tok) :: rest ->
      match keyword kw input with
      | Ok (_, rem) -> Ok (tok, rem)
      | Error _ -> try_kws rest
  in
  (* Sort longest first to avoid prefix issues *)
  let sorted = List.sort (fun (a, _) (b, _) ->
    compare (String.length b) (String.length a)) keywords in
  try_kws sorted

(* Tests *)
let () =
  assert (keyword "if" "if x" = Ok ("if", " x"));
  assert (Result.is_error (keyword "if" "iffy"));
  assert (keyword "if" "if(" = Ok ("if", "("));
  assert (keyword "else" "else{" = Ok ("else", "{"));

  assert (keyword_token "if" If "if x" = Ok (If, " x"));

  let kw_parser = any_keyword [
    ("if", If); ("then", Then); ("else", Else);
    ("let", Let); ("in", In); ("fun", Fun)] in
  assert (kw_parser "let x" = Ok (Let, " x"));
  assert (kw_parser "fun x" = Ok (Fun, " x"));
  assert (Result.is_error (kw_parser "hello"));

  print_endline "โœ“ All tests passed"

๐Ÿ“Š Detailed Comparison

Comparison: Example 165 โ€” Keyword Parser

keyword with boundary

OCaml:

๐Ÿช Show OCaml equivalent
let keyword (kw : string) : string parser = fun input ->
match tag kw input with
| Error e -> Error e
| Ok (matched, rest) ->
 if String.length rest > 0 && is_ident_char rest.[0] then
   Error (Printf.sprintf "\"%s\" is not a complete keyword" kw)
 else Ok (matched, rest)

Rust:

fn keyword<'a>(kw: &str) -> Parser<'a, &'a str> {
 let kw_owned = kw.to_string();
 Box::new(move |input: &'a str| {
     if !input.starts_with(&kw_owned) {
         return Err(format!("Expected \"{}\"", kw_owned));
     }
     let rest = &input[kw_owned.len()..];
     match rest.chars().next() {
         Some(c) if is_ident_char(c) => Err(format!("not a complete keyword")),
         _ => Ok((&input[..kw_owned.len()], rest)),
     }
 })
}

Token enum

OCaml:

๐Ÿช Show OCaml equivalent
type token = If | Then | Else | Let | In | Fun

Rust:

#[derive(Debug, Clone, PartialEq)]
enum Token { If, Then, Else, Let, In, Fn }