498 Fundamental

String Truncation

Functional Programming

Tutorial

The Problem

Naively truncating &s[..N] panics if N falls in the middle of a multi-byte character (e.g., slicing "café" at byte 4 lands inside é). Database column limits, UI text truncation, and API field limits are measured in bytes or characters — not always the same. A correct truncation implementation must: (1) find the nearest valid char boundary for byte-limited truncation, and (2) find the byte position of the Nth character for character-limited truncation.

🎯 Learning Outcomes

• Truncate to a byte limit safely by walking back to the nearest is_char_boundary

• Truncate to a character limit using char_indices().nth(max_chars) for the byte position

• Add an ellipsis … (U+2026, 3 bytes in UTF-8) when truncating display strings

• Handle the edge case where the string is shorter than the limit

• Apply saturating_sub to avoid underflow when reserving space for the ellipsis

Code Example

#![allow(clippy::all)]
// 498. Safe Unicode truncation
/// Returns the longest prefix of `s` that occupies at most `max_bytes` bytes
/// and ends on a UTF-8 character boundary.
///
/// The result borrows from `s`; nothing is allocated. If `s` already fits it
/// is returned unchanged. If the limit falls inside a multi-byte character,
/// the cut moves back to the start of that character.
fn truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Scan downwards from the limit for the first index that starts a
    // character. Index 0 is always a boundary, so the search cannot fail.
    // (`str::floor_char_boundary` performs the same search on toolchains that
    // ship it; the manual scan keeps the example portable.)
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}

/// Returns the prefix of `s` containing at most `max_chars` characters
/// (Unicode scalar values).
///
/// The result borrows from `s`. When `s` has `max_chars` characters or fewer
/// it is returned unchanged.
fn truncate_chars(s: &str, max_chars: usize) -> &str {
    // The byte offset of character number `max_chars` (zero-based) is exactly
    // where the prefix must end; if no such character exists, `s` already fits.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(cut, _)| &s[..cut])
}

/// Shortens `s` to at most `max_chars` characters for display, marking the cut
/// with a trailing ellipsis (`…`, U+2026).
///
/// * If `s` has `max_chars` characters or fewer, it is returned unchanged.
/// * Otherwise the result is the first `max_chars - 1` characters of `s`
///   followed by `…`, so the ellipsis counts as one character of the budget.
/// * If `max_chars` is 0 the result is the empty string: there is no room for
///   the ellipsis, and returning it anyway would exceed the limit.
///
/// Lengths are counted in `char`s (Unicode scalar values), not bytes.
fn truncate_with_ellipsis(s: &str, max_chars: usize) -> String {
    // No character at index `max_chars` means the whole string fits. This
    // stops scanning as soon as the answer is known instead of counting every
    // character of a long input.
    if s.char_indices().nth(max_chars).is_none() {
        return s.to_string();
    }
    if max_chars == 0 {
        return String::new();
    }
    // Reserve one character position for the ellipsis.
    let keep = max_chars - 1;
    let cut = s.char_indices().nth(keep).map_or(s.len(), |(idx, _)| idx);
    let mut out = String::with_capacity(cut + '…'.len_utf8());
    out.push_str(&s[..cut]);
    out.push('…');
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Byte-limited truncation, including limits that land inside a character.
    #[test]
    fn test_truncate_bytes() {
        assert_eq!(truncate_bytes("hello", 3), "hel");
        assert_eq!(truncate_bytes("café", 3), "caf");
        // Byte 4 is inside the two-byte `é`, so the cut must walk back to 3.
        assert_eq!(truncate_bytes("café", 4), "caf");
        // A limit equal to the length returns the input untouched.
        assert_eq!(truncate_bytes("café", 5), "café");
        // The only boundary at or below the limit is 0: empty result, no panic.
        assert_eq!(truncate_bytes("🌍", 3), "");
    }

    /// Character-limited truncation, including the zero and over-long limits.
    #[test]
    fn test_truncate_chars() {
        assert_eq!(truncate_chars("café", 3), "caf");
        assert_eq!(truncate_chars("hello", 10), "hello");
        assert_eq!(truncate_chars("hello", 0), "");
        assert_eq!(truncate_chars("", 3), "");
    }

    /// The ellipsis takes one character of the budget and is only added on a cut.
    #[test]
    fn test_ellipsis() {
        assert_eq!(truncate_with_ellipsis("hello world", 8), "hello w…");
        assert_eq!(truncate_with_ellipsis("hi", 10), "hi");
        // Exactly at the limit: unchanged, no ellipsis.
        assert_eq!(truncate_with_ellipsis("hello", 5), "hello");
        // Multi-byte text is counted in characters, not bytes.
        assert_eq!(truncate_with_ellipsis("日本語テキスト", 4), "日本語…");
    }

    /// Four-byte characters are kept whole.
    #[test]
    fn test_emoji() {
        let s = "🌍🌎🌏";
        assert_eq!(truncate_chars(s, 2), "🌍🌎");
        assert_eq!(truncate_chars(s, 2).chars().count(), 2);
    }
}

(* 498. Safe Unicode truncation – OCaml *)
(** [truncate_bytes s max_bytes] is the longest prefix of [s] that is at most
    [max_bytes] bytes long and does not end inside a UTF-8 encoded character.
    [s] itself is returned when it already fits; a negative [max_bytes] yields
    the empty string. [s] is assumed to hold well-formed UTF-8. *)
let truncate_bytes s max_bytes =
  let n = String.length s in
  if n <= max_bytes then s
  else begin
    (* Here max_bytes < n, so index !i is always in range below. *)
    let i = ref (max 0 max_bytes) in
    (* A cut at offset i is valid only if byte i STARTS a character, i.e. is
       not a continuation byte (0b10xxxxxx). Testing byte i-1 instead would
       accept a cut right after a lead byte and split the character. *)
    while !i > 0 && Char.code s.[!i] land 0xC0 = 0x80 do decr i done;
    String.sub s 0 !i
  end

(** [truncate_chars s max_chars] is the prefix of [s] holding at most
    [max_chars] UTF-8 encoded characters. Character lengths are read from each
    lead byte, so [s] is expected to be well-formed UTF-8; if the final
    sequence is cut short the prefix is clamped to the end of [s] rather than
    raising. A [max_chars] of zero or less yields the empty string. *)
let truncate_chars s max_chars =
  let n = String.length s in
  (* Encoded length announced by a lead byte. *)
  let utf8_len b =
    if b land 0x80 = 0 then 1
    else if b land 0xE0 = 0xC0 then 2
    else if b land 0xF0 = 0xE0 then 3
    else 4
  in
  (* Advance one character at a time until the budget or the string runs out. *)
  let rec scan i count =
    if i >= n || count >= max_chars then i
    else scan (i + utf8_len (Char.code s.[i])) (count + 1)
  in
  (* A truncated trailing sequence can push the offset past the end. *)
  let stop = min n (scan 0 0) in
  String.sub s 0 stop

(* Demo: print the sample's byte length, then an 8-byte and a 5-character
   truncation of it. *)
let () =
  let sample = "Hello, caf\xc3\xa9 world!" in
  String.length sample |> Printf.printf "bytes: %d\n";
  truncate_bytes sample 8 |> Printf.printf "trunc 8 bytes: %s\n";
  truncate_chars sample 5 |> Printf.printf "trunc 5 chars: %s\n"

Key Differences

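**is_char_boundary**: Rust provides this as a standard str method; OCaml has no direct equivalent, so a boundary is found by testing bytes by hand (a UTF-8 continuation byte matches 0b10xxxxxx) or by decoding with a library such as Uutf.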

Zero-copy truncation: Rust's truncate_bytes and truncate_chars both return a &str pointing into the original — no allocation; OCaml's String.sub always allocates.

**char_indices().nth(n)**: Rust's O(N) character indexing via char_indices is explicit about its cost; OCaml's fold is equally O(N) but less idiomatically readable.

**Ellipsis as char**: '…' is 3 UTF-8 bytes; using saturating_sub(1) reserves one character position, not one byte — correct in Rust's char-counting truncate_chars.

OCaml Approach

(** [truncate_bytes s max_bytes] is the longest prefix of [s] that is at most
    [max_bytes] bytes long and does not end inside a UTF-8 encoded character.
    [s] itself is returned when it already fits; a negative [max_bytes] yields
    the empty string. [s] is assumed to hold well-formed UTF-8. *)
let truncate_bytes s max_bytes =
  if String.length s <= max_bytes then s
  else begin
    (* Uutf offers folding decoders but no boundary predicate, so test the
       byte directly: offset i is a boundary unless byte i is a continuation
       byte (0b10xxxxxx). max_bytes < length here, so s.[!i] is in range. *)
    let i = ref (max 0 max_bytes) in
    while !i > 0 && Char.code s.[!i] land 0xC0 = 0x80 do decr i done;
    String.sub s 0 !i
  end

(** [truncate_chars s max_chars] is the prefix of [s] holding at most
    [max_chars] decoded characters, as enumerated by [Uutf.String.fold_utf_8]
    (each malformed sequence it reports counts as one character). [s] itself
    is returned when it has [max_chars] characters or fewer. *)
let truncate_chars s max_chars =
  (* Accumulator: characters seen so far, and the byte offset of the first
     character that no longer fits (once found, it is never overwritten). *)
  let step (count, cut) pos _ =
    match cut with
    | None when count >= max_chars -> (count, Some pos)
    | _ -> (count + 1, cut)
  in
  match Uutf.String.fold_utf_8 step (0, None) s with
  | _, None -> s
  | _, Some pos -> String.sub s 0 pos

OCaml has no standard is_char_boundary — correct Unicode truncation means testing for UTF-8 continuation bytes by hand or decoding with a library such as Uutf. String.sub allocates a new string.

Full Source

#![allow(clippy::all)]
// 498. Safe Unicode truncation
/// Returns the longest prefix of `s` that occupies at most `max_bytes` bytes
/// and ends on a UTF-8 character boundary.
///
/// The result borrows from `s`; nothing is allocated. If `s` already fits it
/// is returned unchanged. If the limit falls inside a multi-byte character,
/// the cut moves back to the start of that character.
fn truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Scan downwards from the limit for the first index that starts a
    // character. Index 0 is always a boundary, so the search cannot fail.
    // (`str::floor_char_boundary` performs the same search on toolchains that
    // ship it; the manual scan keeps the example portable.)
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}

/// Returns the prefix of `s` containing at most `max_chars` characters
/// (Unicode scalar values).
///
/// The result borrows from `s`. When `s` has `max_chars` characters or fewer
/// it is returned unchanged.
fn truncate_chars(s: &str, max_chars: usize) -> &str {
    // The byte offset of character number `max_chars` (zero-based) is exactly
    // where the prefix must end; if no such character exists, `s` already fits.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(cut, _)| &s[..cut])
}

/// Shortens `s` to at most `max_chars` characters for display, marking the cut
/// with a trailing ellipsis (`…`, U+2026).
///
/// * If `s` has `max_chars` characters or fewer, it is returned unchanged.
/// * Otherwise the result is the first `max_chars - 1` characters of `s`
///   followed by `…`, so the ellipsis counts as one character of the budget.
/// * If `max_chars` is 0 the result is the empty string: there is no room for
///   the ellipsis, and returning it anyway would exceed the limit.
///
/// Lengths are counted in `char`s (Unicode scalar values), not bytes.
fn truncate_with_ellipsis(s: &str, max_chars: usize) -> String {
    // No character at index `max_chars` means the whole string fits. This
    // stops scanning as soon as the answer is known instead of counting every
    // character of a long input.
    if s.char_indices().nth(max_chars).is_none() {
        return s.to_string();
    }
    if max_chars == 0 {
        return String::new();
    }
    // Reserve one character position for the ellipsis.
    let keep = max_chars - 1;
    let cut = s.char_indices().nth(keep).map_or(s.len(), |(idx, _)| idx);
    let mut out = String::with_capacity(cut + '…'.len_utf8());
    out.push_str(&s[..cut]);
    out.push('…');
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Byte-limited truncation, including limits that land inside a character.
    #[test]
    fn test_truncate_bytes() {
        assert_eq!(truncate_bytes("hello", 3), "hel");
        assert_eq!(truncate_bytes("café", 3), "caf");
        // Byte 4 is inside the two-byte `é`, so the cut must walk back to 3.
        assert_eq!(truncate_bytes("café", 4), "caf");
        // A limit equal to the length returns the input untouched.
        assert_eq!(truncate_bytes("café", 5), "café");
        // The only boundary at or below the limit is 0: empty result, no panic.
        assert_eq!(truncate_bytes("🌍", 3), "");
    }

    /// Character-limited truncation, including the zero and over-long limits.
    #[test]
    fn test_truncate_chars() {
        assert_eq!(truncate_chars("café", 3), "caf");
        assert_eq!(truncate_chars("hello", 10), "hello");
        assert_eq!(truncate_chars("hello", 0), "");
        assert_eq!(truncate_chars("", 3), "");
    }

    /// The ellipsis takes one character of the budget and is only added on a cut.
    #[test]
    fn test_ellipsis() {
        assert_eq!(truncate_with_ellipsis("hello world", 8), "hello w…");
        assert_eq!(truncate_with_ellipsis("hi", 10), "hi");
        // Exactly at the limit: unchanged, no ellipsis.
        assert_eq!(truncate_with_ellipsis("hello", 5), "hello");
        // Multi-byte text is counted in characters, not bytes.
        assert_eq!(truncate_with_ellipsis("日本語テキスト", 4), "日本語…");
    }

    /// Four-byte characters are kept whole.
    #[test]
    fn test_emoji() {
        let s = "🌍🌎🌏";
        assert_eq!(truncate_chars(s, 2), "🌍🌎");
        assert_eq!(truncate_chars(s, 2).chars().count(), 2);
    }
}

(* 498. Safe Unicode truncation – OCaml *)
(** [truncate_bytes s max_bytes] is the longest prefix of [s] that is at most
    [max_bytes] bytes long and does not end inside a UTF-8 encoded character.
    [s] itself is returned when it already fits; a negative [max_bytes] yields
    the empty string. [s] is assumed to hold well-formed UTF-8. *)
let truncate_bytes s max_bytes =
  let n = String.length s in
  if n <= max_bytes then s
  else begin
    (* Here max_bytes < n, so index !i is always in range below. *)
    let i = ref (max 0 max_bytes) in
    (* A cut at offset i is valid only if byte i STARTS a character, i.e. is
       not a continuation byte (0b10xxxxxx). Testing byte i-1 instead would
       accept a cut right after a lead byte and split the character. *)
    while !i > 0 && Char.code s.[!i] land 0xC0 = 0x80 do decr i done;
    String.sub s 0 !i
  end

(** [truncate_chars s max_chars] is the prefix of [s] holding at most
    [max_chars] UTF-8 encoded characters. Character lengths are read from each
    lead byte, so [s] is expected to be well-formed UTF-8; if the final
    sequence is cut short the prefix is clamped to the end of [s] rather than
    raising. A [max_chars] of zero or less yields the empty string. *)
let truncate_chars s max_chars =
  let n = String.length s in
  (* Encoded length announced by a lead byte. *)
  let utf8_len b =
    if b land 0x80 = 0 then 1
    else if b land 0xE0 = 0xC0 then 2
    else if b land 0xF0 = 0xE0 then 3
    else 4
  in
  (* Advance one character at a time until the budget or the string runs out. *)
  let rec scan i count =
    if i >= n || count >= max_chars then i
    else scan (i + utf8_len (Char.code s.[i])) (count + 1)
  in
  (* A truncated trailing sequence can push the offset past the end. *)
  let stop = min n (scan 0 0) in
  String.sub s 0 stop

(* Demo: print the sample's byte length, then an 8-byte and a 5-character
   truncation of it. *)
let () =
  let sample = "Hello, caf\xc3\xa9 world!" in
  String.length sample |> Printf.printf "bytes: %d\n";
  truncate_bytes sample 8 |> Printf.printf "trunc 8 bytes: %s\n";
  truncate_chars sample 5 |> Printf.printf "trunc 5 chars: %s\n"

✓ Tests Rust test suite

#[cfg(test)]
mod tests {
    use super::*;

    /// Byte-limited truncation, including limits that land inside a character.
    #[test]
    fn test_truncate_bytes() {
        assert_eq!(truncate_bytes("hello", 3), "hel");
        assert_eq!(truncate_bytes("café", 3), "caf");
        // Byte 4 is inside the two-byte `é`, so the cut must walk back to 3.
        assert_eq!(truncate_bytes("café", 4), "caf");
        // A limit equal to the length returns the input untouched.
        assert_eq!(truncate_bytes("café", 5), "café");
        // The only boundary at or below the limit is 0: empty result, no panic.
        assert_eq!(truncate_bytes("🌍", 3), "");
    }

    /// Character-limited truncation, including the zero and over-long limits.
    #[test]
    fn test_truncate_chars() {
        assert_eq!(truncate_chars("café", 3), "caf");
        assert_eq!(truncate_chars("hello", 10), "hello");
        assert_eq!(truncate_chars("hello", 0), "");
        assert_eq!(truncate_chars("", 3), "");
    }

    /// The ellipsis takes one character of the budget and is only added on a cut.
    #[test]
    fn test_ellipsis() {
        assert_eq!(truncate_with_ellipsis("hello world", 8), "hello w…");
        assert_eq!(truncate_with_ellipsis("hi", 10), "hi");
        // Exactly at the limit: unchanged, no ellipsis.
        assert_eq!(truncate_with_ellipsis("hello", 5), "hello");
        // Multi-byte text is counted in characters, not bytes.
        assert_eq!(truncate_with_ellipsis("日本語テキスト", 4), "日本語…");
    }

    /// Four-byte characters are kept whole.
    #[test]
    fn test_emoji() {
        let s = "🌍🌎🌏";
        assert_eq!(truncate_chars(s, 2), "🌍🌎");
        assert_eq!(truncate_chars(s, 2).chars().count(), 2);
    }
}

Exercises

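**floor_char_boundary**: str::floor_char_boundary(n) is available in the standard library on recent toolchains (it was stabilized in Rust 1.91 and was nightly-only before that); rewrite truncate_bytes to use it, and keep the manual loop as a #[cfg]-gated fallback for older Rust versions (cfg! alone cannot test the compiler version, so the flag has to come from a build script or a Cargo feature).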

Truncate by display width: Use the unicode-width crate to truncate based on terminal column width (CJK characters are 2 columns wide).

Sentence truncation: Write truncate_sentence(s: &str, max_chars: usize) -> String that truncates at the last sentence boundary (./!/?) before max_chars rather than mid-word.

Open Source Repos

functional-rust

View the source for this example on GitHub — OCaml and Rust side by side in the repo.

Rust