🦀 Functional Rust

498: Safe Unicode Truncation

Difficulty: 1 Level: Intermediate Truncate a UTF-8 string to a maximum length without splitting a multi-byte character in the middle.

The Problem This Solves

Rust strings are UTF-8. A character like `é` takes 2 bytes, a CJK ideograph takes 3, and an emoji takes 4. If you write `&s[..10]`, Rust will panic at runtime if byte 10 falls in the middle of a multi-byte sequence — there is no partial character. This matters everywhere you display strings in constrained spaces: UI labels, log prefixes, database column limits, API responses. You need a `truncate` that is both safe (never panics) and correct (never produces garbled output). There are actually two different meanings of "length" in play: byte length (what the database cares about) and character count (what the user sees). A good truncation library exposes both, plus an ellipsis variant for display use.

The Intuition

Walk the string using `char_indices()`, which yields `(byte_position, char)` pairs. Stop at the nth character — the byte position at that point is exactly the safe cut. For byte-limited truncation, walk backwards from the desired byte position until you land on a character boundary (`is_char_boundary`).

How It Works in Rust

Byte truncation — walk back to a boundary:
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a character boundary. Never panics.
fn truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        s
    } else {
        // Step back from the requested cut until it no longer lands inside a
        // multi-byte character. Index 0 is always a boundary, so this stops.
        let mut cut = max_bytes;
        while !(cut == 0 || s.is_char_boundary(cut)) {
            cut -= 1;
        }
        &s[..cut]
    }
}
`is_char_boundary` returns `true` at positions where a character starts (or at `s.len()`). Walking backwards always terminates because byte 0 is always a boundary. Character truncation — use `char_indices().nth()`:
/// Returns the prefix of `s` holding its first `max_chars` characters, or all
/// of `s` when it has no more than that.
fn truncate_chars(s: &str, max_chars: usize) -> &str {
    // `nth(max_chars)` lands on the first character to drop; its byte offset
    // is exactly where the kept prefix ends.
    if let Some((cut, _)) = s.char_indices().nth(max_chars) {
        &s[..cut]
    } else {
        s
    }
}
`nth(n)` returns the `(byte_offset, char)` of the nth character, or `None` if the string is shorter. The byte offset is the exact slice boundary we need. Ellipsis variant:
/// Shortens `s` for display.
///
/// Returns `s` unchanged when it has at most `max_chars` characters. Otherwise
/// keeps the first `max_chars - 1` characters and appends the ellipsis suffix.
/// A budget of zero yields an empty string, since not even the suffix fits.
fn truncate_with_ellipsis(s: &str, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }
    if s.chars().count() <= max_chars {
        return s.to_string();
    }
    // Keep `max_chars - 1` characters so the suffix takes the final slot.
    let keep = s
        .char_indices()
        .nth(max_chars - 1)
        .map_or(s.len(), |(offset, _)| offset);
    // NOTE(review): the suffix literal looks like a mis-encoded U+2026
    // (horizontal ellipsis); it is reproduced unchanged here. Confirm the
    // file's encoding.
    format!("{}โ€ฆ", &s[..keep])
}
Reserve one character position for `…` (U+2026, a single Unicode character). Recent Rust toolchains (1.91 and later; the method was nightly-only before that) also provide `str::floor_char_boundary(n)`, which does the walk-back for you in the standard library.

What This Unlocks

Key Differences

| Concept | OCaml | Rust |
|---|---|---|
| String encoding | Bytes (raw), UTF-8 by convention | UTF-8 guaranteed by type |
| Safe substring | `String.sub` (byte positions; no boundary check, so a mid-char cut silently splits the character) | `&s[..]` panics on bad boundary; use `is_char_boundary` |
| Char iteration with positions | `String.foldi` | `s.char_indices()` yields `(usize, char)` |
| Char count vs byte count | `String.length` = bytes | `.chars().count()` = chars, `.len()` = bytes |
// 498. Safe Unicode truncation
/// Returns the longest prefix of `s` that fits in `max_bytes` bytes without
/// cutting a multi-byte character in half.
///
/// The whole string is returned when it already fits. The result may be
/// shorter than `max_bytes` when the requested cut lands inside a character.
fn truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Newer toolchains offer `str::floor_char_boundary`; the search is spelled
    // out here so the function also builds on compilers without it.
    // Index 0 is always a boundary, so the fallback is never actually taken.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}

/// Returns the first `max_chars` characters of `s` as a borrowed slice.
///
/// When `s` has `max_chars` characters or fewer, the whole string is returned.
fn truncate_chars(s: &str, max_chars: usize) -> &str {
    // The character at index `max_chars` is the first one to drop, so its
    // byte offset marks the end of the kept prefix. No such character means
    // the string is already short enough.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(offset, _)| &s[..offset])
}

/// Shortens `s` for display, marking any cut with the ellipsis suffix.
///
/// * If `s` has at most `max_chars` characters it is returned unchanged.
/// * Otherwise the first `max_chars - 1` characters are kept and the suffix
///   is appended in the slot that was reserved for it.
/// * A budget of zero yields an empty string: not even the suffix fits.
///
/// The string is walked once; cuts always fall on character boundaries, so
/// this never panics.
fn truncate_with_ellipsis(s: &str, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    let mut positions = s.char_indices();
    // Byte offset of the character sitting in the last allowed slot.
    let last_slot = match positions.nth(max_chars - 1) {
        Some((offset, _)) => offset,
        // Fewer than `max_chars` characters: nothing to cut.
        None => return s.to_string(),
    };
    if positions.next().is_none() {
        // Exactly `max_chars` characters: everything fits as-is.
        return s.to_string();
    }

    // NOTE(review): the suffix literal looks like a mis-encoded U+2026
    // (horizontal ellipsis). It is kept byte-for-byte because the tests in
    // this file compare against the same literal. Confirm the file's encoding.
    format!("{}โ€ฆ", &s[..last_slot])
}

/// Demo driver: prints each sample string with its byte and character counts,
/// the three truncation results, and a few `is_char_boundary` probes.
fn main() {
    let texts = ["Hello, World!", "cafรฉ au lait", "Hello ๐ŸŒ๐ŸŒŽ๐ŸŒ World", "็Ÿญใ„"];

    for s in texts.iter() {
        println!("'{}' bytes={} chars={}", s, s.len(), s.chars().count());
        println!(
            "  trunc_bytes(10)='{}' trunc_chars(5)='{}' ellipsis(8)='{}'",
            truncate_bytes(s, 10),
            truncate_chars(s, 5),
            truncate_with_ellipsis(s, 8)
        );
    }

    // Probe the byte indices around the first non-ASCII character, which
    // starts at byte 3 (after the three ASCII bytes "caf"):
    //   index 3: start of that character, so it is a boundary;
    //   index 4: inside its multi-byte encoding, so it is not a boundary;
    //   index 5: a boundary only if that character is exactly two bytes long.
    let s = "cafรฉ";
    for idx in 3..=5 {
        println!("is_char_boundary({}): {}", idx, s.is_char_boundary(idx));
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Byte budget: a plain ASCII cut, and a cut before a non-ASCII character.
    #[test]
    fn test_truncate_bytes() {
        let ascii = truncate_bytes("hello", 3);
        assert_eq!(ascii, "hel");

        let accented = truncate_bytes("cafรฉ", 3);
        assert_eq!(accented, "caf");
    }

    /// Character budget: a real cut, and a budget larger than the input.
    #[test]
    fn test_truncate_chars() {
        let cut = truncate_chars("cafรฉ", 3);
        assert_eq!(cut, "caf");

        let untouched = truncate_chars("hello", 10);
        assert_eq!(untouched, "hello");
    }

    /// Ellipsis variant: a shortened string gains the suffix, a short one does not.
    #[test]
    fn test_ellipsis() {
        let shortened = truncate_with_ellipsis("hello world", 8);
        assert_eq!(shortened, "hello wโ€ฆ");

        let untouched = truncate_with_ellipsis("hi", 10);
        assert_eq!(untouched, "hi");
    }

    /// Multi-byte input: the kept prefix has exactly the requested character count.
    #[test]
    fn test_emoji() {
        let s = "๐ŸŒ๐ŸŒŽ๐ŸŒ";
        let kept = truncate_chars(s, 2);
        assert_eq!(kept.chars().count(), 2);
    }
}
(* 498. Safe Unicode truncation – OCaml *)
(* [truncate_bytes s max_bytes] returns the longest prefix of [s] that is at
   most [max_bytes] bytes long and does not end inside a UTF-8 sequence.
   [s] is returned unchanged when it already fits; a negative [max_bytes]
   is treated as 0. *)
let truncate_bytes s max_bytes =
  let n = String.length s in
  if n <= max_bytes then s
  else begin
    (* A cut at index [i] is safe when the byte AT [i] starts a character,
       i.e. is not a continuation byte (0x80..0xBF). Testing the byte before
       the cut instead would split a sequence whose lead byte is the last one
       kept. Here [i < n] always holds, so [s.[i]] is in range. *)
    let is_continuation i = Char.code s.[i] land 0xC0 = 0x80 in
    let i = ref (max 0 max_bytes) in
    while !i > 0 && is_continuation !i do decr i done;
    String.sub s 0 !i
  end

(* [truncate_chars s max_chars] returns the prefix of [s] made of its first
   [max_chars] UTF-8 encoded characters, or all of [s] when it has fewer.
   Character lengths are read from lead bytes only; the input is not
   validated. *)
let truncate_chars s max_chars =
  let n = String.length s in
  (* Number of bytes in the sequence introduced by lead byte [b]. Any byte
     that is not a 1-, 2- or 3-byte lead is counted as a 4-byte lead. *)
  let seq_len b =
    if b land 0x80 = 0 then 1
    else if b land 0xE0 = 0xC0 then 2
    else if b land 0xF0 = 0xE0 then 3
    else 4
  in
  (* Hop from lead byte to lead byte until the budget or the string runs out. *)
  let rec advance pos remaining =
    if pos >= n || remaining <= 0 then pos
    else advance (pos + seq_len (Char.code s.[pos])) (remaining - 1)
  in
  (* A truncated final sequence would push the position past the end of the
     string; clamp so [String.sub] cannot raise. *)
  String.sub s 0 (min n (advance 0 max_chars))

(* Demo: print the sample's byte length, then one byte-limited and one
   character-limited truncation of it. *)
let () =
  let sample = "Hello, caf\xc3\xa9 world!" in
  let byte_len = String.length sample in
  let by_bytes = truncate_bytes sample 8 in
  let by_chars = truncate_chars sample 5 in
  Printf.printf "bytes: %d\n" byte_len;
  Printf.printf "trunc 8 bytes: %s\n" by_bytes;
  Printf.printf "trunc 5 chars: %s\n" by_chars