822 Fundamental

Burrows-Wheeler Transform

Functional Programming

Tutorial Video

Text description (accessibility)

This video demonstrates the "Burrows-Wheeler Transform" functional Rust example. Difficulty level: Fundamental. Key concepts covered: Functional Programming. Data compression algorithms like bzip2 achieve high compression ratios by first applying the Burrows-Wheeler Transform (BWT) to rearrange text so that similar characters cluster together, making run-length encoding and move-to-front coding highly effective. The key differences from OCaml are summarized in the "Key Differences" table (Aspect / Rust / OCaml) further down the page.

Tutorial

The Problem

Data compression algorithms like bzip2 achieve high compression ratios by first applying the Burrows-Wheeler Transform (BWT) to rearrange text so that similar characters cluster together, making run-length encoding and move-to-front coding highly effective. The BWT is a reversible transformation: it takes a string, generates all rotations, sorts them lexicographically, and returns the last column. The magic is that similar contexts cluster the last-column characters. The inverse BWT recovers the original string exactly. BWT is also used in bioinformatics for FM-index construction, enabling compressed full-text indexes that power genome alignment tools like BWA and Bowtie.

🎯 Learning Outcomes

• Understand BWT as the last column of the sorted rotation matrix

• Implement forward BWT: generate all rotations, sort, take last characters, record original row index

• Implement inverse BWT using the first/last column correspondence and the rank property

• Recognize why BWT aids compression: characters with similar contexts appear together

• Connect BWT to the FM-index and suffix array for full-text search

Code Example

#![allow(clippy::all)]
//! Placeholder

(* Burrows-Wheeler Transform in OCaml *)

(* Lexicographic order of two cyclic rotations of [s].
   [n] is the number of characters to compare (callers pass the length of [s]);
   [i] and [j] are the start offsets of the two rotations.
   Returns -1, 0 or 1. No rotation is ever materialized: characters are read
   through modular indexing. *)
let compare_rotations (s : string) (n : int) (i : int) (j : int) : int =
  let rec walk offset =
    if offset = n then 0
    else
      match Char.compare s.[(i + offset) mod n] s.[(j + offset) mod n] with
      | 0 -> walk (offset + 1)
      | diff -> if diff < 0 then -1 else 1
  in
  walk 0

(* Forward BWT.
   Appends the sentinel '$' to [input], sorts the start offsets of all cyclic
   rotations with [compare_rotations], and returns the pair
   (last column of the sorted rotations, row holding the unrotated text). *)
let bwt (input : string) : string * int =
  let text = input ^ "$" in
  let len = String.length text in
  (* order.(row) = start offset of the rotation that sorts into [row] *)
  let order = Array.init len (fun k -> k) in
  Array.sort (compare_rotations text len) order;
  (* The last character of a rotation is the one just before its start,
     wrapping around to the end of the text for the rotation at offset 0. *)
  let last_column =
    String.init len (fun row ->
      match order.(row) with
      | 0 -> text.[len - 1]
      | start -> text.[start - 1])
  in
  (* [order] is a permutation of 0 .. len-1, so exactly one row has offset 0 *)
  let rec locate row =
    if row >= len then 0
    else if order.(row) = 0 then row
    else locate (row + 1)
  in
  (last_column, locate 0)

(* Inverse BWT using the LF-mapping.

   [bwt_str] is the last column L produced by [bwt] (it contains the '$'
   sentinel that [bwt] appended) and [original_row] is the row of the
   unrotated text. Returns the original input with the sentinel removed.

   Raises [Invalid_argument] when [original_row] is not a valid row index of
   [bwt_str]; this includes the empty [bwt_str], which has no rows.

   LF-mapping: the k-th occurrence of a character in L is the same text
   position as the k-th occurrence of that character in the sorted first
   column F, so
     lf row = (first position of L[row] in F) + (occurrences of L[row] in L
                                                 before row). *)
let ibwt (bwt_str : string) (original_row : int) : string =
  let n = String.length bwt_str in
  if original_row < 0 || original_row >= n then
    invalid_arg "ibwt: original_row out of bounds"
  else begin
    (* counts.(c) = occurrences of byte c in L;
       rank.(i)   = occurrences of L[i] strictly before position i *)
    let counts = Array.make 256 0 in
    let rank = Array.make n 0 in
    String.iteri
      (fun i c ->
        let code = Char.code c in
        rank.(i) <- counts.(code);
        counts.(code) <- counts.(code) + 1)
      bwt_str;
    (* first_occ.(c) = index of the first c in F, i.e. the number of
       characters in L that are smaller than c (prefix sums of counts) *)
    let first_occ = Array.make 256 0 in
    let smaller = ref 0 in
    Array.iteri
      (fun code cnt ->
        first_occ.(code) <- !smaller;
        smaller := !smaller + cnt)
      counts;
    let lf row = first_occ.(Char.code bwt_str.[row]) + rank.(row) in
    (* L[original_row] is the sentinel that ends the text. Skip it, then each
       further LF step yields the preceding character of the text, so the
       n - 1 real characters are written from the back to the front.
       Stopping one step earlier would drop the first character. *)
    let out = Bytes.create (n - 1) in
    let row = ref (lf original_row) in
    for k = n - 2 downto 0 do
      Bytes.set out k bwt_str.[!row];
      row := lf !row
    done;
    Bytes.to_string out
  end

(* Round-trip demo: transform each sample with [bwt], invert it with [ibwt],
   and print the transform, the row index, the recovered text and whether the
   recovered text equals the input. *)
let () =
  let report input =
    let transformed, row = bwt input in
    let recovered = ibwt transformed row in
    Printf.printf "BWT(%S) = %S (row=%d), inverse = %S, ok=%b\n"
      input transformed row recovered (String.equal recovered input)
  in
  List.iter report ["banana"; "abracadabra"; "mississippi"; "hello"]

Key Differences

Aspect	Rust	OCaml
Rotation comparison	`chain` iterator, zero allocation	`String.sub` concat or simulated
BWT string	`String` collected from iterator	`String.init` with `Array.get`
Inverse LF-mapping	Rank array via counting	Array sort + index correspondence
Memory (sort)	O(n) indices + O(n log n) sort	Same approach
Rotation materialization	Avoided via index sort	Avoided similarly
Compression use	Foundation for bzip2-like	Same theoretical role

OCaml Approach

OCaml implements BWT with Array.init n (fun i -> i) for rotation indices and Array.sort with a comparator that simulates rotation through modular indexing. s.[(i + n - 1) mod n] reads the last character of the rotation that starts at i, and String.init assembles those characters into the BWT string. The inverse BWT derives, for each character, its first position in the sorted first column and pairs it with a rank table over the last column to form the LF-mapping. The reconstruction walk is written as an imperative for loop rather than a fold. Materializing each rotation (for example with String.sub and String.concat "" [s1; s2]) would make the comparison simpler to read, but it allocates O(n) per comparison.

Full Source

#![allow(clippy::all)]
//! Placeholder

(* Burrows-Wheeler Transform in OCaml *)

(* Lexicographic order of two cyclic rotations of [s].
   [n] is the number of characters to compare (callers pass the length of [s]);
   [i] and [j] are the start offsets of the two rotations.
   Returns -1, 0 or 1. No rotation is ever materialized: characters are read
   through modular indexing. *)
let compare_rotations (s : string) (n : int) (i : int) (j : int) : int =
  let rec walk offset =
    if offset = n then 0
    else
      match Char.compare s.[(i + offset) mod n] s.[(j + offset) mod n] with
      | 0 -> walk (offset + 1)
      | diff -> if diff < 0 then -1 else 1
  in
  walk 0

(* Forward BWT.
   Appends the sentinel '$' to [input], sorts the start offsets of all cyclic
   rotations with [compare_rotations], and returns the pair
   (last column of the sorted rotations, row holding the unrotated text). *)
let bwt (input : string) : string * int =
  let text = input ^ "$" in
  let len = String.length text in
  (* order.(row) = start offset of the rotation that sorts into [row] *)
  let order = Array.init len (fun k -> k) in
  Array.sort (compare_rotations text len) order;
  (* The last character of a rotation is the one just before its start,
     wrapping around to the end of the text for the rotation at offset 0. *)
  let last_column =
    String.init len (fun row ->
      match order.(row) with
      | 0 -> text.[len - 1]
      | start -> text.[start - 1])
  in
  (* [order] is a permutation of 0 .. len-1, so exactly one row has offset 0 *)
  let rec locate row =
    if row >= len then 0
    else if order.(row) = 0 then row
    else locate (row + 1)
  in
  (last_column, locate 0)

(* Inverse BWT using the LF-mapping.

   [bwt_str] is the last column L produced by [bwt] (it contains the '$'
   sentinel that [bwt] appended) and [original_row] is the row of the
   unrotated text. Returns the original input with the sentinel removed.

   Raises [Invalid_argument] when [original_row] is not a valid row index of
   [bwt_str]; this includes the empty [bwt_str], which has no rows.

   LF-mapping: the k-th occurrence of a character in L is the same text
   position as the k-th occurrence of that character in the sorted first
   column F, so
     lf row = (first position of L[row] in F) + (occurrences of L[row] in L
                                                 before row). *)
let ibwt (bwt_str : string) (original_row : int) : string =
  let n = String.length bwt_str in
  if original_row < 0 || original_row >= n then
    invalid_arg "ibwt: original_row out of bounds"
  else begin
    (* counts.(c) = occurrences of byte c in L;
       rank.(i)   = occurrences of L[i] strictly before position i *)
    let counts = Array.make 256 0 in
    let rank = Array.make n 0 in
    String.iteri
      (fun i c ->
        let code = Char.code c in
        rank.(i) <- counts.(code);
        counts.(code) <- counts.(code) + 1)
      bwt_str;
    (* first_occ.(c) = index of the first c in F, i.e. the number of
       characters in L that are smaller than c (prefix sums of counts) *)
    let first_occ = Array.make 256 0 in
    let smaller = ref 0 in
    Array.iteri
      (fun code cnt ->
        first_occ.(code) <- !smaller;
        smaller := !smaller + cnt)
      counts;
    let lf row = first_occ.(Char.code bwt_str.[row]) + rank.(row) in
    (* L[original_row] is the sentinel that ends the text. Skip it, then each
       further LF step yields the preceding character of the text, so the
       n - 1 real characters are written from the back to the front.
       Stopping one step earlier would drop the first character. *)
    let out = Bytes.create (n - 1) in
    let row = ref (lf original_row) in
    for k = n - 2 downto 0 do
      Bytes.set out k bwt_str.[!row];
      row := lf !row
    done;
    Bytes.to_string out
  end

(* Round-trip demo: transform each sample with [bwt], invert it with [ibwt],
   and print the transform, the row index, the recovered text and whether the
   recovered text equals the input. *)
let () =
  let report input =
    let transformed, row = bwt input in
    let recovered = ibwt transformed row in
    Printf.printf "BWT(%S) = %S (row=%d), inverse = %S, ok=%b\n"
      input transformed row recovered (String.equal recovered input)
  in
  List.iter report ["banana"; "abracadabra"; "mississippi"; "hello"]

Deep Comparison

OCaml vs Rust: Burrows Wheeler

Overview

See the example.rs and example.ml files for detailed implementations.

Key Differences

Aspect	OCaml	Rust
Type system	Hindley-Milner	Ownership + traits
Memory	GC	Zero-cost abstractions
Mutability	Explicit ref	mut keyword
Error handling	Option/Result	Result<T, E>

See README.md for detailed comparison.

Exercises

Implement the inverse BWT to recover the original string from (bwt, original_row).

Integrate BWT with move-to-front encoding and run-length encoding to build a simple compressor.

Measure compression ratio on English text vs. random bytes to demonstrate BWT's effectiveness.

Build a simplified FM-index using the BWT and a rank/select data structure for O(m) pattern search.

Compare BWT-based compression ratios with deflate (gzip) on natural language text.

Open Source Repos

functional-rust

View the source for this example on GitHub — OCaml and Rust side by side in the repo.

Rust