🦀 Functional Rust

764: Binary Serialization: Length-Prefixed Records

Difficulty: 3 · Level: Intermediate. Encode structured data as compact bytes — length-prefixed strings, big-endian integers, booleans — and decode them back exactly, with safe error handling for truncated input.

The Problem This Solves

Text formats like JSON and CSV are convenient but expensive. Parsing a JSON float requires scanning character by character and calling a string-to-float converter. A binary `f64` is 8 bytes read directly into memory — no parsing, no allocation. For high-throughput systems (network protocols, file formats, message queues), binary serialization is often 10-100x faster and produces significantly smaller payloads. Binary formats also appear in protocol implementations: network packets, database file formats, binary message queues, firmware communication. When you're talking to hardware, you're always dealing with binary. When you're writing a file format, binary gives you control over the exact byte layout and compatibility across architectures. The central challenge in binary serialization is knowing where each field ends, and length prefixes solve it. Unlike JSON, where a string ends at the closing `"`, a binary string needs its length encoded first. "Alice" becomes `[0, 0, 0, 5, 65, 108, 105, 99, 101]` — 4 bytes of length, then 5 bytes of UTF-8. The reader knows exactly how many bytes to consume.

The Intuition

Think of TCP/IP framing: each packet has a header that says how long the payload is. Length-prefixed records work the same way. The writer prepends the length; the reader reads the length first, then reads exactly that many bytes. Big-endian byte order (`to_be_bytes`, `from_be_bytes`) is the network standard. It means the most significant byte comes first — `256u32` is `[0, 0, 1, 0]`. Every platform decodes it the same way, unlike native endianness, which varies by CPU architecture. The `BinaryWriter` / `BinaryReader` pair is a classic I/O abstraction: one type appends bytes; the other tracks a cursor into a `&[u8]` slice. The cursor advances as you read — if you ever try to read past the end, you get `DecodeError::UnexpectedEof` rather than a panic or silently wrong data.

How It Works in Rust

// Writer — accumulates bytes into a Vec
/// Append-only byte buffer; each `write_*` call appends to the wrapped `Vec<u8>`.
pub struct BinaryWriter(Vec<u8>);

impl BinaryWriter {
 /// Appends `v` as 4 bytes, most significant byte first.
 pub fn write_u32_be(&mut self, v: u32) {
     self.0.extend_from_slice(&v.to_be_bytes()); // 4 bytes, big-endian
 }

 /// Appends a length-prefixed string: the byte length as a big-endian `u32`,
 /// followed by the UTF-8 bytes of `s`.
 ///
 /// Note: `as u32` truncates, so a string longer than `u32::MAX` bytes would
 /// get a wrong length prefix.
 pub fn write_string(&mut self, s: &str) {
     self.write_u32_be(s.len() as u32);      // length prefix
     self.0.extend_from_slice(s.as_bytes()); // then the content
 }

 /// Consumes the writer and returns the accumulated bytes.
 pub fn finish(self) -> Vec<u8> { self.0 }
}

// Reader — cursor into a byte slice
/// Borrowing decoder: `data` is the input, `pos` is the index of the next unread byte.
pub struct BinaryReader<'a> { data: &'a [u8], pos: usize }

impl<'a> BinaryReader<'a> {
 /// Returns the next `n` bytes and advances the cursor past them.
 ///
 /// Fails with `DecodeError::UnexpectedEof`, leaving the cursor untouched,
 /// when fewer than `n` bytes remain.
 fn consume(&mut self, n: usize) -> Result<&'a [u8], DecodeError> {
     // `checked_add` keeps a huge `n` (e.g. a hostile length prefix) from
     // overflowing `pos + n`, which would panic instead of returning an error.
     let end = self.pos.checked_add(n).ok_or(DecodeError::UnexpectedEof)?;
     let data: &'a [u8] = self.data;
     // `get` performs the bounds check — safe: no panic
     let slice = data.get(self.pos..end).ok_or(DecodeError::UnexpectedEof)?;
     self.pos = end;
     Ok(slice)
 }

 /// Reads 4 bytes as a big-endian `u32`.
 pub fn read_u32_be(&mut self) -> Result<u32, DecodeError> {
     let b = self.consume(4)?;
     // `consume(4)` returned exactly 4 bytes, so the conversion cannot fail.
     Ok(u32::from_be_bytes(b.try_into().unwrap()))  // slice → [u8; 4] → u32
 }

 /// Reads a `u32` length prefix, then that many bytes, and validates them as UTF-8.
 /// The returned `&str` borrows from the input slice.
 pub fn read_string(&mut self) -> Result<&'a str, DecodeError> {
     let len = self.read_u32_be()? as usize;        // read the length
     let bytes = self.consume(len)?;                 // read exactly that many bytes
     std::str::from_utf8(bytes).map_err(|_| DecodeError::InvalidUtf8)
 }
}

// Encode/decode a domain type
impl Person {
 /// Serialises the fields in a fixed order: name, age, active.
 pub fn encode(&self) -> Vec<u8> {
     let mut w = BinaryWriter::new();
     w.write_string(&self.name); // [4 bytes len][name bytes]
     w.write_u32_be(self.age);   // [4 bytes]
     w.write_bool(self.active);  // [1 byte: 0 or 1]
     w.finish()
 }

 /// Reads the fields back in the same order `encode` wrote them.
 /// Each `?` propagates the reader's `DecodeError` to the caller.
 pub fn decode(data: &[u8]) -> Result<Self, DecodeError> {
     let mut r = BinaryReader::new(data);
     let name   = r.read_string()?.to_string(); // copy the borrowed &str into an owned String
     let age    = r.read_u32_be()?;
     let active = r.read_bool()?;
     Ok(Person { name, age, active })
 }
}

// Multi-record stream: length-prefix each record for framing
// (`people` is not declared in this excerpt — presumably a collection of `Person`.)
let mut buf = Vec::new();
for person in &people {
 let encoded = person.encode();
 // The frame header is the encoded record's byte length as a big-endian u32;
 // `as u32` truncates if a record ever exceeds u32::MAX bytes.
 buf.extend_from_slice(&(encoded.len() as u32).to_be_bytes()); // frame length
 buf.extend_from_slice(&encoded);
}
Key points:

What This Unlocks

Key Differences

| Concept | OCaml | Rust |
|---|---|---|
| Binary I/O | `Bytes.create`, `Bytes.set_int32_be` | `Vec<u8>` + `u32::to_be_bytes()` |
| Reading | `Buffer.contents`, `Bytes.get_int32_be` | `BinaryReader` cursor struct |
| Endianness | `Bytes.set_int32_be` / `Bytes.get_int32_be` | `.to_be_bytes()` / `from_be_bytes()` |
| Truncated input | Exception | `Result<_, DecodeError>` (`Err(DecodeError::UnexpectedEof)`) |
| Length-prefixed string | Manual | `write_u32_be(len); write bytes; read_u32_be(); consume(len)` |
| Production library | `bin_prot`, `Marshal` | `bincode`, `prost` (protobuf), `rkyv` |
// 764. Binary Serialization: Length-Prefixed Records
// Length-prefixed binary format (length + value, no type tags), std-only

// ── Encoder ────────────────────────────────────────────────────────────────────

/// Append-only encoder that accumulates a binary payload in a `Vec<u8>`.
///
/// Multi-byte integers are written big-endian (most significant byte first).
#[derive(Debug, Default)]
pub struct BinaryWriter(Vec<u8>);

impl BinaryWriter {
    /// Creates a writer with an empty buffer.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends a single byte.
    pub fn write_u8(&mut self, v: u8) {
        self.0.push(v);
    }

    /// Appends `v` as 4 big-endian bytes.
    pub fn write_u32_be(&mut self, v: u32) {
        self.0.extend_from_slice(&v.to_be_bytes());
    }

    /// Appends `v` as 8 big-endian bytes.
    pub fn write_u64_be(&mut self, v: u64) {
        self.0.extend_from_slice(&v.to_be_bytes());
    }

    /// Appends `v` as one byte: `1` for `true`, `0` for `false`.
    pub fn write_bool(&mut self, v: bool) {
        self.write_u8(u8::from(v));
    }

    /// Length-prefixed string: u32 length then UTF-8 bytes
    ///
    /// # Panics
    ///
    /// Panics if `s` is longer than `u32::MAX` bytes. A plain `as u32` cast
    /// would silently truncate the prefix and produce a stream that cannot be
    /// decoded, so the overflow is reported loudly instead.
    pub fn write_string(&mut self, s: &str) {
        let len = u32::try_from(s.len())
            .expect("string length exceeds u32::MAX and cannot be length-prefixed");
        self.write_u32_be(len);
        self.0.extend_from_slice(s.as_bytes());
    }

    /// Consumes the writer and returns the accumulated bytes.
    pub fn finish(self) -> Vec<u8> {
        self.0
    }
}

// ── Decoder ────────────────────────────────────────────────────────────────────

/// Reasons a byte buffer could not be decoded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// The input ended before the requested number of bytes could be read.
    UnexpectedEof,
    /// A length-prefixed string did not contain valid UTF-8.
    InvalidUtf8,
}

impl std::fmt::Display for DecodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            DecodeError::UnexpectedEof => "unexpected end of input",
            DecodeError::InvalidUtf8 => "string is not valid UTF-8",
        })
    }
}

impl std::error::Error for DecodeError {}

/// Cursor over a borrowed byte slice.
///
/// `pos` is the index of the next unread byte and never exceeds `data.len()`.
/// Every read either succeeds and advances the cursor, or returns a
/// [`DecodeError`]; no read panics on malformed input.
#[derive(Debug, Clone)]
pub struct BinaryReader<'a> {
    data: &'a [u8],
    pos: usize,
}

impl<'a> BinaryReader<'a> {
    /// Creates a reader positioned at the start of `data`.
    pub fn new(data: &'a [u8]) -> Self {
        Self { data, pos: 0 }
    }

    /// Returns the next `n` bytes and advances the cursor past them.
    ///
    /// # Errors
    ///
    /// Returns [`DecodeError::UnexpectedEof`], leaving the cursor untouched,
    /// when fewer than `n` bytes remain.
    fn consume(&mut self, n: usize) -> Result<&'a [u8], DecodeError> {
        // `checked_add` keeps a huge `n` (e.g. a hostile length prefix) from
        // overflowing `pos + n`, which would panic instead of returning an error.
        let end = self.pos.checked_add(n).ok_or(DecodeError::UnexpectedEof)?;
        // Copy the `&'a [u8]` out so the returned slice borrows the input, not `self`.
        let data: &'a [u8] = self.data;
        let slice = data.get(self.pos..end).ok_or(DecodeError::UnexpectedEof)?;
        self.pos = end;
        Ok(slice)
    }

    /// Reads exactly `N` bytes into a fixed-size array.
    fn read_array<const N: usize>(&mut self) -> Result<[u8; N], DecodeError> {
        let mut out = [0u8; N];
        // `consume(N)` yields exactly `N` bytes, so the lengths always match.
        out.copy_from_slice(self.consume(N)?);
        Ok(out)
    }

    /// Reads one byte.
    pub fn read_u8(&mut self) -> Result<u8, DecodeError> {
        self.consume(1).map(|bytes| bytes[0])
    }

    /// Reads 4 bytes as a big-endian `u32`.
    pub fn read_u32_be(&mut self) -> Result<u32, DecodeError> {
        self.read_array::<4>().map(u32::from_be_bytes)
    }

    /// Reads 8 bytes as a big-endian `u64`.
    pub fn read_u64_be(&mut self) -> Result<u64, DecodeError> {
        self.read_array::<8>().map(u64::from_be_bytes)
    }

    /// Reads one byte as a boolean; any non-zero byte decodes as `true`.
    pub fn read_bool(&mut self) -> Result<bool, DecodeError> {
        Ok(self.read_u8()? != 0)
    }

    /// Reads a `u32` length prefix followed by that many bytes of UTF-8.
    ///
    /// The returned `&str` borrows from the input slice; nothing is copied.
    ///
    /// # Errors
    ///
    /// [`DecodeError::UnexpectedEof`] if the prefix or the body is truncated,
    /// [`DecodeError::InvalidUtf8`] if the body is not valid UTF-8.
    pub fn read_string(&mut self) -> Result<&'a str, DecodeError> {
        let len = self.read_u32_be()? as usize;
        let bytes = self.consume(len)?;
        std::str::from_utf8(bytes).map_err(|_| DecodeError::InvalidUtf8)
    }
}

// ── Domain type ────────────────────────────────────────────────────────────────

/// Example record that is serialised to and from the binary format.
#[derive(Debug, PartialEq)]
pub struct Person {
    /// Written as a length-prefixed UTF-8 string.
    pub name: String,
    /// Written as a big-endian `u32`.
    pub age: u32,
    /// Written as a single byte.
    pub active: bool,
}

impl Person {
    /// Serialises the record as `name`, `age`, `active`, in that order.
    pub fn encode(&self) -> Vec<u8> {
        let mut writer = BinaryWriter::new();
        writer.write_string(self.name.as_str());
        writer.write_u32_be(self.age);
        writer.write_bool(self.active);
        writer.finish()
    }

    /// Rebuilds a record from bytes produced by [`Person::encode`].
    ///
    /// Fields are read in the order they were written; the first read that
    /// fails aborts decoding and its `DecodeError` is returned. Bytes after
    /// the `active` flag are not inspected.
    pub fn decode(data: &[u8]) -> Result<Self, DecodeError> {
        let mut reader = BinaryReader::new(data);
        // Struct-literal fields are evaluated top to bottom, which matches the
        // wire order.
        Ok(Self {
            name: String::from(reader.read_string()?),
            age: reader.read_u32_be()?,
            active: reader.read_bool()?,
        })
    }
}

/// Renders `data` as space-separated, two-digit, upper-case hex bytes
/// (e.g. `[0, 255]` becomes `"00 FF"`); an empty slice yields an empty string.
fn hex_dump(data: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    // Two digits per byte plus one separator.
    let mut out = String::with_capacity(data.len() * 3);
    for (index, &byte) in data.iter().enumerate() {
        if index > 0 {
            out.push(' ');
        }
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0F)]));
    }
    out
}

/// Demo: round-trips a single record, then frames two records into one buffer
/// (each preceded by its byte length as a big-endian `u32`) and decodes them back.
fn main() {
    let alice = Person { name: String::from("Alice"), age: 30, active: true };
    let alice_bytes = alice.encode();
    println!("Encoded ({} bytes): {}", alice_bytes.len(), hex_dump(&alice_bytes));

    let decoded = Person::decode(&alice_bytes).expect("decode failed");
    println!("Decoded: {decoded:?}");

    // Multiple records in one buffer
    let records = [
        Person { name: String::from("Bob"), age: 25, active: false },
        Person { name: String::from("Carol"), age: 35, active: true },
    ];
    let mut stream: Vec<u8> = Vec::new();
    for record in records.iter() {
        let body = record.encode();
        let header = (body.len() as u32).to_be_bytes();
        stream.extend_from_slice(&header);
        stream.extend_from_slice(&body);
    }
    println!("\nMulti-record buffer ({} bytes): {}", stream.len(), hex_dump(&stream));

    // Decode multi-record: peel one `[len][body]` frame off the front at a time.
    let mut rest: &[u8] = &stream;
    while rest.len() >= 4 {
        let (header, tail) = rest.split_at(4);
        let frame_len =
            u32::from_be_bytes([header[0], header[1], header[2], header[3]]) as usize;
        let (frame, remainder) = tail.split_at(frame_len);
        let p = Person::decode(frame).unwrap();
        println!("  Record: {p:?}");
        rest = remainder;
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `original`, decodes the bytes, and checks the value is unchanged.
    fn assert_round_trips(original: Person) {
        let decoded = Person::decode(&original.encode()).unwrap();
        assert_eq!(original, decoded);
    }

    #[test]
    fn round_trip() {
        assert_round_trips(Person { name: String::from("Dave"), age: 40, active: true });
    }

    #[test]
    fn utf8_name() {
        // Non-ASCII name: the length prefix counts bytes, not characters.
        assert_round_trips(Person { name: String::from("Ümür"), age: 33, active: false });
    }

    #[test]
    fn eof_error() {
        // An empty buffer cannot even supply the name's length prefix.
        let outcome = Person::decode(&[]);
        assert!(matches!(outcome, Err(DecodeError::UnexpectedEof)));
    }

    #[test]
    fn length_prefix_correct() {
        let bytes = Person { name: String::from("Ed"), age: 1, active: false }.encode();
        let (prefix, rest) = bytes.split_at(4);
        // first 4 bytes = 2 (length of "Ed")
        assert_eq!(prefix, &[0, 0, 0, 2]);
        // next 2 bytes = "Ed"
        assert_eq!(&rest[..2], b"Ed");
    }
}
(* Binary format: length-prefixed records in OCaml *)

(* ── Encoder ────────────────────────────────────────────────────────────────── *)
(* Serialise the low 32 bits of [n] as four bytes, most significant first. *)
let encode_u32 n =
  (* Byte [i] holds bits [8 * (3 - i)] through [8 * (3 - i) + 7] of [n]. *)
  Bytes.init 4 (fun i -> Char.chr ((n lsr (8 * (3 - i))) land 0xFF))

(* Length-prefixed string: the byte length of [s] as a big-endian u32,
   followed by the bytes of [s]. *)
let encode_string s =
  Bytes.concat Bytes.empty [ encode_u32 (String.length s); Bytes.of_string s ]

(* Record, in wire order: length-prefixed string name + u32 age + one-byte bool
   active. No tag byte is written by [encode]. *)
type person = { name: string; age: int; active: bool }

(* Serialise a [person] as name, age, active, in that order. *)
let encode p =
  let active_byte = if p.active then 1 else 0 in
  let out = Buffer.create 64 in
  List.iter (Buffer.add_bytes out) [ encode_string p.name; encode_u32 p.age ];
  Buffer.add_uint8 out active_byte;
  Buffer.to_bytes out

(* ── Decoder ────────────────────────────────────────────────────────────────── *)
(* Read four bytes starting at [pos] as a big-endian integer.
   Returns the value paired with the position just past it.
   Raises [Invalid_argument] if any of the four indices is out of bounds. *)
let decode_u32 bytes pos =
  let value =
    List.fold_left
      (fun acc offset -> (acc lsl 8) lor Bytes.get_uint8 bytes (pos + offset))
      0 [ 0; 1; 2; 3 ]
  in
  (value, pos + 4)

(* Read a u32 length prefix at [pos], then that many bytes as a string.
   Returns the string paired with the position just past it. *)
let decode_string bytes pos =
  let len, start = decode_u32 bytes pos in
  let text = Bytes.sub_string bytes start len in
  (text, start + len)

(* Rebuild a [person] from bytes laid out as name, age, active.
   The active flag is true only when its byte equals 1. *)
let decode bytes =
  let name, after_name = decode_string bytes 0 in
  let age, after_age = decode_u32 bytes after_name in
  let active = Bytes.get_uint8 bytes after_age = 1 in
  { name; age; active }

(* Render [bytes] as space-separated, two-digit, upper-case hex values. *)
let hex_dump bytes =
  let hex_at i = Printf.sprintf "%02X" (Bytes.get_uint8 bytes i) in
  String.concat " " (List.init (Bytes.length bytes) hex_at)

(* Demo: encode one record, print its hex dump, decode it and print the fields. *)
let () =
  let wire = encode { name = "Alice"; age = 30; active = true } in
  Printf.printf "Encoded (%d bytes): %s\n" (Bytes.length wire) (hex_dump wire);
  let { name; age; active } = decode wire in
  Printf.printf "Decoded: name=%s age=%d active=%b\n" name age active