Efficient Compression

Summary

Segmentation masks are highly sparse. We benchmark MedMask vs NIfTI (.nii.gz) and NumPy (.npz) for binary and multi-label masks, and explain why MedMask’s PackBits + Zstandard pipeline excels.

1. Unified Benchmark Harness

import os, time, gzip, tempfile
import numpy as np, nibabel as nib, pandas as pd
import zstandard as zstd
from pathlib import Path
from medmask import SegmentationMask
from spacetransformer import Space

# Directory containing the test masks and the list of masks to consider.
mask_dir = Path('dicube-testdata/mask/s0000')

# Each non-empty line of nonzero_masks.txt becomes one entry (presumably a
# mask filename -- confirm against the data set). Blank lines are skipped so
# a trailing newline cannot inflate the count or produce an empty entry.
with open(mask_dir / 'nonzero_masks.txt', encoding='utf-8') as f:
    valid_files = [name for name in (line.strip() for line in f) if name]
print(f"Loaded {len(valid_files)} masks")

def run_benchmark(format_type, data, space=None, label_mapping=None, original_path=None):
    """Measure file size and encode/decode time for one mask storage format.

    Args:
        format_type: ``'nifti'``, ``'npz'`` or ``'medmask'``. The part before
            the first underscore is used as the temp-file extension. Any
            other value writes nothing, so ``size`` and ``decode_time``
            stay 0.
        data: Mask array to serialize.
        space: Forwarded to ``SegmentationMask`` (``'medmask'`` only).
        label_mapping: Forwarded to ``SegmentationMask`` (``'medmask'``
            only). When truthy, every name in it is also extracted with
            ``get_binary_mask_by_names`` inside the timed decode step.
        original_path: ``'nifti'`` only. When given, the size of that
            existing file is reported and nothing is written.

    Returns:
        dict with ``size`` in bytes and ``encode_time`` / ``decode_time`` in
        milliseconds. ``decode_time`` stays 0 for ``'nifti'`` (there is no
        decode branch for it) and whenever ``size`` is 0. With
        ``original_path`` set, ``encode_time`` only covers the ``stat`` call.
    """
    stats = {'size': 0, 'encode_time': 0, 'decode_time': 0}
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir) / f"tempfile.{format_type.split('_')[0]}"

        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # jump with system clock adjustments and is coarser on some platforms.
        start = time.perf_counter()
        if format_type == 'nifti':
            if original_path:
                stats['size'] = original_path.stat().st_size
            else:
                nifti_path = tmp_path.with_suffix('.nii.gz')
                affine = np.eye(4)
                nib.save(nib.Nifti1Image(data.astype(np.uint8), affine), nifti_path)
                stats['size'] = nifti_path.stat().st_size
        elif format_type == 'npz':
            np.savez_compressed(tmp_path, mask=data)
            stats['size'] = tmp_path.stat().st_size
        elif format_type == 'medmask':
            SegmentationMask(data, label_mapping, space=space).save(tmp_path)
            stats['size'] = tmp_path.stat().st_size
        stats['encode_time'] = (time.perf_counter() - start) * 1000

        if stats['size'] > 0:
            start = time.perf_counter()
            if format_type == 'npz':
                # Close the archive before the temp directory is removed;
                # an open handle can block the cleanup on some platforms.
                with np.load(tmp_path) as archive:
                    _ = archive['mask']
            elif format_type == 'medmask':
                loaded = SegmentationMask.load(tmp_path)
                if label_mapping:
                    for name in label_mapping:
                        _ = loaded.get_binary_mask_by_names(name)
            stats['decode_time'] = (time.perf_counter() - start) * 1000
    return stats

Loaded 21 masks

2. Binary Masks

# Single-organ masks, each benchmarked on its own as a binary mask.
binary_files = [
    'gluteus_maximus_right.nii.gz',
    'urinary_bladder.nii.gz',
    'colon.nii.gz',
    'iliopsoas_left.nii.gz',
    'iliac_artery_left.nii.gz',
    'small_bowel.nii.gz'
]

# One row per mask: nonzero voxel count, file sizes in KB, timings in ms.
rows = []
for fname in binary_files:
    original_path = mask_dir / fname
    nii = nib.load(original_path)
    data = nii.get_fdata().astype(np.uint8)
    space = Space.from_nifti(nii)
    organ = fname.replace('.nii.gz', '')

    # For NIfTI the size of the existing file is reported (original_path is
    # passed), so that format is not re-encoded here.
    stats_nifti = run_benchmark('nifti', data=data, original_path=original_path)
    stats_npz = run_benchmark('npz', data=data)
    stats_medmask = run_benchmark('medmask', data=data, space=space, label_mapping={organ: 1})

    rows.append({
        'Mask': organ,
        'Nonzero': int(np.count_nonzero(data)),
        'NIfTI (KB)': stats_nifti['size'] / 1024,
        'NPZ (KB)': stats_npz['size'] / 1024,
        'MedMask (KB)': stats_medmask['size'] / 1024,
        'NPZ encode (ms)': stats_npz['encode_time'],
        'MedMask encode (ms)': stats_medmask['encode_time'],
        'NPZ decode (ms)': stats_npz['decode_time'],
        'MedMask decode (ms)': stats_medmask['decode_time'],
    })

df_binary = pd.DataFrame(rows)
if not df_binary.empty:
    # Append a summary row holding the mean of every numeric column.
    avg = df_binary.select_dtypes(include=np.number).mean()
    avg['Mask'] = 'Average'
    df_binary = pd.concat([df_binary, avg.to_frame().T], ignore_index=True)

# The ratios are computed after the Average row is appended, so that row
# shows the ratio of the mean sizes, not the mean of the per-mask ratios.
df_binary['MedMask vs NIfTI'] = df_binary['NIfTI (KB)'] / df_binary['MedMask (KB)']
df_binary['NPZ vs NIfTI'] = df_binary['NIfTI (KB)'] / df_binary['NPZ (KB)']

display(df_binary.style.format({
    'NIfTI (KB)': '{:.1f}', 'NPZ (KB)': '{:.1f}', 'MedMask (KB)': '{:.1f}',
    'NPZ encode (ms)': '{:.1f}', 'MedMask encode (ms)': '{:.1f}',
    'NPZ decode (ms)': '{:.1f}', 'MedMask decode (ms)': '{:.1f}',
    'MedMask vs NIfTI': '{:.1f}×', 'NPZ vs NIfTI': '{:.1f}×',
    # The Average row holds a fractional mean; '.0f' rounds it to a whole
    # voxel count instead of printing every digit of the float.
    'Nonzero': '{:,.0f}'
}).hide(axis='index'))

Mask	Nonzero	NIfTI (KB)	NPZ (KB)	MedMask (KB)	NPZ encode (ms)	MedMask encode (ms)	NPZ decode (ms)	MedMask decode (ms)	MedMask vs NIfTI	NPZ vs NIfTI
gluteus_maximus_right	150,852	72.2	25.5	15.9	48.3	63.6	23.6	17.2	4.5×	2.8×
urinary_bladder	68,016	54.8	15.0	8.4	54.2	66.8	33.5	22.1	6.5×	3.6×
colon	59,305	55.8	16.4	10.2	41.8	64.6	24.5	9.0	5.4×	3.4×
iliopsoas_left	20,865	49.5	13.3	4.5	45.6	57.8	25.8	9.1	10.9×	3.7×
iliac_artery_left	1,155	44.2	10.5	1.5	40.2	66.7	25.0	9.2	30.3×	4.2×
small_bowel	244	43.4	9.9	1.1	44.4	57.3	25.0	7.0	39.4×	4.4×
Average	50,072.833333333336	53.3	15.1	6.9	45.8	62.8	26.2	12.3	7.7×	3.5×

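MedMask dramatically shrinks sparse masks: on average its files are 7.7× smaller than NIfTI, versus 3.5× for NPZ. Encoding is somewhat slower than NPZ (about 63 ms vs 46 ms), while decoding is roughly twice as fast (about 12 ms vs 26 ms).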

3. Multi-Label Masks

# Groups of single-organ masks that are merged into one multi-label volume.
test_groups = [
    {'name': 'Gluteus muscles', 'files': ['gluteus_maximus_left.nii.gz','gluteus_maximus_right.nii.gz','gluteus_medius_left.nii.gz','gluteus_medius_right.nii.gz']},
    {'name': 'Femur + Hip', 'files': ['femur_left.nii.gz','femur_right.nii.gz','hip_left.nii.gz','hip_right.nii.gz']},
    {'name': 'Pelvic organs', 'files': ['urinary_bladder.nii.gz','colon.nii.gz','small_bowel.nii.gz']}
]

results = []
for group in test_groups:
    # The first file of the group defines the volume shape and the Space.
    first_img = nib.load(mask_dir / group['files'][0])
    space = Space.from_nifti(first_img)
    multi = np.zeros(first_img.shape, dtype=np.uint8)
    mapping = {}

    # Label values follow the file's 1-based position in the group. Where
    # masks overlap, the later file overwrites the earlier label.
    for idx, fname in enumerate(group['files'], start=1):
        fpath = mask_dir / fname
        if not fpath.exists():
            continue
        data = nib.load(fpath).get_fdata().astype(np.uint8)
        multi[data > 0] = idx
        mapping[fname.replace('.nii.gz', '')] = idx

    # No original_path here: the merged volume has no file on disk, so the
    # NIfTI variant is written to a temp file to obtain its size.
    stats_nifti = run_benchmark('nifti', data=multi, space=space)
    stats_npz = run_benchmark('npz', data=multi)
    stats_medmask = run_benchmark('medmask', data=multi, space=space, label_mapping=mapping)

    results.append({
        'Group': group['name'],
        # Count only the labels actually merged; files skipped by the
        # exists() check above must not be reported as stored labels.
        'Labels': len(mapping),
        'NIfTI (KB)': stats_nifti['size'] / 1024,
        'NPZ (KB)': stats_npz['size'] / 1024,
        'MedMask (KB)': stats_medmask['size'] / 1024,
        'NPZ encode (ms)': stats_npz['encode_time'],
        'MedMask encode (ms)': stats_medmask['encode_time'],
        'NPZ decode (ms)': stats_npz['decode_time'],
        'MedMask decode (ms)': stats_medmask['decode_time'],
    })

df_multi = pd.DataFrame(results)
if not df_multi.empty:
    # Append a summary row holding the mean of every numeric column.
    avg = df_multi.select_dtypes(include=np.number).mean()
    avg['Group'] = 'Average'
    df_multi = pd.concat([df_multi, avg.to_frame().T], ignore_index=True)

# The ratios are computed after the Average row is appended, so that row
# shows the ratio of the mean sizes, not the mean of the per-group ratios.
df_multi['MedMask vs NIfTI'] = df_multi['NIfTI (KB)'] / df_multi['MedMask (KB)']
df_multi['NPZ vs NIfTI'] = df_multi['NIfTI (KB)'] / df_multi['NPZ (KB)']

display(df_multi.style.format({
    'NIfTI (KB)': '{:.1f}', 'NPZ (KB)': '{:.1f}', 'MedMask (KB)': '{:.1f}',
    'NPZ encode (ms)': '{:.1f}', 'MedMask encode (ms)': '{:.1f}',
    'NPZ decode (ms)': '{:.1f}', 'MedMask decode (ms)': '{:.1f}',
    'MedMask vs NIfTI': '{:.1f}×', 'NPZ vs NIfTI': '{:.1f}×'
}).hide(axis='index'))

Group	Labels	NIfTI (KB)	NPZ (KB)	MedMask (KB)	NPZ encode (ms)	MedMask encode (ms)	NPZ decode (ms)	MedMask decode (ms)	MedMask vs NIfTI	NPZ vs NIfTI
Gluteus muscles	4	108.9	34.3	31.6	50.1	4.2	23.0	15.7	3.4×	3.2×
Femur + Hip	4	103.0	36.5	31.4	48.5	4.7	31.1	7.4	3.3×	2.8×
Pelvic organs	3	66.6	27.0	16.7	47.7	5.1	24.1	10.6	4.0×	2.5×
Average	3.666667	92.8	32.6	26.6	48.8	4.7	26.1	11.2	3.5×	2.8×

MedMask stays smaller even when storing many labels together.

4. PackBits + Zstandard

Binary masks benefit from bit packing before Zstd compression.

# Size comparison for one binary mask: raw bytes, bit-packed bytes, and the
# compressed size of each candidate pipeline.
mask_path = mask_dir / 'urinary_bladder.nii.gz'
img = nib.load(mask_path)

# Boolean volume: True wherever the mask is nonzero.
data = img.get_fdata() > 0

# Serialize the boolean array as-is, and again after packing its values
# into bits with np.packbits.
raw = data.tobytes()
packbits = np.packbits(data).tobytes()
raw_size, packbits_size = len(raw), len(packbits)

# Compressed sizes: gzip and Zstd on the raw bytes, Zstd on the packed bytes.
gzip_size = len(gzip.compress(raw))
zstd_raw_blob = zstd.ZstdCompressor().compress(raw)
zstd_size = len(zstd_raw_blob)
zstd_packed_blob = zstd.ZstdCompressor().compress(packbits)
combo_size = len(zstd_packed_blob)

# Report every size in KB, in the same order as the measurements above.
for label, n_bytes in (
    ("Raw (KB):", raw_size),
    ("PackBits (KB):", packbits_size),
    ("Gzip (KB):", gzip_size),
    ("Zstd (KB):", zstd_size),
    ("PackBits + Zstd (KB):", combo_size),
):
    print(label, n_bytes/1024)

Raw (KB): 9867.375
PackBits (KB): 1233.421875
Gzip (KB): 14.5283203125
Zstd (KB): 8.013671875
PackBits + Zstd (KB): 4.390625

PackBits removes the structural waste of storing booleans in bytes (~8× reduction). Zstd then compresses the compact stream further.

Conclusion

Binary masks: PackBits preprocessing + Zstd delivers dramatic size reduction with comparable speed
Multi-label masks: Zstd alone still beats gzip/NPZ
MedMask adapts per mask type, yielding strong compression with fast encode/decode for clinical pipelines