# Compare a few header fields between two slices of the same directory to
# show which values repeat from file to file and which vary per slice.
import os
from pathlib import Path

import pydicom

dicom_dir = "dicube-testdata/dicom/sample_200"

# Directory listings come back in filesystem-dependent order and may contain
# sub-directories. Keep regular files only and sort them so the same two
# slices are compared on every run.
dicom_files = sorted(p for p in Path(dicom_dir).glob("*") if p.is_file())[:2]
if len(dicom_files) < 2:
    raise FileNotFoundError(
        f"Need at least two DICOM files in {dicom_dir!r}, found {len(dicom_files)}"
    )

# Only header elements are compared, so pixel data is not read.
ds1 = pydicom.dcmread(dicom_files[0], stop_before_pixels=True)
ds2 = pydicom.dcmread(dicom_files[1], stop_before_pixels=True)

print("PatientName same:", ds1.PatientName == ds2.PatientName)
print("SeriesInstanceUID same:", ds1.SeriesInstanceUID == ds2.SeriesInstanceUID)
print("InstanceNumber same:", ds1.InstanceNumber == ds2.InstanceNumber)

# Metadata Storage Mechanism
1. Legacy DICOM Pattern: Redundant Headers
DICOM’s one‑file‑per‑slice model leads to heavy repetition of series‑level metadata across hundreds of files. Only a few fields vary per slice (e.g., InstanceNumber, ImagePositionPatient). This bloats storage and hurts I/O: reading a single field for an entire series means opening and parsing every file.
2. Embracing DICOM JSON
DICOM PS3.18 defines a JSON model for headers—human‑readable and tool‑friendly. DiCube adopts it internally to maximize interoperability and future‑proofing.
# Serialise one slice header to the DICOM JSON model and print a few tags.
import json

ds = pydicom.dcmread(dicom_files[0], stop_before_pixels=True)
dicom_json_str = ds.to_json()
data = json.loads(dicom_json_str)

# The JSON object is keyed by 8-hex-digit tag strings (group + element).
for tag in ["00100010", "00080021", "00200013"]:  # PatientName, SeriesDate, InstanceNumber
    if tag in data:
        vr = data[tag]["vr"]
        # "Value" may be missing; `or` also covers an empty list, which
        # would otherwise raise IndexError on the [0] lookup.
        value = (data[tag].get("Value") or ["N/A"])[0]
        # Person names (VR "PN") are encoded as an object of component
        # groups; print the alphabetic representation instead of the dict.
        if vr == "PN" and isinstance(value, dict):
            value = value.get("Alphabetic", "N/A")
        print(f"Tag {tag} (VR: {vr}): {value}")

# 4. Extreme Compression: JSON + Zstandard
Structured, repetitive text (JSON) compresses extremely well with Zstandard (zstd).
# Estimate how many header bytes the DICOM series stores in total and compare
# that with the zstd-compressed metadata JSON.
from pathlib import Path

import numpy as np
import zstandard as zstd

# Regular files only, in a stable order.
all_files = sorted(p for p in Path(dicom_dir).glob("*") if p.is_file())

dicom_header_total_size = 0
for f in all_files:
    total_size = os.path.getsize(f)
    ds = pydicom.dcmread(f)
    # Subtract the stored length of the PixelData element. `ds.pixel_array`
    # is the decoded image, so for compressed transfer syntaxes its `nbytes`
    # is larger than what is on disk and the estimate would be too small
    # (even negative); probing it with `hasattr` also forces a full decode.
    pixel_size = len(ds.PixelData) if "PixelData" in ds else 0
    dicom_header_total_size += total_size - pixel_size

# NOTE(review): `meta` is not defined in the visible part of this file;
# presumably it is the DiCube metadata object built earlier - confirm.
meta_json_str = meta.to_json()
compressed = zstd.ZstdCompressor(level=9).compress(meta_json_str.encode("utf-8"))

print("DICOM header total (est.):", dicom_header_total_size / 1024, "KB")
print("DiCube meta (zstd):", len(compressed) / 1024, "KB")

# 6. Performance Leap
Vectorized metadata access vs file‑by‑file parsing delivers order‑of‑magnitude wins.
# Time reading InstanceNumber from every DICOM file against one metadata
# query on the DiCube side.
import time

# perf_counter is the high-resolution monotonic clock meant for measuring
# short intervals; time.time() can be coarse and is affected by clock changes.
start = time.perf_counter()
dicom_instance_numbers = []
for f in all_files:
    ds = pydicom.dcmread(f, stop_before_pixels=True)
    dicom_instance_numbers.append(int(ds.InstanceNumber))
dicom_ms = (time.perf_counter() - start) * 1000

# NOTE(review): `meta` and `CommonTags` are not defined in the visible part
# of this file; presumably they come from the DiCube package - confirm.
start = time.perf_counter()
dicube_instance_numbers = meta.get_values(CommonTags.InstanceNumber)
dicube_ms = (time.perf_counter() - start) * 1000

# Guard the ratio: a lookup faster than the clock resolution measures 0 ms
# and would otherwise raise ZeroDivisionError.
speedup = dicom_ms / dicube_ms if dicube_ms > 0 else float("inf")
print(f"DICOM: {dicom_ms:.2f} ms, DiCube: {dicube_ms:.2f} ms, Speedup: {speedup:.1f}×")

# Remove the demo's temporary file (presumably written earlier in the file;
# its creation is not visible here).
os.remove("temp_demo.dcbs")

# 7. Summary
- Remove redundancy by splitting metadata into shared and per‑slice fields; JSON + zstd yields tiny headers
- Millisecond‑level access for common queries; ideal for listing, previews, and AI
- Standards‑aligned (DICOM JSON), future‑proof and interoperable