This is a quick reference cheatsheet for NumPack. For detailed documentation, see the full API reference.
- Installation
- Basic Usage
- API Cheatsheet
- Decision Trees
- Common Patterns
- Performance Tips
- Troubleshooting
# From PyPI
pip install numpack
# From source
pip install "maturin>=1.0,<2.0"
git clone https://github.com/BirchKwok/NumPack.git
cd NumPack
maturin develop

Requirements:
- Python >= 3.9
- NumPy >= 1.26.0
import numpy as np
from numpack import NumPack
# Using context manager (recommended)
with NumPack("data.npk") as npk:
    npk.save({'array': np.random.rand(1000, 100)})

# Eager loading (full array in memory)
with NumPack("data.npk") as npk:
arr = npk.load('array')
# Lazy loading (memory-mapped, on-demand)
with NumPack("data.npk") as npk:
lazy_arr = npk.load('array', lazy=True)
    subset = lazy_arr[100:200]

with NumPack("data.npk") as npk:
# Replace rows (397x faster than NPY)
npk.replace({'array': new_data}, [0, 1, 2])
# Append rows (405x faster than NPY)
npk.append({'array': more_data})
# Drop rows
    npk.drop('array', [0, 1, 2])

# Basic
npk = NumPack("file.npk")
# With options
npk = NumPack(
"file.npk",
drop_if_exists=True, # Delete if exists
strict_context_mode=True, # Require 'with' statement
force_gc_on_close=False # GC on close (default: False)
)

# Context manager (recommended)
with NumPack("file.npk") as npk:
# Operations...
pass
# Manual management
npk = NumPack("file.npk")
npk.open()
# Operations...
npk.close()

# Save single array
npk.save({'array': data})
# Save multiple arrays
npk.save({
'features': features,
'labels': labels
})

# Eager load
arr = npk.load('array')
# Lazy load
lazy = npk.load('array', lazy=True)
# Dictionary-style
arr = npk['array']

# Replace rows
npk.replace({'array': new_data}, [0, 1, 2])
npk.replace({'array': new_data}, slice(0, 10))
# Append rows
npk.append({'array': new_data})
# Drop entire array
npk.drop('array')
# Drop specific rows
npk.drop('array', [0, 1, 2])

# Get specific rows
rows = npk.getitem('array', [0, 10, 20])
# Single row
row = npk.getitem('array', 0)

# Stream in batches
for batch in npk.stream_load('array', buffer_size=1000):
    process(batch)

from numpack.io import (
    # Memory ↔ .npk
    from_torch, to_torch,                  # PyTorch
    from_arrow, to_arrow,                  # PyArrow
    from_parquet_table, to_parquet_table,  # Parquet
    from_safetensors, to_safetensors,      # SafeTensors
    # File ↔ .npk (streaming)
    from_torch_file, to_torch_file,
    from_feather_file, to_feather_file,
    from_parquet_file, to_parquet_file,
    from_safetensors_file, to_safetensors_file,
)
# Memory → .npk
from_torch(tensor, 'output.npk', array_name='embeddings')
from_arrow(table, 'output.npk', array_name='data')
# .npk → Memory
tensor = to_torch('input.npk', array_name='embeddings')
table = to_arrow('input.npk', array_name='data')
# File → .npk (streaming)
from_torch_file('model.pt', 'output.npk')
from_parquet_file('data.parquet', 'output.npk')
# .npk → File (streaming)
to_torch_file('input.npk', 'output.pt')
to_parquet_file('input.npk', 'output.parquet')

# Get shape
shape = npk.get_shape('array')
# List all arrays
arrays = npk.get_member_list()
# Check existence
exists = npk.has_array('array')
# Get modification time
timestamp = npk.get_modify_time('array')
# Clone an array
npk.clone('source_array', 'target_array')
# Reset (clear all)
npk.reset()
# Compact after deletions
npk.update('array')
# Get I/O statistics
stats = npk.get_io_stats()

# Batch mode (25-37x speedup)
with npk.batch_mode():
for i in range(100):
arr = npk.load('array')
arr *= 2
npk.save({'array': arr})
# Writable batch mode (174x speedup)
with npk.writable_batch_mode() as wb:
arr = wb.load('array')
for i in range(100):
        arr *= 2  # Direct modification

lazy = npk.load('array', lazy=True)
# Indexing
row = lazy[0]
rows = lazy[100:200]
selection = lazy[[0, 10, 20]]
# Shape info
shape = lazy.shape
dtype = lazy.dtype
size = lazy.size
# Convert to numpy
numpy_arr = np.asarray(lazy)

Need to modify data frequently?
├─ Yes
│  ├─ Array > 100MB?
│  │  ├─ Yes → Use writable_batch_mode (174x speedup)
│  │  └─ No → Use batch_mode (25-37x speedup)
│  └─ No (single modification)
│     └─ Use normal replace/append (397-405x faster)
│
└─ No (read-only)
   ├─ Need full array?
   │  ├─ Yes → Use eager load (1.3x faster)
   │  └─ No → Use lazy load (54x faster init)
   └─ Sequential access?
      └─ Use stream_load (memory-efficient)
Need to change array shape?
├─ Yes → Use batch_mode
└─ No
   ├─ Array size > available RAM * 0.5?
   │  ├─ Yes → Use writable_batch_mode
   │  └─ No → Continue
   └─ Array size > 100MB?
      ├─ Yes → Use writable_batch_mode
      └─ No → Use batch_mode
What do you want to do?
├─ Create new array or replace entire array
│  └─ Use save()
├─ Modify specific rows
│  └─ Use replace() (397x faster)
├─ Add new rows
│  └─ Use append() (405x faster)
└─ Remove rows
   └─ Use drop()
with NumPack("data.npk") as npk:
# Create
npk.save({'data': np.random.rand(100, 50)})
# Read
data = npk.load('data')
# Update
npk.replace({'data': new_data}, [0, 1, 2])
# Delete
    npk.drop('data', [0, 1, 2])

with NumPack("data.npk") as npk:
with npk.writable_batch_mode() as wb:
arr = wb.load('data') # Load once
# Multiple modifications (174x faster)
arr *= 2.0
arr += 1.0
        arr[arr < 0] = 0

with NumPack("large.npk") as npk:
total = 0
# Process in batches
for batch in npk.stream_load('data', buffer_size=10000):
total += batch.sum()
    mean = total / npk.get_shape('data')[0]

with NumPack("data.npk") as npk:
lazy = npk.load('features', lazy=True)
# Quick stats on subset
sample = lazy[:1000]
print(f"Mean: {sample.mean()}")
    print(f"Std: {sample.std()}")

with NumPack("input.npk") as src:
with NumPack("output.npk", drop_if_exists=True) as dst:
first = True
for batch in src.stream_load('data', buffer_size=10000):
processed = transform(batch)
if first:
dst.save({'data': processed})
first = False
else:
                dst.append({'data': processed})

with NumPack("train.npk") as npk:
features = npk.load('features', lazy=True)
labels = npk.load('labels', lazy=True)
for epoch in range(10):
for i in range(0, len(features), batch_size):
X = features[i:i+batch_size]
y = labels[i:i+batch_size]
            train_step(X, y)

with NumPack("data.npk") as npk:
with npk.writable_batch_mode() as wb:
labels = wb.load('labels')
samples = npk.load('samples', lazy=True)
for i, sample in enumerate(samples):
display(sample)
labels[i] = get_user_input()
# Auto-saved


def save_checkpoint(npk, epoch, weights, optimizer):
    """Write a training checkpoint into an open NumPack store.

    Every entry of ``weights`` is saved under the key ``weight_<name>``,
    every entry of ``optimizer`` under ``optimizer_<name>``, and ``epoch``
    is stored as a one-element array under the key ``epoch``. All saves are
    issued inside a single ``npk.batch_mode()`` context.

    Args:
        npk: An opened NumPack instance.
        epoch: Epoch counter to record.
        weights: Mapping of weight name to array.
        optimizer: Mapping of optimizer-state name to array.
    """
    with npk.batch_mode():
        for name, w in weights.items():
            npk.save({f'weight_{name}': w})
        for name, s in optimizer.items():
            npk.save({f'optimizer_{name}': s})
        # Wrapped in a 1-element array so the scalar can be stored like any
        # other array and read back with ``npk.load('epoch')[0]``.
        npk.save({'epoch': np.array([epoch])})
def load_checkpoint(npk):
    """Read back a checkpoint written with the ``weight_``/``optimizer_`` key scheme.

    Iterates over ``npk.get_member_list()``; arrays whose name starts with
    ``weight_`` or ``optimizer_`` are loaded and stored under the name with
    that prefix removed. Other arrays are ignored. The epoch is the first
    element of the array stored under ``epoch``.

    Args:
        npk: An opened NumPack instance.

    Returns:
        A tuple ``(epoch, weights, optimizer)`` where ``weights`` and
        ``optimizer`` are dicts keyed by the un-prefixed names.
    """
    weights = {}
    optimizer = {}
    # Prefixes are checked in this order; the first match wins.
    targets = (('weight_', weights), ('optimizer_', optimizer))
    for name in npk.get_member_list():
        for prefix, bucket in targets:
            if name.startswith(prefix):
                # Strip by prefix length instead of hard-coded offsets.
                bucket[name[len(prefix):]] = npk.load(name)
                break
    epoch = npk.load('epoch')[0]
    return epoch, weights, optimizer


# Use context manager
with NumPack("data.npk") as npk:
# Operations
# Load once, use multiple times
with NumPack("data.npk") as npk:
arr = npk.load('data')
for i in range(100):
process(arr)
# Use batch mode for frequent modifications
with npk.writable_batch_mode() as wb:
arr = wb.load('data')
for i in range(100):
arr *= 1.1
# Use lazy load for partial access
lazy = npk.load('data', lazy=True)
subset = lazy[1000:2000]
# Use streaming for large data
for batch in npk.stream_load('data', buffer_size=10000):
process(batch)
# Use appropriate dtype
data = np.random.rand(100, 50).astype(np.float32)

# Don't create instance in loop
for i in range(100):
with NumPack("data.npk") as npk:
data = npk.load('data')
# Don't load repeatedly
with NumPack("data.npk") as npk:
for i in range(100):
arr = npk.load('data') # Reloads every time
value = arr[i]
# Don't use normal mode for frequent modifications
for i in range(100):
arr = npk.load('data')
arr *= 2
npk.save({'data': arr}) # Slow
# Don't eager load for partial access
arr = npk.load('large_data') # Loads 10GB
subset = arr[:100] # Only need 1MB
# Don't use float64 unnecessarily
data = np.random.rand(100, 50)  # float64, 2x size

| Operation | NumPack | NPY | Advantage |
|---|---|---|---|
| Replace 100 rows | 0.047ms | 18.51ms | 397x faster |
| Append 100 rows | 0.067ms | 27.09ms | 405x faster |
| Lazy load init | 0.002ms | 0.107ms | 54x faster |
| Full load | 8.27ms | 10.51ms | 1.3x faster |
| Save (initial) | 16.15ms | 7.19ms | 2.2x slower |

| Mode | Time (100 ops) | Speedup |
|---|---|---|
| Normal | 856ms | 1x |
| Batch mode | 34ms | 25x |
| Writable batch | 4.9ms | 174x |
Cause: Trying to use NumPack before opening file
Solution:
# Use context manager
with NumPack("data.npk") as npk:
npk.save({'data': arr})
# Or manually open
npk = NumPack("data.npk")
npk.open()
npk.save({'data': arr})
npk.close()

Cause: Using strict mode without context manager
Solution:
npk = NumPack("data.npk", strict_context_mode=True)
with npk:
    npk.save({'data': arr})

Cause: Windows file handle management
Solution:
# Best: Use context manager
with NumPack("data.npk") as npk:
# Operations
# Or suppress warning
npk = NumPack("data.npk", warn_no_context=False)

Cause: Loading large array eagerly
Solutions:
# 1. Use lazy loading
lazy = npk.load('large', lazy=True)
subset = lazy[1000:2000]
# 2. Use streaming
for batch in npk.stream_load('large', buffer_size=1000):
process(batch)
# 3. Use writable batch mode (zero memory)
with npk.writable_batch_mode() as wb:
arr = wb.load('large') # mmap, not in RAM
    arr *= 2

Cause: Not using batch mode
Solution:
# Instead of normal mode
for i in range(100):
arr = npk.load('data')
arr *= 2
npk.save({'data': arr})
# Use writable batch mode (174x faster)
with npk.writable_batch_mode() as wb:
arr = wb.load('data')
for i in range(100):
        arr *= 2

Cause: Logical deletion, space not reclaimed
Solution:
# After dropping rows
npk.drop('array', [0, 1, 2])
# Physically compact
npk.update('array')

Cause: Array doesn't exist in file
Solution:
# Check before loading
if npk.has_array('array'):
arr = npk.load('array')
else:
print("Array not found")
# Or list all arrays
arrays = npk.get_member_list()
print(f"Available arrays: {arrays}")

Cause: Column count doesn't match
Solution:
# Check shape first
shape = npk.get_shape('array')
rows, cols = shape
# Ensure replacement has same column count
new_data = np.random.rand(10, cols)
npk.replace({'array': new_data}, list(range(10)))

# Always use context manager
with NumPack("data.npk") as npk:
# Operations
# Force cleanup if needed
from numpack import force_cleanup_windows_handles
force_cleanup_windows_handles()

# Context manager recommended but not required
npk = NumPack("data.npk")
npk.open()
# Operations
npk.close()

from numpack import get_backend_info
info = get_backend_info()
print(info)
# {
# 'backend_type': 'rust',
# 'platform': 'Darwin',
# 'is_windows': False,
# 'version': '0.4.1'
# }

# Supported NumPy dtypes
types = {
'bool': np.bool_,
'int8': np.int8,
'int16': np.int16,
'int32': np.int32,
'int64': np.int64,
'uint8': np.uint8,
'uint16': np.uint16,
'uint32': np.uint32,
'uint64': np.uint64,
'float16': np.float16,
'float32': np.float32,
'float64': np.float64,
'complex64': np.complex64,
'complex128': np.complex128,
}
# Recommended for most cases
recommended = np.float32  # Good balance of precision and size


def estimate_memory(rows, cols, dtype):
    """Estimate the in-memory size of a ``rows`` x ``cols`` array in MB.

    The per-element size is taken from ``np.dtype(dtype).itemsize``, so any
    dtype NumPy understands (type objects such as ``np.float16``, dtype
    instances, or names such as ``'complex128'``) is accepted rather than
    only a fixed handful.

    Args:
        rows: Number of rows.
        cols: Number of columns.
        dtype: Anything accepted by ``np.dtype``.

    Returns:
        Size in megabytes, computed as bytes / 1024 / 1024.
    """
    bytes_total = rows * cols * np.dtype(dtype).itemsize
    mb = bytes_total / 1024 / 1024
    return mb
# Example
rows, cols = 1000000, 100
print(f"float32: {estimate_memory(rows, cols, np.float32):.1f} MB")  # 381.5 MB
print(f"float64: {estimate_memory(rows, cols, np.float64):.1f} MB")  # 762.9 MB

For complete documentation, see the Documentation Index.