update speed benchmark script to test npy and npz

fangq · fangq · commit 9c7465396130 · 2022-08-31T17:41:46.000-04:00
diff --git a/README.md b/README.md
@@ -108,7 +108,7 @@ to HDF5 filters. Currently supported codecs include `zlib`, `gzip`, `lz4`, `lzma
 `blosc2zstd`. To apply a selected compression method, one simply set `{'compression':'method'}` as
 the option to `jdata.encode` or `jdata.save` function; `jdata.load` or `jdata.decode` automatically
 decompress the data based on the `_ArrayZipType_` annotation present in the data. Only `blosc2`
-compression methods support multi-threading. To set the thread number, one should define a `nthread`
+compression methods support multi-threading. To set the thread number, one should define an `nthread`
 value in the option (`opt`) for both encoding and decoding.
 
 
diff --git a/test/benchcodecs.py b/test/benchcodecs.py
@@ -1,30 +1,58 @@
+"""
+    Speed benchmark for saving/loading numpy arrays in NumPy's native .npy/.npz formats and with jdata's compression codecs
+"""
 import jdata as jd
 import numpy as np
 import time
 import os
 
 print("jdata version:" + jd.__version__)
 
-codecs = ["zlib", "lzma", "lz4", "blosc2blosclz", "blosc2lz4", "blosc2lz4hc", "blosc2zlib", "blosc2zstd"]
+codecs = ["npy", "npz", "zlib", "lzma", "lz4", "blosc2blosclz", "blosc2lz4", "blosc2lz4hc", "blosc2zlib", "blosc2zstd"]
+nthread = 8
+
 
 def benchmark(codec, x):
     t0 = time.time()
-    jd.save(x, "matrix_" + codec + suffix, {"compression": codec, "nthread": 8})
+    ext = suffix
+    if codec == "npy":
+        ext = "." + codec
+        np.save("matrix_" + codec + ext, x)
+    elif codec == "npz":
+        ext = "." + codec
+        np.savez_compressed("matrix_" + codec + ext, x)
+    else:
+        jd.save(x, "matrix_" + codec + ext, {"compression": codec, "nthread": nthread})
     dt = time.time() - t0  # saving time
     res = {"codec": codec, "save": dt}
-    y = jd.load("matrix_" + codec + suffix, {"nthread": 8})  # loading
-    res["load"] = time.time() - t0 - dt  # loading time
-    res["size"] = os.path.getsize("matrix_" + codec + suffix)
+    if codec == "npy":
+        y = np.load("matrix_" + codec + ext)
+    elif codec == "npz":
+        y = np.load("matrix_" + codec + ext)["arr_0"]
+    else:
+        y = jd.load("matrix_" + codec + ext, {"nthread": nthread})  # loading
     res["sum"] = y.sum()
+    res["load"] = time.time() - t0 - dt  # loading time
+    res["size"] = os.path.getsize("matrix_" + codec + ext)
     print(res)
     return res
 
 
+## a highly compressible matrix
 x = np.eye(10000)
-suffix = '.jdb'
+
+## a less compressible random matrix
+# np.random.seed(0)
+# x = np.random.rand(2000,2000)
+
+print("\n- Testing binary JSON (BJData) files (.jdb) ...")
+
+suffix = ".jdb"
 res = list(map(benchmark, codecs, [x] * len(codecs)))
 # print(np.array(res))
 
-suffix = '.jdt'
+print("\n- Testing text-based JSON files (.jdt) ...")
+
+suffix = ".jdt"
 res = list(map(benchmark, codecs, [x] * len(codecs)))
 # print(np.array(res))