diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml new file mode 100644 index 0000000..65127d1 --- /dev/null +++ b/.github/workflows/makefile.yml @@ -0,0 +1,28 @@ +name: Makefile CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@main + + - name : Set up build environment + run: sudo apt install -y libssl-dev libxxhash-dev + working-directory: build + + - name: Check base build + run: make all + working-directory: build + + - name: Check SIMD build + run: make simd_all + working-directory: build + diff --git a/.gitignore b/.gitignore index 200344d..da1d92a 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,8 @@ .vscode/* # MacOS files -**/.DS_Store \ No newline at end of file +**/.DS_Store + +results.txt +results_*.txt +results_graph.png \ No newline at end of file diff --git a/build/Makefile b/build/Makefile index 7659ec5..7dd71fc 100644 --- a/build/Makefile +++ b/build/Makefile @@ -1,4 +1,5 @@ -export ${EXTRA_COMPILER_FLAGS} +# Note: The default build target 'all' does not use SIMD acceleration + DEDUP_BUILD_PATH = ../dedup/build MEASURE_DEDUP_BUILD_PATH = ../measure-dedup/build MEASURE_LOW_ENTROPY_BUILD_PATH = ../measure-low-entropy/build @@ -25,8 +26,41 @@ all: cp $(SUPPORTING_TOOLS_PATH)/archive_extract.sh . cp $(SUPPORTING_TOOLS_PATH)/archive_ctl_path.cfg . +# To enable acceleration, pass the appropriate flags as EXTRA_COMPILER_FLAGS. +# For SSE-128, pass '-msse -msse2 -msse3 -msse4.1' as EXTRA_COMPILER_FLAGS +# For AVX-256, pass '-mavx -mavx2' as EXTRA_COMPILER_FLAGS +# For BMI2, pass '-mbmi -mbmi2' as EXTRA_COMPILER_FLAGS (used only in VSEQ) +# For AVX-512, pass '-mavx512f -mavx512vl -mavx512bw' as EXTRA_COMPILER_FLAGS + + +.PHONY: sse128 +sse128: + $(MAKE) EXTRA_COMPILER_FLAGS="-msse -msse2 -msse3 -msse4.1" all + +.PHONY: avx256 +avx256: + $(MAKE) EXTRA_COMPILER_FLAGS="-mavx -mavx2" all + +.PHONY: avx512 +avx512: + $(MAKE) EXTRA_COMPILER_FLAGS="-mavx512f -mavx512vl -mavx512bw" all + +.PHONY: arm_neon128 +arm_neon128: + $(MAKE) all +.PHONY: ibm_altivec128 +ibm_altivec128: + $(MAKE) EXTRA_COMPILER_FLAGS="-maltivec=le -mpowerpc64le" all +.PHONY: simd_all +simd_all: + $(MAKE) EXTRA_COMPILER_FLAGS="-msse -msse2 -msse3 -msse4.1 -mavx -mavx2 -mbmi -mbmi2" all + +.PHONY: simd512_all +simd512_all: + $(MAKE) EXTRA_COMPILER_FLAGS="-msse -msse2 -msse3 -msse4.1 -mavx -mavx2 -mbmi -mbmi2 -mavx512f -mavx512vl -mavx512bw" all + .PHONY: clean clean: cd $(DEDUP_BUILD_PATH) && make $@ diff --git a/build/config.txt b/build/config.txt index 6053a09..e34d58d 100644 --- a/build/config.txt +++ b/build/config.txt @@ -35,6 +35,10 @@ ae_avg_block_size=8448 ram_max_block_size=32768 ram_avg_block_size=8448 +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 + # TTTD Parameters tttd_min_block_size=4096 tttd_avg_block_size=8192 diff --git a/build/config_simd512_8kb/aemax_8kb.conf b/build/config_simd512_8kb/aemax_8kb.conf new file mode 100644 index 0000000..b62c77a --- /dev/null +++ b/build/config_simd512_8kb/aemax_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/aemax_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd512_8kb/aemin_8kb.conf b/build/config_simd512_8kb/aemin_8kb.conf new file mode 100644 index 0000000..10862ed --- /dev/null +++ b/build/config_simd512_8kb/aemin_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/aemin_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd512_8kb/crc32_8kb.conf b/build/config_simd512_8kb/crc32_8kb.conf new file mode 100644 index 0000000..fcbbec3 --- /dev/null +++ b/build/config_simd512_8kb/crc32_8kb.conf @@ -0,0 +1,15 @@ +# General Parameters +chunking_algo = crc +hashing_algo = xxhash128 +output_file = ./hashes_simd512_8kb/crc32_8kb.out +buffer_size = 32768 +simd_mode=none + +# 8k Avg Chunk Size +crc_hash_bits=13 +crc_window_size=256 +crc_window_step_size=1 + +crc_min_block_size=1024 +crc_avg_block_size=8192 +crc_max_block_size=32768 diff --git a/build/config_simd512_8kb/fastcdc_8kb.conf b/build/config_simd512_8kb/fastcdc_8kb.conf new file mode 100644 index 0000000..9edce20 --- /dev/null +++ b/build/config_simd512_8kb/fastcdc_8kb.conf @@ -0,0 +1,14 @@ +# General Parameters +chunking_algo=fastcdc +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/fastcdc_8kb.out +simd_mode=none + +buffer_size=32768 + +# FastCDC Parameters +fastcdc_min_block_size=2048 +fastcdc_avg_block_size=8192 +fastcdc_max_block_size=32768 +fastcdc_normalization_level=2 +fastcdc_disable_normalization=false \ No newline at end of file diff --git a/build/config_simd512_8kb/gear_8kb.conf b/build/config_simd512_8kb/gear_8kb.conf new file mode 100644 index 0000000..c1e4f0c --- /dev/null +++ b/build/config_simd512_8kb/gear_8kb.conf @@ -0,0 +1,12 @@ +# General Parameters +chunking_algo=gear +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/gear_8kb.out +simd_mode=none + +buffer_size=32768 + +# Gear Chunking parameters +gear_min_block_size=2048 +gear_avg_block_size=8192 +gear_max_block_size=32768 diff --git a/build/config_simd512_8kb/maxp_8kb.conf b/build/config_simd512_8kb/maxp_8kb.conf new file mode 100644 index 0000000..b651540 --- /dev/null +++ b/build/config_simd512_8kb/maxp_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/maxp_8kb.out +simd_mode=none + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd512_8kb/rabins_8kb.conf b/build/config_simd512_8kb/rabins_8kb.conf new file mode 100644 index 0000000..bb9f2b4 --- /dev/null +++ b/build/config_simd512_8kb/rabins_8kb.conf @@ -0,0 +1,13 @@ +# General Parameters +chunking_algo=rabins +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/rabins_8kb.out +simd_mode=none + +buffer_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/config_simd512_8kb/ram_8kb.conf b/build/config_simd512_8kb/ram_8kb.conf new file mode 100644 index 0000000..26b2aa5 --- /dev/null +++ b/build/config_simd512_8kb/ram_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/ram_8kb.out +simd_mode=none + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd512_8kb/tttd_8kb.conf b/build/config_simd512_8kb/tttd_8kb.conf new file mode 100644 index 0000000..65c22ac --- /dev/null +++ b/build/config_simd512_8kb/tttd_8kb.conf @@ -0,0 +1,18 @@ +# General Parameters +chunking_algo=tttd +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/tttd_8kb.out +simd_mode=none + +buffer_size=32768 + +# TTTD Parameters +tttd_min_block_size=2048 +tttd_avg_block_size=8192 +tttd_max_block_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/config_simd512_8kb/vaemax512_8kb.conf b/build/config_simd512_8kb/vaemax512_8kb.conf new file mode 100644 index 0000000..956fef8 --- /dev/null +++ b/build/config_simd512_8kb/vaemax512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/vaemax512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd512_8kb/vaemin512_8kb.conf b/build/config_simd512_8kb/vaemin512_8kb.conf new file mode 100644 index 0000000..49254e2 --- /dev/null +++ b/build/config_simd512_8kb/vaemin512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/vaemin512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd512_8kb/vmaxp512_8kb.conf b/build/config_simd512_8kb/vmaxp512_8kb.conf new file mode 100644 index 0000000..cc27192 --- /dev/null +++ b/build/config_simd512_8kb/vmaxp512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/vmaxp512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd512_8kb/vram512_8kb.conf b/build/config_simd512_8kb/vram512_8kb.conf new file mode 100644 index 0000000..b60fc47 --- /dev/null +++ b/build/config_simd512_8kb/vram512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd512_8kb/vram512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/aemax_8kb.conf b/build/config_simd_8kb copy/aemax_8kb.conf new file mode 100644 index 0000000..d62bff7 --- /dev/null +++ b/build/config_simd_8kb copy/aemax_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/aemax_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/aemin_8kb.conf b/build/config_simd_8kb copy/aemin_8kb.conf new file mode 100644 index 0000000..f19eeda --- /dev/null +++ b/build/config_simd_8kb copy/aemin_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/aemin_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/crc32_8kb.conf b/build/config_simd_8kb copy/crc32_8kb.conf new file mode 100644 index 0000000..6eaf0b1 --- /dev/null +++ b/build/config_simd_8kb copy/crc32_8kb.conf @@ -0,0 +1,15 @@ +# General Parameters +chunking_algo = crc +hashing_algo = xxhash128 +output_file = ./hashes_simd_8kb/crc32_8kb.out +buffer_size = 32768 +simd_mode=none + +# 8k Avg Chunk Size +crc_hash_bits=13 +crc_window_size=256 +crc_window_step_size=1 + +crc_min_block_size=1024 +crc_avg_block_size=8192 +crc_max_block_size=32768 diff --git a/build/config_simd_8kb copy/fastcdc_8kb.conf b/build/config_simd_8kb copy/fastcdc_8kb.conf new file mode 100644 index 0000000..5e2e6b4 --- /dev/null +++ b/build/config_simd_8kb copy/fastcdc_8kb.conf @@ -0,0 +1,14 @@ +# General Parameters +chunking_algo=fastcdc +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/fastcdc_8kb.out +simd_mode=none + +buffer_size=32768 + +# FastCDC Parameters +fastcdc_min_block_size=2048 +fastcdc_avg_block_size=8192 +fastcdc_max_block_size=32768 +fastcdc_normalization_level=2 +fastcdc_disable_normalization=false \ No newline at end of file diff --git a/build/config_simd_8kb copy/gear_8kb.conf b/build/config_simd_8kb copy/gear_8kb.conf new file mode 100644 index 0000000..306e70a --- /dev/null +++ b/build/config_simd_8kb copy/gear_8kb.conf @@ -0,0 +1,12 @@ +# General Parameters +chunking_algo=gear +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/gear_8kb.out +simd_mode=none + +buffer_size=32768 + +# Gear Chunking parameters +gear_min_block_size=2048 +gear_avg_block_size=8192 +gear_max_block_size=32768 diff --git a/build/config_simd_8kb copy/maxp_8kb.conf b/build/config_simd_8kb copy/maxp_8kb.conf new file mode 100644 index 0000000..65e48e7 --- /dev/null +++ b/build/config_simd_8kb copy/maxp_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/maxp_8kb.out +simd_mode=none + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_8kb copy/rabins_8kb.conf b/build/config_simd_8kb copy/rabins_8kb.conf new file mode 100644 index 0000000..b5b2919 --- /dev/null +++ b/build/config_simd_8kb copy/rabins_8kb.conf @@ -0,0 +1,13 @@ +# General Parameters +chunking_algo=rabins +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/rabins_8kb.out +simd_mode=none + +buffer_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/config_simd_8kb copy/ram_8kb.conf b/build/config_simd_8kb copy/ram_8kb.conf new file mode 100644 index 0000000..1ec14ef --- /dev/null +++ b/build/config_simd_8kb copy/ram_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/ram_8kb.out +simd_mode=none + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/tttd_8kb.conf b/build/config_simd_8kb copy/tttd_8kb.conf new file mode 100644 index 0000000..aa168d0 --- /dev/null +++ b/build/config_simd_8kb copy/tttd_8kb.conf @@ -0,0 +1,18 @@ +# General Parameters +chunking_algo=tttd +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/tttd_8kb.out +simd_mode=none + +buffer_size=32768 + +# TTTD Parameters +tttd_min_block_size=2048 +tttd_avg_block_size=8192 +tttd_max_block_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/config_simd_8kb copy/vaemax256_8kb.conf b/build/config_simd_8kb copy/vaemax256_8kb.conf new file mode 100644 index 0000000..1f02697 --- /dev/null +++ b/build/config_simd_8kb copy/vaemax256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/vaemax256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/vaemin256_8kb.conf b/build/config_simd_8kb copy/vaemin256_8kb.conf new file mode 100644 index 0000000..985dc71 --- /dev/null +++ b/build/config_simd_8kb copy/vaemin256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/vaemin256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_8kb copy/vmaxp256_8kb.conf b/build/config_simd_8kb copy/vmaxp256_8kb.conf new file mode 100644 index 0000000..0fe58b9 --- /dev/null +++ b/build/config_simd_8kb copy/vmaxp256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/vmaxp256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_8kb copy/vram256_8kb.conf b/build/config_simd_8kb copy/vram256_8kb.conf new file mode 100644 index 0000000..10e0484 --- /dev/null +++ b/build/config_simd_8kb copy/vram256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_8kb/vram256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/aemax_8kb.conf b/build/config_simd_compare_8kb/aemax_8kb.conf new file mode 100644 index 0000000..c556b88 --- /dev/null +++ b/build/config_simd_compare_8kb/aemax_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/aemax_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/aemin_8kb.conf b/build/config_simd_compare_8kb/aemin_8kb.conf new file mode 100644 index 0000000..b18d789 --- /dev/null +++ b/build/config_simd_compare_8kb/aemin_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/aemin_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/maxp_8kb.conf b/build/config_simd_compare_8kb/maxp_8kb.conf new file mode 100644 index 0000000..7603a4a --- /dev/null +++ b/build/config_simd_compare_8kb/maxp_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/maxp_8kb.out +simd_mode=none + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/ram_8kb.conf b/build/config_simd_compare_8kb/ram_8kb.conf new file mode 100644 index 0000000..d0f978f --- /dev/null +++ b/build/config_simd_compare_8kb/ram_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/ram_8kb.out +simd_mode=none + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemax128_8kb.conf b/build/config_simd_compare_8kb/vaemax128_8kb.conf new file mode 100644 index 0000000..6ae1bad --- /dev/null +++ b/build/config_simd_compare_8kb/vaemax128_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemax128_8kb.out +simd_mode=sse128 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemax256_8kb.conf b/build/config_simd_compare_8kb/vaemax256_8kb.conf new file mode 100644 index 0000000..db54b5a --- /dev/null +++ b/build/config_simd_compare_8kb/vaemax256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemax256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemax512_8kb.conf b/build/config_simd_compare_8kb/vaemax512_8kb.conf new file mode 100644 index 0000000..b31813d --- /dev/null +++ b/build/config_simd_compare_8kb/vaemax512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemax512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemin128_8kb.conf b/build/config_simd_compare_8kb/vaemin128_8kb.conf new file mode 100644 index 0000000..ffde8ba --- /dev/null +++ b/build/config_simd_compare_8kb/vaemin128_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemin128_8kb.out +simd_mode=sse128 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemin256_8kb.conf b/build/config_simd_compare_8kb/vaemin256_8kb.conf new file mode 100644 index 0000000..92834eb --- /dev/null +++ b/build/config_simd_compare_8kb/vaemin256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemin256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vaemin512_8kb.conf b/build/config_simd_compare_8kb/vaemin512_8kb.conf new file mode 100644 index 0000000..7c50faf --- /dev/null +++ b/build/config_simd_compare_8kb/vaemin512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vaemin512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vmaxp128_8kb.conf b/build/config_simd_compare_8kb/vmaxp128_8kb.conf new file mode 100644 index 0000000..110befa --- /dev/null +++ b/build/config_simd_compare_8kb/vmaxp128_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vmaxp128_8kb.out +simd_mode=sse128 + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vmaxp256_8kb.conf b/build/config_simd_compare_8kb/vmaxp256_8kb.conf new file mode 100644 index 0000000..0e55d71 --- /dev/null +++ b/build/config_simd_compare_8kb/vmaxp256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vmaxp256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vmaxp512_8kb.conf b/build/config_simd_compare_8kb/vmaxp512_8kb.conf new file mode 100644 index 0000000..e6f45c6 --- /dev/null +++ b/build/config_simd_compare_8kb/vmaxp512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vmaxp512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vram128_8kb.conf b/build/config_simd_compare_8kb/vram128_8kb.conf new file mode 100644 index 0000000..b7fcb85 --- /dev/null +++ b/build/config_simd_compare_8kb/vram128_8kb.conf @@ -0,0 +1,12 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vram128_8kb.out +simd_mode=sse128 + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 + diff --git a/build/config_simd_compare_8kb/vram256_8kb.conf b/build/config_simd_compare_8kb/vram256_8kb.conf new file mode 100644 index 0000000..3c0913c --- /dev/null +++ b/build/config_simd_compare_8kb/vram256_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vram256_8kb.out +simd_mode=avx256 + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_simd_compare_8kb/vram512_8kb.conf b/build/config_simd_compare_8kb/vram512_8kb.conf new file mode 100644 index 0000000..c81790f --- /dev/null +++ b/build/config_simd_compare_8kb/vram512_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_simd_compare_8kb/vram512_8kb.out +simd_mode=avx512 + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/aemax_8kb.conf b/build/config_unaccelerated_8kb/aemax_8kb.conf new file mode 100644 index 0000000..5f86e5a --- /dev/null +++ b/build/config_unaccelerated_8kb/aemax_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/aemax_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=max +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/aemin_8kb.conf b/build/config_unaccelerated_8kb/aemin_8kb.conf new file mode 100644 index 0000000..9d95668 --- /dev/null +++ b/build/config_unaccelerated_8kb/aemin_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ae +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/aemin_8kb.out +simd_mode=none + +buffer_size=32768 + +# AE Parameters +ae_extreme_mode=min +ae_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/crc32_8kb.conf b/build/config_unaccelerated_8kb/crc32_8kb.conf new file mode 100644 index 0000000..7ba397c --- /dev/null +++ b/build/config_unaccelerated_8kb/crc32_8kb.conf @@ -0,0 +1,15 @@ +# General Parameters +chunking_algo = crc +hashing_algo = xxhash128 +output_file = ./hashes_unaccelerated_8kb/crc32_8kb.out +buffer_size = 32768 +simd_mode=none + +# 8k Avg Chunk Size +crc_hash_bits=13 +crc_window_size=256 +crc_window_step_size=1 + +crc_min_block_size=1024 +crc_avg_block_size=8192 +crc_max_block_size=32768 diff --git a/build/config_unaccelerated_8kb/fastcdc_8kb.conf b/build/config_unaccelerated_8kb/fastcdc_8kb.conf new file mode 100644 index 0000000..ad6c3ff --- /dev/null +++ b/build/config_unaccelerated_8kb/fastcdc_8kb.conf @@ -0,0 +1,14 @@ +# General Parameters +chunking_algo=fastcdc +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/fastcdc_8kb.out +simd_mode=none + +buffer_size=32768 + +# FastCDC Parameters +fastcdc_min_block_size=2048 +fastcdc_avg_block_size=8192 +fastcdc_max_block_size=32768 +fastcdc_normalization_level=2 +fastcdc_disable_normalization=false \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/gear_8kb.conf b/build/config_unaccelerated_8kb/gear_8kb.conf new file mode 100644 index 0000000..1a375b2 --- /dev/null +++ b/build/config_unaccelerated_8kb/gear_8kb.conf @@ -0,0 +1,12 @@ +# General Parameters +chunking_algo=gear +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/gear_8kb.out +simd_mode=none + +buffer_size=32768 + +# Gear Chunking parameters +gear_min_block_size=2048 +gear_avg_block_size=8192 +gear_max_block_size=32768 diff --git a/build/config_unaccelerated_8kb/maxp_8kb.conf b/build/config_unaccelerated_8kb/maxp_8kb.conf new file mode 100644 index 0000000..1b01b6c --- /dev/null +++ b/build/config_unaccelerated_8kb/maxp_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=maxp +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/maxp_8kb.out +simd_mode=none + +buffer_size=32768 + +# MAXP Parameters +maxp_window_size = 960 +maxp_max_block_size = 32768 \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/rabins_8kb.conf b/build/config_unaccelerated_8kb/rabins_8kb.conf new file mode 100644 index 0000000..bc7bc6e --- /dev/null +++ b/build/config_unaccelerated_8kb/rabins_8kb.conf @@ -0,0 +1,13 @@ +# General Parameters +chunking_algo=rabins +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/rabins_8kb.out +simd_mode=none + +buffer_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/config_unaccelerated_8kb/ram_8kb.conf b/build/config_unaccelerated_8kb/ram_8kb.conf new file mode 100644 index 0000000..ffb7afa --- /dev/null +++ b/build/config_unaccelerated_8kb/ram_8kb.conf @@ -0,0 +1,11 @@ +# General Parameters +chunking_algo=ram +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/ram_8kb.out +simd_mode=none + +buffer_size=32768 + +# RAM Parameters +ram_max_block_size=32768 +ram_avg_block_size=8448 \ No newline at end of file diff --git a/build/config_unaccelerated_8kb/tttd_8kb.conf b/build/config_unaccelerated_8kb/tttd_8kb.conf new file mode 100644 index 0000000..7fa9420 --- /dev/null +++ b/build/config_unaccelerated_8kb/tttd_8kb.conf @@ -0,0 +1,18 @@ +# General Parameters +chunking_algo=tttd +hashing_algo=xxhash128 +output_file=./hashes_unaccelerated_8kb/tttd_8kb.out +simd_mode=none + +buffer_size=32768 + +# TTTD Parameters +tttd_min_block_size=2048 +tttd_avg_block_size=8192 +tttd_max_block_size=32768 + +# Rabin Chunking Parameters +rabinc_min_block_size=2048 +rabinc_avg_block_size=8192 +rabinc_max_block_size=32768 +rabinc_window_size=48 diff --git a/build/dedup_script.sh b/build/dedup_script.sh index 98c0e17..0552747 100755 --- a/build/dedup_script.sh +++ b/build/dedup_script.sh @@ -11,9 +11,7 @@ function display_help() { echo "Usage: $0 [OPTIONS] " echo "Options:" echo " -h, --help Show this help message" - echo " -c, Compress all files in the given directory" - echo " -s, Silent mode (no output to console)" - echo " -t, current time or suffix for output naming" + echo " -c, config directory name (without config_ included) Eg: -c simd_8kb" exit 1 } @@ -24,14 +22,6 @@ while (( "$#" )); do display_help ;; -c) - COMPRESS=true - shift - ;; - -s) - SILENT=true - shift - ;; - -t) now="$2" shift 2 ;; @@ -53,17 +43,12 @@ if [[ -z "$DIRECTORY" ]]; then exit 1 fi -if [[ $COMPRESS == true ]]; then - # Compress all files in the given directory - if [[ $SILENT == false ]]; then echo "Compressing files in directory $DIRECTORY"; fi - gzip $DIRECTORY/* -fi - # Execute dedup.exe with each config file if [[ $SILENT == false ]]; then echo "Running dedup.exe and ./measure-dedup.exe for each configuration file"; fi -rm results.txt -rm -rf ./hashes_${now}/ -mkdir ./hashes_${now}/ +rm -f results.txt +rm -rf "./hashes_${now}/" +mkdir "./hashes_${now}/" +echo "Dataset path: $DIRECTORY" >> ./results.txt for config_file in $(ls config_${now}); do echo "==================" >> ./results.txt echo $config_file >> ./results.txt diff --git a/build/plot_results.py b/build/plot_results.py new file mode 100755 index 0000000..801b9fa --- /dev/null +++ b/build/plot_results.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +import seaborn as sns +import matplotlib.pyplot as plt +import os ,sys + +# Dictionary to hold throughput values +throughput_vals = {} +space_savings_vals = {} +dataset_name = "" + +rename_dict = { "aemin":"AE-Min", + "aemax":"AE-Max", + "crc32":"CRC32", + "fastcdc":"FastCDC", + "gear":"Gear", + "maxp":"MAXP", + "rabins":"Rabin", + "ram":"RAM", + "tttd":"TTTD", + "vaemax128":"VAEMax-128", + "vaemax256":"VAEMax-256", + "vaemax512":"VAEMax-512", + "vaemin128":"VAEMin-128", + "vaemin256":"VAEMin-256", + "vaemin512":"VAEMin-512", + "vmaxp128":"VMAXP-128", + "vmaxp256":"VMAXP-256", + "vmaxp512":"VMAXP-512", + "vram128":"VRAM-128", + "vram256":"VRAM-256", + "vram512":"VRAM-512", +} + +def build_custom_order(keys): + custom_order = [] + if("CRC32" in keys): + custom_order.append("CRC32") + if("FastCDC" in keys): + custom_order.append("FastCDC") + if("Gear" in keys): + custom_order.append("Gear") + if("Rabin" in keys): + custom_order.append("Rabin") + if("TTTD" in keys): + custom_order.append("TTTD") + + if("AE-Max" in keys): + custom_order.append("AE-Max") + if("VAEMax-128" in keys): + custom_order.append("VAEMax-128") + if("VAEMax-256" in keys): + custom_order.append("VAEMax-256") + if("VAEMax-512" in keys): + custom_order.append("VAEMax-512") + + if("AE-Min" in keys): + custom_order.append("AE-Min") + if("VAEMin-128" in keys): + custom_order.append("VAEMin-128") + if("VAEMin-256" in keys): + custom_order.append("VAEMin-256") + if("VAEMin-512" in keys): + custom_order.append("VAEMin-512") + + if("MAXP" in keys): + custom_order.append("MAXP") + if("VMAXP-128" in keys): + custom_order.append("VMAXP-128") + if("VMAXP-256" in keys): + custom_order.append("VMAXP-256") + if("VMAXP-512" in keys): + custom_order.append("VMAXP-512") + + if("RAM" in keys): + custom_order.append("RAM") + if("VRAM-128" in keys): + custom_order.append("VRAM-128") + if("VRAM-256" in keys): + custom_order.append("VRAM-256") + if("VRAM-512" in keys): + custom_order.append("VRAM-512") + + return custom_order + + +# Read results.txt generated by dedup_script.sh +def read_src_throughput(results_file_path): + global throughput_vals, dataset_name + with open(results_file_path, 'r') as results_file: + curr_algo = None + curr_throughput = None + for line in results_file: + if("Dataset path:" in line): + dataset_name = line.split(":")[1].strip() + if(dataset_name[-1] == "/"): + dataset_name = dataset_name[:-1] + dataset_name = os.path.basename(dataset_name) + if(".conf" in line): + curr_algo = line.strip().split(".")[0].strip().split("_")[0].strip() + if(curr_algo in rename_dict.keys()): + curr_algo = rename_dict[curr_algo] + elif("Chunking Throughput" in line): + curr_throughput = float(line.strip().split(":")[1].strip()) / 1000 # Convert throughput to GB/s + if(curr_algo == None): + raise Exception("Throughput value detected but no chunking technique? Check results file format for errors.") + throughput_vals[curr_algo] = curr_throughput + curr_throughput = None + curr_algo = None + +def read_src_spacesavings(results_file_path): + global space_savings_vals + with open(results_file_path, 'r') as results_file: + curr_algo = None + space_savings = None + for line in results_file: + if(".conf" in line): + curr_algo = line.strip().split(".")[0].strip().split("_")[0].strip() + if(curr_algo in rename_dict.keys()): + curr_algo = rename_dict[curr_algo] + elif("Space savings" in line): + space_savings = float(line.strip().split(":")[1].strip().strip("%")) + if(curr_algo == None): + raise Exception("Space savings value detected but no chunking technique? Check results file format for errors.") + space_savings_vals[curr_algo] = space_savings + space_savings = None + curr_algo = None + +# Plot bar graph +def plot_results(): + global throughput_vals + sns.set_style('darkgrid') + sns.set_context("poster") + plt.rcParams['figure.dpi'] = 300 + plt.tick_params(axis='x', labelsize=26) + plt.tick_params(axis='y', labelsize=26) + + fig, ax = plt.subplots(2, 1, figsize=(32, 24)) + + custom_order = build_custom_order(space_savings_vals.keys()) + + sns.barplot(x = space_savings_vals.keys(), y = space_savings_vals.values(), ax=ax[0], gap=0.5, hue=space_savings_vals.keys(), legend=False, palette="inferno", edgecolor='black', order=custom_order, hue_order=custom_order) + ax[0].set_xlabel("CDC Algorithm", fontsize=30) + ax[0].set_ylabel("Space Savings (%)", fontsize=30) + ax[0].set_title("Space Savings", fontsize=45) + + custom_order = build_custom_order(throughput_vals.keys()) + + sns.barplot(x = throughput_vals.keys(), y = throughput_vals.values(), ax=ax[1], gap=0.5, hue=throughput_vals.keys(), legend=False, palette="inferno", edgecolor='black', order=custom_order, hue_order=custom_order) + ax[1].set_xlabel("CDC Algorithm", fontsize=30) + ax[1].set_ylabel("Throughput (GB/s)", fontsize=30) + ax[1].set_title("Chunking Throughput", fontsize=45) + + plt.suptitle("Dataset statistics for " + dataset_name, fontsize=60) + plt.tight_layout() + + plt.savefig('results_graph.png') + + + + +if(__name__ == "__main__"): + if(len(sys.argv) != 2): + print("Usage: python3 plot_results.py ") + print("\t : Path to results.txt file generated by dedup_script.sh") + sys.exit(0) + + results_path = sys.argv[1] + read_src_throughput(results_path) + read_src_spacesavings(results_path) + plot_results() + + print("Throughput graph plotted and saved as results_graph.png.") + diff --git a/build/plot_throughput_graph.py b/build/plot_throughput_graph.py deleted file mode 100644 index 5028a17..0000000 --- a/build/plot_throughput_graph.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -import seaborn as sns -import matplotlib.pyplot as plt -import os ,sys - -# Dictionary to hold throughput values -throughput_vals = {} - -# Read results.txt generated by dedup_script.sh -def read_src(resultS_file_path): - global throughput_vals - with open(resultS_file_path, 'r') as results_file: - curr_algo = None - curr_throughput = None - for line in results_file: - if(".conf" in line): - curr_algo = line.strip().split(".")[0].strip().split("_")[0].strip() - elif("Chunking Throughput" in line): - curr_throughput = float(line.strip().split(":")[1].strip()) - throughput_vals[curr_algo] = curr_throughput - if(curr_algo == None): - raise Exception("Throughput value detected but no chunking technique? Check results file format for errors.") - curr_throughput = None - curr_algo = None - -# Plot bar graph -def plot_results(): - global throughput_vals - sns.barplot(x = throughput_vals.keys(), y = throughput_vals.values()) - plt.tick_params(axis='x', labelsize=8) - plt.xlabel("CDC Algorithm") - plt.ylabel("Throughput (MB/s)") - plt.suptitle("Throughput of CDC Algorithms on given dataset") - plt.savefig('results_graph.png') - - - - -if(__name__ == "__main__"): - if(len(sys.argv) != 2): - print("Usage: python3 plot_throughput_graph.py ") - print("\t : Path to results.txt file generated by dedup_script.sh") - sys.exit(0) - - results_path = sys.argv[1] - read_src(results_path) - plot_results() - - print("Throughput graph plotted and saved as results_graph.png.") - diff --git a/dedup/build/Makefile b/dedup/build/Makefile index 329bb00..bd1bd99 100644 --- a/dedup/build/Makefile +++ b/dedup/build/Makefile @@ -20,7 +20,7 @@ INCLUDE_FLAGS += -I ${INCLUDE_PATH_HASHING} INCLUDE_FLAGS += -I ${INCLUDE_PATH_CONFIG} INCLUDE_FLAGS += -I ${INCLUDE_PATH_OPENSSL} -LD_FLAGS = -L /usr/local/opt/openssl@3/lib -lcrypto +LD_FLAGS = -L /usr/local/opt/openssl@3/lib -lcrypto -lxxhash SRC_PATH = ../src SRC_MAIN = $(wildcard $(SRC_PATH)/*.cpp) @@ -41,14 +41,20 @@ SRC_CONFIG = $(wildcard $(SRC_PATH_CONFIG)/*.cpp) OBJS_CONFIG = $(SRC_CONFIG:$(SRC_PATH_CONFIG)/%.cpp=%.o) DEPS_CONFIG = $(wildcard $(INCLUDE_PATH_CONFIG)/*.hpp) +# Only compiles unaccelerated code by default. +# To enable acceleration, pass the appropriate flags as EXTRA_COMPILER_FLAGS. +# For SSE-128, pass '-msse -msse2 -msse3 -msse4.1' as EXTRA_COMPILER_FLAGS +# For AVX-256, pass '-mavx -mavx2' as EXTRA_COMPILER_FLAGS +# For BMI2, pass '-mbmi -mbmi2' as EXTRA_COMPILER_FLAGS (used only in VSEQ) +# For AVX-512, pass '-mavx512f -mavx512vl -mavx512bw' as EXTRA_COMPILER_FLAGS -COMPILER_FLAGS= -std=c++17 -Wall -Wextra -Wno-format -msse -msse2 -msse3 -msse4.1 -mavx -mavx2 -O3 ${EXTRA_COMPILER_FLAGS} +# For everything except AVX-512, pass '-msse -msse2 -msse3 -msse4.1 -mavx -mavx2 -mbmi -mbmi2' as EXTRA_COMPILER_FLAGS + +COMPILER_FLAGS= -std=c++17 -Wall -Wextra -Wno-format -O3 -Wno-implicit-fallthrough ${EXTRA_COMPILER_FLAGS} CC = g++ RM = rm -f -# For AVX-512, pass '-mavx512f -mavx512vl -mavx512bw' as EXTRA_COMPILER_FLAGS - DEBUG = 1 ifeq ($(DEBUG), 1) diff --git a/dedup/include/chunking/ae_chunking.hpp b/dedup/include/chunking/ae_chunking.hpp index bc772ba..1947736 100644 --- a/dedup/include/chunking/ae_chunking.hpp +++ b/dedup/include/chunking/ae_chunking.hpp @@ -4,20 +4,43 @@ #include #include -#include "chunking_common.hpp" +#include "avx_chunking_common.hpp" #include "config.hpp" #include #define DEFAULT_AE_AVG_BLOCK_SIZE 4096 -class AE_Chunking : public virtual Chunking_Technique { +class AE_Chunking : public virtual AVX_Chunking_Technique { private: uint64_t avg_block_size; uint64_t window_size; uint64_t curr_pos; AE_Mode extreme_mode; + /** + * @brief SSE128, AVX256 and AVX512 arrays for SIMD operations + * + */ + #if defined(__SSE3__) + __m128i* sse_array; + #endif + + #if defined(__AVX2__) + __m256i* avx256_array; + #endif + + #if defined(__AVX512F__) + __m512i* avx512_array; + #endif + + #if defined(__ARM_NEON) + uint8x16_t* neon_array; + #endif + + #if defined(__ALTIVEC__) + __vector unsigned char* altivec_array; + #endif /** * @brief finds the next cut point in an array of bytes @@ -26,8 +49,30 @@ class AE_Chunking : public virtual Chunking_Technique { * @return: cutpoint position in the buffer */ uint64_t find_cutpoint(char* buff, uint64_t size) override; + uint64_t find_cutpoint_native(char* buff, uint64_t size); + + #if defined(__SSE3__) + uint64_t find_cutpoint_sse128(char* buff, uint64_t size); + #endif + + #if defined(__AVX2__) + uint64_t find_cutpoint_avx256(char* buff, uint64_t size); + #endif + + #if defined(__AVX512F__) + uint64_t find_cutpoint_avx512(char* buff, uint64_t size); + #endif + + #if defined(__ARM_NEON) + uint64_t find_cutpoint_neon(char* buff, uint64_t size); + #endif + + #if defined(__ALTIVEC__) + uint64_t find_cutpoint_altivec(char* buff, uint64_t size); + #endif + - public: + public: /** * @brief Default constructor. * @return: void diff --git a/dedup/include/chunking/avx_chunking_common.hpp b/dedup/include/chunking/avx_chunking_common.hpp index d584193..7d0752b 100644 --- a/dedup/include/chunking/avx_chunking_common.hpp +++ b/dedup/include/chunking/avx_chunking_common.hpp @@ -2,7 +2,16 @@ #define _AVX_COMMON_CHUNKING_ #include -#include + +#if defined(__SSE3__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__BMI2__) + #include +#elif defined(__ARM_NEON) + #include +#elif defined(__ALTIVEC__) + #include + #undef vector // Avoid conflict with std::vector + #undef bool // Avoid conflict with std::bool +#endif #include #include @@ -19,27 +28,211 @@ #define AVX512_REGISTER_SIZE_BYTES 64 #define AVX512_REGISTER_SIZE_INT32 16 +#define NEON_REGISTER_SIZE_BITS 128 +#define NEON_REGISTER_SIZE_BYTES 16 + +#define ALTIVEC_REGISTER_SIZE_BITS 128 +#define ALTIVEC_REGISTER_SIZE_BYTES 16 + class AVX_Chunking_Technique: public virtual Chunking_Technique{ protected: SIMD_Mode simd_mode; - const __m128i K_INV_ZERO = _mm_set1_epi8(0xFF);//_mm_set1_epi8(-1); + + #ifdef __SSE3__ + const __m128i K_INV_ZERO = _mm_set1_epi8(0xFF); + #endif + + #ifdef __AVX2__ + const __m256i K_INV_ZERO_256 = _mm256_set1_epi8(0xFF); + #endif + + // For debugging only + uint64_t chunk_counter; public: /** - * @brief Finds maximum value in the region between within the data stream *buff using 128-bit SSE instructions + * @brief Finds maximum value in the region between within the data stream *buff using SIMD instructions * * @param buff Data Stream * @param start_pos Starting position of scanned region * @param end_pos Ending position of scanned region * @return uint8_t Maximum Value */ - uint8_t find_maximum_sse128(char *buff, uint64_t start_pos, uint64_t end_pos, __m128i *xmm_array); - uint8_t find_maximum_avx256(char *buff, uint64_t start_pos, uint64_t end_pos, __m256i *xmm_array); + #ifdef __SSE3__ + uint8_t find_maximum_sse128(char *buff, uint64_t start_pos, uint64_t end_pos, __m128i *xmm_array); + #endif + + #ifdef __AVX2__ + uint8_t find_maximum_avx256(char *buff, uint64_t start_pos, uint64_t end_pos, __m256i *ymm_array); + #endif + + #if defined(__AVX512F__) + uint8_t find_maximum_avx512(char *buff, uint64_t start_pos, uint64_t end_pos, __m512i *zmm_array); + #endif + + #if defined(__ARM_NEON) + uint8_t find_maximum_neon(char *buff, uint64_t start_pos, uint64_t end_pos, uint8x16_t *neon_array); + #endif + + #ifdef __ALTIVEC__ + uint8_t find_maximum_altivec(char *buff, uint64_t start_pos, uint64_t end_pos, __vector unsigned char *vec_array); + #endif + + /** + * @brief Finds minimum value in the region between within the data stream *buff using SIMD instructions + * + * @param buff Data Stream + * @param start_pos Starting position of scanned region + * @param end_pos Ending position of scanned region + * @return uint8_t Minimum Value + */ + + #ifdef __SSE3__ + uint8_t find_minimum_sse128(char *buff, uint64_t start_pos, uint64_t end_pos, __m128i *xmm_array); + #endif + + #ifdef __AVX2__ + uint8_t find_minimum_avx256(char *buff, uint64_t start_pos, uint64_t end_pos, __m256i *ymm_array); + #endif + + #if defined(__AVX512F__) + uint8_t find_minimum_avx512(char *buff, uint64_t start_pos, uint64_t end_pos, __m512i *zmm_array); + #endif + + #if defined(__ARM_NEON) + uint8_t find_minimum_neon(char *buff, uint64_t start_pos, uint64_t end_pos, uint8x16_t *neon_array); + #endif + + #ifdef __ALTIVEC__ + uint8_t find_minimum_altivec(char *buff, uint64_t start_pos, uint64_t end_pos, __vector unsigned char *vec_array); + #endif + + /** + * @brief Executes a range scan comparing bytes serially against the target value between within the data stream *buff using SIMD instructions. + * Comparison operator: GEQ (>=) + * + * @param buff Data Stream + * @param start_pos Starting position of scanned region + * @param end_pos Ending position of scanned region + * @param target_value Target value + * @return uint64_t Position of first matching candidate. + * @return end_position if no match found + */ + #ifdef __SSE3__ + uint64_t range_scan_geq_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #ifdef __AVX2__ + uint64_t range_scan_geq_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__AVX512F__) + uint64_t range_scan_geq_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ARM_NEON) + uint64_t range_scan_geq_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ALTIVEC__) + uint64_t range_scan_geq_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + /** + * @brief Executes a range scan comparing bytes serially against the target value between within the data stream *buff using SIMD instructions. + * Comparison operator: GT (>) + * + * @param buff Data Stream + * @param start_pos Starting position of scanned region + * @param end_pos Ending position of scanned region + * @param target_value Target value + * @return uint64_t Position of first matching candidate. + * @return end_position if no match found + */ + #ifdef __SSE3__ + uint64_t range_scan_gt_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #ifdef __AVX2__ + uint64_t range_scan_gt_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__AVX512F__) + uint64_t range_scan_gt_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ARM_NEON) + uint64_t range_scan_gt_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ALTIVEC__) + uint64_t range_scan_gt_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + /** + * @brief Executes a range scan comparing bytes serially against the target value between within the data stream *buff using SIMD instructions. + * Comparison operator: LEQ (<=) + * + * @param buff Data Stream + * @param start_pos Starting position of scanned region + * @param end_pos Ending position of scanned region + * @param target_value Target value + * @return uint64_t Position of first matching candidate. + * @return end_position if no match found + */ + #ifdef __SSE3__ + uint64_t range_scan_leq_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #ifdef __AVX2__ + uint64_t range_scan_leq_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__AVX512F__) + uint64_t range_scan_leq_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ARM_NEON) + uint64_t range_scan_leq_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #ifdef __ALTIVEC__ + uint64_t range_scan_leq_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + /** + * @brief Executes a range scan comparing bytes serially against the target value between within the data stream *buff using SIMD instructions. + * Comparison operator: LT (<) + * + * @param buff Data Stream + * @param start_pos Starting position of scanned region + * @param end_pos Ending position of scanned region + * @param target_value Target value + * @return uint64_t Position of first matching candidate. + * @return end_position if no match found + */ + #ifdef __SSE3__ + uint64_t range_scan_lt_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #ifdef __AVX2__ + uint64_t range_scan_lt_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + #if defined(__AVX512F__) - uint8_t find_maximum_avx512(char *buff, uint64_t start_pos, uint64_t end_pos, __m512i *ymm_array); + uint64_t range_scan_lt_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + #if defined(__ARM_NEON) + uint64_t range_scan_lt_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); #endif + #ifdef __ALTIVEC__ + uint64_t range_scan_lt_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value); + #endif + + /** * @brief Helper functions to compare 2 128-bit XMM registers * @@ -47,28 +240,42 @@ class AVX_Chunking_Technique: public virtual Chunking_Technique{ * @param b Second XMM register containing 8-bit packed values * @return __m128i Result of bytewise comparison */ - inline __m128i Greater8uSSE(__m128i a, __m128i b){ - return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_min_epu8(a, b), a), K_INV_ZERO); - } + #ifdef __SSE3__ + inline __m128i Greater8uSSE(__m128i a, __m128i b){ + return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_min_epu8(a, b), a), K_INV_ZERO); + } - inline __m128i GreaterOrEqual8uSSE(__m128i a, __m128i b){ - return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a); - } + inline __m128i GreaterOrEqual8uSSE(__m128i a, __m128i b){ + return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a); + } - inline __m128i Lesser8uSSE(__m128i a, __m128i b){ - return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_max_epu8(a, b), a), K_INV_ZERO); - } + inline __m128i Lesser8uSSE(__m128i a, __m128i b){ + return _mm_andnot_si128(_mm_cmpeq_epi8(_mm_max_epu8(a, b), a), K_INV_ZERO); + } - inline __m128i LesserOrEqual8uSSE(__m128i a, __m128i b){ - return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a); - } - inline __m128i NotEqual8uSSE(__m128i a, __m128i b){ - return _mm_andnot_si128(_mm_cmpeq_epi8(a, b), K_INV_ZERO); - } + inline __m128i LesserOrEqual8uSSE(__m128i a, __m128i b){ + return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a); + } + inline __m128i NotEqual8uSSE(__m128i a, __m128i b){ + return _mm_andnot_si128(_mm_cmpeq_epi8(a, b), K_INV_ZERO); + } + #endif - inline __m256i GreaterOrEqual8uAVX256(__m256i a, __m256i b){ - return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a); - } + #ifdef __AVX2__ + inline __m256i Greater8uAVX256(__m256i a, __m256i b){ + return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a), K_INV_ZERO_256); + } + inline __m256i Lesser8uAVX256(__m256i a, __m256i b){ + return _mm256_andnot_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a), K_INV_ZERO_256); + } + + inline __m256i GreaterOrEqual8uAVX256(__m256i a, __m256i b){ + return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a); + } + inline __m256i LesserOrEqual8uAVX256(__m256i a, __m256i b){ + return _mm256_cmpeq_epi8(_mm256_min_epu8(a, b), a); + } + #endif @@ -108,20 +315,6 @@ class AVX_Chunking_Technique: public virtual Chunking_Technique{ } #endif - - /** - * @brief Check if an integer has any byte with zero in it. - * Found this off the Stanford bitchacks page: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - * @param v Integer (4 bytes) - * @return true If any byte is zero - * @return false If all bytes are > 0 - */ - - inline bool hasZeroByte(unsigned int v) - { - return ~((((v & 0x7F7F7F7F) + 0x7F7F7F7F) | v) | 0x7F7F7F7F); - } - }; #endif \ No newline at end of file diff --git a/dedup/include/chunking/chunking_common.hpp b/dedup/include/chunking/chunking_common.hpp index 5e0b167..ef3b7bc 100644 --- a/dedup/include/chunking/chunking_common.hpp +++ b/dedup/include/chunking/chunking_common.hpp @@ -17,6 +17,9 @@ #include #include #include +#include +#include + #include "hash.hpp" #include "config.hpp" #include "file_chunk.hpp" @@ -47,13 +50,18 @@ class Chunking_Technique{ std::chrono::duration total_time_hashing = std::chrono::duration::zero(); /** - * @brief Chunk a file using a chunking technique and return the struct File_Chunks from this operation + * @brief Chunk a buffer using a chunking technique and return a single chunk boundary from this operation * - * @param file_path: String containing path to file - * @return: Vector of struct File_Chunk + * @param buffer: Data stream of bytes + * @param buffer_size: Size of buffer + * @return: uint64_t indicating boundary position */ - virtual uint64_t find_cutpoint(char*, uint64_t buffer_size){ - return buffer_size; + virtual uint64_t find_cutpoint(char* buffer, uint64_t buffer_size){ + if(buffer != nullptr) + return buffer_size; + else + std::cout << "Null buffer received" << std::endl; + return 0; } /** diff --git a/dedup/include/chunking/maxp_chunking.hpp b/dedup/include/chunking/maxp_chunking.hpp new file mode 100644 index 0000000..c203421 --- /dev/null +++ b/dedup/include/chunking/maxp_chunking.hpp @@ -0,0 +1,88 @@ +#ifndef _MAXP_CHUNKING_ +#define _MAXP_CHUNKING_ + +#include +#include + +#include "avx_chunking_common.hpp" +#include "chunking_common.hpp" +#include "config.hpp" + +#include + +#define DEFAULT_MAXP_WINDOW_SIZE 128 +#define DEFAULT_MAXP_MAX_BLOCK_SIZE 65536 + +class MAXP_Chunking : public virtual AVX_Chunking_Technique { + private: + uint64_t max_block_size; + uint64_t window_size; + + #ifdef __SSE3__ + __m128i *xmm_array; + #endif + + #ifdef __AVX2__ + __m256i *ymm_array; + #endif + + #if defined(__AVX512F__) + __m512i *zmm_array; + #endif + + #if defined(__ARM_NEON) + uint8x16_t *neon_array; + #endif + + #ifdef __ALTIVEC__ + __vector unsigned char *altivec_array; + #endif + + /** + * @brief finds the next cut point in an array of bytes + * @param buff: the buff to find the cutpoint in. + * @param size: the size of the buffer + * @return: cutpoint position in the buffer + */ + uint64_t find_cutpoint(char* buff, uint64_t size) override; + uint64_t find_cutpoint_native(char *buff, uint64_t size); + + #ifdef __SSE3__ + uint64_t find_cutpoint_sse128(char *buff, uint64_t size); + #endif + + #ifdef __AVX2__ + uint64_t find_cutpoint_avx256(char *buff, uint64_t size); + #endif + + #if defined(__AVX512F__) + uint64_t find_cutpoint_avx512(char *buff, uint64_t size); + #endif + + #if defined(__ARM_NEON) + uint64_t find_cutpoint_neon(char *buff, uint64_t size); + #endif + + #ifdef __ALTIVEC__ + uint64_t find_cutpoint_altivec(char *buff, uint64_t size); + #endif + + public: + /** + * @brief Default constructor. + * @return: void + */ + MAXP_Chunking(); + + /** + * @brief Constructor with custom config from a config object + * @param config: the config object + * @return: void + */ + MAXP_Chunking(const Config& config); + + ~MAXP_Chunking(); + +}; + +#endif \ No newline at end of file diff --git a/dedup/include/chunking/ram_chunking.hpp b/dedup/include/chunking/ram_chunking.hpp index 912c6fd..ecf89e7 100644 --- a/dedup/include/chunking/ram_chunking.hpp +++ b/dedup/include/chunking/ram_chunking.hpp @@ -18,9 +18,25 @@ class RAM_Chunking : public virtual AVX_Chunking_Technique { uint64_t window_size; uint64_t curr_pos; - __m128i *sse_array; - __m256i *avx256_array; - __m512i *avx512_array; + #ifdef __SSE3__ + __m128i *sse_array; + #endif + + #ifdef __AVX2__ + __m256i *avx256_array; + #endif + + #if defined(__AVX512F__) + __m512i *avx512_array; + #endif + + #ifdef __ARM_NEON + uint8x16_t *neon_array; + #endif + + #ifdef __ALTIVEC__ + __vector unsigned char *altivec_array; + #endif /** * @brief finds the next cut point in an array of bytes @@ -38,12 +54,6 @@ class RAM_Chunking : public virtual AVX_Chunking_Technique { * @param max_value Max value within fixed size window * @return uint64_t Return position in stream */ - uint64_t get_return_position_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value); - uint64_t get_return_position_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value); - #if defined(__AVX512F__) - uint64_t get_return_position_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value); - #endif - public: /** diff --git a/dedup/include/config/config.hpp b/dedup/include/config/config.hpp index 3b97c06..c2fc5c2 100644 --- a/dedup/include/config/config.hpp +++ b/dedup/include/config/config.hpp @@ -3,8 +3,11 @@ #define _CONFIG_ #include "parser.hpp" +#include + #define CHUNKING_TECH "chunking_algo" #define HASHING_TECH "hashing_algo" +#define CHUNKING_MODE "chunking_mode" #define SIMD_MODE_STRING "simd_mode" #define FC_SIZE "fc_size" @@ -32,6 +35,8 @@ #define CRC_WINDOW_STEP_SIZE "crc_window_step_size" #define CRC_HASH_BITS "crc_hash_bits" #define BUFFER_SIZE "buffer_size" +#define MAXP_WINDOW_SIZE "maxp_window_size" +#define MAXP_MAX_BLOCK_SIZE "maxp_max_block_size" #define SEQ_JUMP_TRIGGER "seq_jump_trigger" #define SEQ_CHUNK_BOUNDARY_THRESHOLD "seq_sequence_threshold" #define SEQ_JUMP_SIZE "seq_jump_size" @@ -57,6 +62,7 @@ enum class ChunkingTech { AE, GEAR, FASTCDC, + MAXP, RAM, EXPERIMENT, CRC, @@ -69,13 +75,18 @@ enum class SIMD_Mode{ SSE128_NOSLIDE, SSE128, AVX256, - AVX512 + AVX512, + NEON, + ALTIVEC }; // define the possible hashing algorithms -enum class HashingTech { MD5, SHA1, SHA256, SHA512 }; +enum class HashingTech { MD5, SHA1, SHA256, SHA512, XXHASH128, MURMURHASH3 }; + // define the the extreme value type of AE algorithm enum AE_Mode { MAX, MIN }; + +// define SeqCDC operating modes enum Seq_Op_Mode { INCREASING, DECREASING }; @@ -93,27 +104,28 @@ class Config { */ ChunkingTech get_chunking_tech() const; - /** - * @brief Get the hashing algorithm specified in the config file. + /** + * @brief Get the chunking mode specified in the config file. * throws ConfigError if the key does not exist or if the value is invalid * - * @return HashingTech + * @return ChunkingMode */ HashingTech get_hashing_tech() const; + /** * @brief Get the SIMD mode for chunking technique * * @return SIMD_Mode */ SIMD_Mode get_simd_mode() const; - + /** * @brief Get the size (in number of bytes) of a chunk when using fixed-size * chunking throws ConfigError if the key does not exist or if the value is * invalid * - * @return HashingTech + * @return uint64_t */ uint64_t get_fc_size() const; @@ -408,6 +420,15 @@ class Config { * * @return uint64_t */ + + uint64_t get_maxp_window_size() const; + uint64_t get_maxp_max_block_size() const; + + /** + * @brief: Get MAXP window size + * + * @return uint64_t + */ }; #endif diff --git a/dedup/include/hashing/murmurhash3_hashing.hpp b/dedup/include/hashing/murmurhash3_hashing.hpp new file mode 100644 index 0000000..b560bbb --- /dev/null +++ b/dedup/include/hashing/murmurhash3_hashing.hpp @@ -0,0 +1,57 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) && (_MSC_VER < 1600) + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ + +// Additions for dedup-bench +//----------------------------------------------------------------------------- + +#include "hashing_common.hpp" + +#define MURMURHASH3_DIGEST_LENGTH 16 + +class MurmurHash3_Hashing : public Hashing_Technique { + /** + * @brief Class to implement MurmurHash3 Hashing + * This class implements the MurmurHash3 hashing technique. + */ + public: + // Function to hash a given chunk + void hash_chunk(File_Chunk& file_chunk) override; + MurmurHash3_Hashing() { + technique_name = "MurmurHash3-Hashing"; + } +}; diff --git a/dedup/include/hashing/xxhash_hashing.hpp b/dedup/include/hashing/xxhash_hashing.hpp new file mode 100644 index 0000000..9eed996 --- /dev/null +++ b/dedup/include/hashing/xxhash_hashing.hpp @@ -0,0 +1,37 @@ +/** + * @author: WASL + * @date: 2025-07-05 + * @brief: Header file for xxHash Hashing Technique + * This file defines the XXHash_Hashing class which implements the Hashing_Technique interface. + * It provides the functionality to hash file chunks using the xxHash algorithm. + * Note: I could only find xxHash128 on GitHub (Cyan4973/xxHash).) + */ + +#ifndef _XXHASH_HASHING_ +#define _XXHASH_HASHING_ + +#define XXHASH_INLINE_ALL /* enable inlining for all functions */ + +#include "hashing_common.hpp" +#include "xxhash.h" + +// Define the digest length for xxHash128 +// This is the size of the hash output in bytes +#define XXH128_DIGEST_LENGTH 16 + +class XXHash_Hashing: public virtual Hashing_Technique{ + /** + * @brief Class to implement xxHash Hashing + * + */ + public: + // Function to hash a given chunk + void hash_chunk(File_Chunk& file_chunk) override; + + XXHash_Hashing() { + technique_name = "xxHash-Hashing"; + } +}; + +#endif + diff --git a/dedup/src/chunking/ae_chunking.cpp b/dedup/src/chunking/ae_chunking.cpp index 0536eac..06a843e 100644 --- a/dedup/src/chunking/ae_chunking.cpp +++ b/dedup/src/chunking/ae_chunking.cpp @@ -16,22 +16,506 @@ AE_Chunking::AE_Chunking() { avg_block_size = DEFAULT_AE_AVG_BLOCK_SIZE; extreme_mode = MAX; technique_name = "AE Chunking"; + + chunk_counter = 0; + + #if defined(__SSE3__) + sse_array = nullptr; + #endif + + #if defined(__AVX2__) + avx256_array = nullptr; + #endif + + #ifdef __AVX512F__ + avx512_array = nullptr; + #endif + + #ifdef __ARM_NEON + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif + + + simd_mode = SIMD_Mode::NONE; } AE_Chunking::AE_Chunking(const Config& config) { extreme_mode = config.get_ae_extreme_mode(); avg_block_size = config.get_ae_avg_block_size(); window_size = avg_block_size - 256; + // window_size = avg_block_size / (exp(1) - 1); // avg_block size / e-1 + - // window_size = avg_block_size / (exp(1) - 1); // avg_block size / e-1 + // Initialize SIMD arrays based on the chosen SIMD mode + + #if defined(__SSE3__) + sse_array = nullptr; + #endif + + #if defined(__AVX2__) + avx256_array = nullptr; + #endif + + #ifdef __AVX512F__ + avx512_array = nullptr; + #endif + + #ifdef __ARM_NEON + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif + + simd_mode = config.get_simd_mode(); + if (simd_mode == SIMD_Mode::NONE) { + + } + #if defined(__SSE3__) + else if (simd_mode == SIMD_Mode::SSE128) { + if(window_size % SSE_REGISTER_SIZE_BYTES != 0 || (window_size / SSE_REGISTER_SIZE_BYTES) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_sse128 + std::cout << "AE window size currently unsupported by SSE128. Please use an even multiple of SSE128_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + sse_array = new __m128i[avg_block_size / SSE_REGISTER_SIZE_BYTES]; + if (sse_array == nullptr) { + std::cout << "Error allocating memory for 128-bit vectors(__m128i)" << std::endl; + exit(1); + } + } + #endif + + #if defined(__AVX2__) + else if (simd_mode == SIMD_Mode::AVX256) { + if(window_size % AVX256_REGISTER_SIZE_BYTES != 0 || (window_size / AVX256_REGISTER_SIZE_BYTES % 2 != 0)) { + // Check if window size is a multiple of 32 + // Check if window size makes an even number of vectors for find_maximum_AVX256 + std::cout << "AE window size currently unsupported by AVX256. Please use an even multiple of AVX_REGISTER_SIZE_BYTES (default 32)." << std::endl; + exit(1); + } + avx256_array = new __m256i[avg_block_size / AVX256_REGISTER_SIZE_BYTES]; + if (avx256_array == nullptr) { + std::cout << "Error allocating memory for 256-bit vectors(__m256i)" << std::endl; + exit(1); + } + } + #endif + #ifdef __AVX512F__ + else if (simd_mode == SIMD_Mode::AVX512) { + if(window_size % AVX512_REGISTER_SIZE_BYTES != 0 || (window_size / AVX512_REGISTER_SIZE_BYTES) % 2 != 0) { + // Check if window size is a multiple of 64 + // Check if window size makes an even number of vectors for find_maximum_AVX512 + std::cout << "AE window size currently unsupported by AVX512. Please use an even multiple of AVX_REGISTER_SIZE_BYTES (default 64)." << std::endl; + exit(1); + } + avx512_array = new __m512i[avg_block_size / AVX512_REGISTER_SIZE_BYTES]; + if (avx512_array == nullptr) { + std::cout << "Error allocating memory for 512-bit vectors(__m512i)" << std::endl; + exit(1); + } + } + #endif + + #ifdef __ARM_NEON + else if (simd_mode == SIMD_Mode::NEON) { + if(window_size % NEON_REGISTER_SIZE_BYTES != 0 || (window_size / NEON_REGISTER_SIZE_BYTES) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_neon + std::cout << "AE window size currently unsupported by NEON. Please use an even multiple of NEON_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + neon_array = new uint8x16_t[avg_block_size / NEON_REGISTER_SIZE_BYTES]; + if (neon_array == nullptr) { + std::cout << "Error allocating memory for NEON vectors(uint8x16_t)" << std::endl; + exit(1); + } + } + #endif + + #ifdef __ALTIVEC__ + else if (simd_mode == SIMD_Mode::ALTIVEC) { + if(window_size % ALTIVEC_REGISTER_SIZE_BYTES != 0 || (window_size / ALTIVEC_REGISTER_SIZE_BYTES) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_altivec + std::cout << "AE window size currently unsupported by ALTIVEC. Please use an even multiple of ALTIVEC_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + altivec_array = new __vector unsigned char[avg_block_size / ALTIVEC_REGISTER_SIZE_BYTES]; + if (altivec_array == nullptr) { + std::cout << "Error allocating memory for ALTIVEC vectors(__vector unsigned char)" << std::endl; + exit(1); + } + } + #endif + + else { + std::cerr << "Error: Unsupported SIMD mode" << std::endl; + exit(1); + } + technique_name = "AE Chunking"; } AE_Chunking::~AE_Chunking() { + #ifdef __SSE3__ + if (sse_array != nullptr) { + delete[] sse_array; + } + #endif + + #ifdef __AVX2__ + if (avx256_array != nullptr) { + delete[] avx256_array; + } + #endif + + #ifdef __AVX512F__ + if (avx512_array != nullptr) { + delete[] avx512_array; + } + #endif + + #ifdef __ARM_NEON + if (neon_array != nullptr) { + delete[] neon_array; + } + #endif + + #ifdef __ALTIVEC__ + if (altivec_array != nullptr) { + delete[] altivec_array; + } + #endif + +} + +uint64_t AE_Chunking::find_cutpoint(char *buff, uint64_t size){ + + // chunk_counter++; + + if(simd_mode == SIMD_Mode::NONE){ + return find_cutpoint_native(buff, size); + } + + #ifdef __SSE3__ + else if(simd_mode == SIMD_Mode::SSE128){ + return find_cutpoint_sse128(buff, size); + } + #endif + + #ifdef __AVX2__ + else if(simd_mode == SIMD_Mode::AVX256){ + return find_cutpoint_avx256(buff, size); + } + #endif + + #ifdef __AVX512F__ + else if(simd_mode == SIMD_Mode::AVX512){ + return find_cutpoint_avx512(buff, size); + } + #endif + + #ifdef __ARM_NEON + else if(simd_mode == SIMD_Mode::NEON){ + return find_cutpoint_neon(buff, size); + } + #endif + + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC){ + return find_cutpoint_altivec(buff, size); + } + #endif + + else{ + std::cerr << "Error: Unsupported SIMD mode" << std::endl; + return 0; + } +} + +#ifdef __SSE3__ +uint64_t AE_Chunking::find_cutpoint_sse128(char* buff, uint64_t size) { + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_max_result; + + uint8_t target_value = (uint8_t) buff[target_pos]; + if(extreme_mode == AE_Mode::MAX){ + while(target_pos < (size - window_size - 1)){ + + return_pos_range_scan = range_scan_gt_sse128(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_max_result = find_maximum_sse128(buff, target_pos + 1, target_pos + 1 + window_size, sse_array); + + if(find_max_result < target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + + return size; + } + else if(extreme_mode == AE_Mode::MIN){ + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_min_result; + uint8_t target_value = (uint8_t) buff[target_pos]; + + while(target_pos < (size - window_size - 1)){ + return_pos_range_scan = range_scan_lt_sse128(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_min_result = find_minimum_sse128(buff, target_pos + 1, target_pos + 1 + window_size, sse_array); + + if(find_min_result > target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + } + + return size; +} +#endif + +#ifdef __AVX2__ +uint64_t AE_Chunking::find_cutpoint_avx256(char* buff, uint64_t size) { + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_max_result; + + uint8_t target_value = (uint8_t) buff[target_pos]; + if(extreme_mode == AE_Mode::MAX){ + while(target_pos < (size - window_size - 1)){ + + return_pos_range_scan = range_scan_gt_avx256(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_max_result = find_maximum_avx256(buff, target_pos + 1, target_pos + 1 + window_size, avx256_array); + + if(find_max_result < target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + + return size; + } + else if(extreme_mode == AE_Mode::MIN){ + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_min_result; + uint8_t target_value = (uint8_t) buff[target_pos]; + + while(target_pos < (size - window_size - 1)){ + return_pos_range_scan = range_scan_lt_avx256(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_min_result = find_minimum_avx256(buff, target_pos + 1, target_pos + 1 + window_size, avx256_array); + + if(find_min_result > target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + } + + return size; +} +#endif + +#if defined(__AVX512F__) +uint64_t AE_Chunking::find_cutpoint_avx512(char* buff, uint64_t size) { + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_max_result; + + uint8_t target_value = (uint8_t) buff[target_pos]; + if(extreme_mode == AE_Mode::MAX){ + while(target_pos < (size - window_size - 1)){ + + return_pos_range_scan = range_scan_gt_avx512(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_max_result = find_maximum_avx512(buff, target_pos + 1, target_pos + 1 + window_size, avx512_array); + + if(find_max_result < target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + + return size; + } + else if(extreme_mode == AE_Mode::MIN){ + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_min_result; + uint8_t target_value = (uint8_t) buff[target_pos]; + + while(target_pos < (size - window_size - 1)){ + return_pos_range_scan = range_scan_lt_avx512(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_min_result = find_minimum_avx512(buff, target_pos + 1, target_pos + 1 + window_size, avx512_array); + + if(find_min_result > target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + } + + return size; +} +#endif + +#ifdef __ARM_NEON +uint64_t AE_Chunking::find_cutpoint_neon(char* buff, uint64_t size) { + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_max_result; + + uint8_t target_value = (uint8_t) buff[target_pos]; + if(extreme_mode == AE_Mode::MAX){ + while(target_pos < (size - window_size - 1)){ + + return_pos_range_scan = range_scan_gt_neon(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_max_result = find_maximum_neon(buff, target_pos + 1, target_pos + 1 + window_size, neon_array); + + if(find_max_result < target_value) + { + return std::min(size, target_pos + window_size); + } + } + else { + return std::min(size, target_pos + window_size); + } + } + + return size; + } + else if(extreme_mode == AE_Mode::MIN){ + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_min_result; + uint8_t target_value = (uint8_t) buff[target_pos]; + + while(target_pos < (size - window_size - 1)){ + return_pos_range_scan = range_scan_lt_neon(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_min_result = find_minimum_neon(buff, target_pos + 1, target_pos + 1 + window_size, neon_array); + + if(find_min_result > target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + } + + return size; +} +#endif + +#ifdef __ALTIVEC__ +uint64_t AE_Chunking::find_cutpoint_altivec(char* buff, uint64_t size) { + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_max_result; + + uint8_t target_value = (uint8_t) buff[target_pos]; + if(extreme_mode == AE_Mode::MAX){ + while(target_pos < (size - window_size - 1)){ + + return_pos_range_scan = range_scan_gt_altivec(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_max_result = find_maximum_altivec(buff, target_pos + 1, target_pos + 1 + window_size, altivec_array); + + if(find_max_result < target_value) + { + return std::min(size, target_pos + window_size); + } + } + else { + return std::min(size, target_pos + window_size); + } + } + + return size; + } + else if(extreme_mode == AE_Mode::MIN){ + uint64_t target_pos = 0; + uint64_t return_pos_range_scan; + uint8_t find_min_result; + uint8_t target_value = (uint8_t) buff[target_pos]; + + while(target_pos < (size - window_size - 1)){ + return_pos_range_scan = range_scan_lt_altivec(buff, target_pos + 1, target_pos + 1 + window_size, target_value); + if(return_pos_range_scan != target_pos + 1 + window_size){ + target_pos = return_pos_range_scan; + target_value = (uint8_t) buff[target_pos]; + + find_min_result = find_minimum_altivec(buff, target_pos + 1, target_pos + 1 + window_size, altivec_array); + + if(find_min_result > target_value) + return std::min(size, target_pos + window_size); + } + else { + return std::min(size, target_pos + window_size); + } + } + } + + return size; } +#endif -uint64_t AE_Chunking::find_cutpoint(char* buff, uint64_t size) { +uint64_t AE_Chunking::find_cutpoint_native(char* buff, uint64_t size) { uint32_t i = 0; uint64_t max_value = buff[i]; uint64_t max_pos = i; diff --git a/dedup/src/chunking/avx_chunking_common.cpp b/dedup/src/chunking/avx_chunking_common.cpp index 4e47512..a273f8e 100644 --- a/dedup/src/chunking/avx_chunking_common.cpp +++ b/dedup/src/chunking/avx_chunking_common.cpp @@ -9,6 +9,7 @@ * */ + #include "avx_chunking_common.hpp" #include "chunking_common.hpp" #include "hashing_common.hpp" @@ -21,6 +22,16 @@ #include #include +/** + * * @brief: Helper functions to find maximum value in a region of the data stream + * * @param buff Data Stream + * * @param start_pos Starting position of scanned region + * * @param end_pos Ending position of scanned region + * * @param xmm_array Array of XMM registers to store intermediate results + * * @return uint8_t Maximum Value + */ + +#if defined(__SSE3__) uint8_t AVX_Chunking_Technique::find_maximum_sse128(char *buff, uint64_t start_pos, uint64_t end_pos, __m128i *xmm_array){ // Assume window_size is a multiple of SSE_REGISTER_SIZE_BYTES for now @@ -33,7 +44,7 @@ uint8_t AVX_Chunking_Technique::find_maximum_sse128(char *buff, uint64_t start_p // Load contents into __m128i structures // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit - for(uint64_t i = start_pos; i < num_vectors; i++) + for(uint64_t i = 0; i < num_vectors; i++) xmm_array[i] = _mm_loadu_si128((__m128i const *)(buff + start_pos + (SSE_REGISTER_SIZE_BYTES * i))); // Repeat vmaxu until a single register is remaining with maximum values @@ -65,7 +76,9 @@ uint8_t AVX_Chunking_Technique::find_maximum_sse128(char *buff, uint64_t start_p // Return maximum value return max_val; } +#endif +#if defined(__AVX2__) uint8_t AVX_Chunking_Technique::find_maximum_avx256(char *buff, uint64_t start_pos, uint64_t end_pos, __m256i *xmm_array){ // Assume window_size is a multiple of AVX256_REGISTER_SIZE_BYTES for now @@ -78,7 +91,7 @@ uint8_t AVX_Chunking_Technique::find_maximum_avx256(char *buff, uint64_t start_p // Load contents into __m256i structures // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit - for(uint64_t i = start_pos; i < num_vectors; i++) + for(uint64_t i = 0; i < num_vectors; i++) xmm_array[i] = _mm256_loadu_si256((__m256i const *)(buff + start_pos + (AVX256_REGISTER_SIZE_BYTES * i))); // Repeat vmaxu until a single register is remaining with maximum values @@ -110,6 +123,7 @@ uint8_t AVX_Chunking_Technique::find_maximum_avx256(char *buff, uint64_t start_p // Return maximum value return max_val; } +#endif #if defined(__AVX512F__) uint8_t AVX_Chunking_Technique::find_maximum_avx512(char *buff, uint64_t start_pos, uint64_t end_pos, __m512i *xmm_array){ @@ -126,7 +140,7 @@ uint8_t AVX_Chunking_Technique::find_maximum_avx512(char *buff, uint64_t start_p // Load contents into __m512i structures // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit - for(uint64_t i = start_pos; i < num_vectors; i++) + for(uint64_t i = 0; i < num_vectors; i++) xmm_array[i] = _mm512_loadu_si512((__m512i const *)(buff + start_pos + (AVX512_REGISTER_SIZE_BYTES * i))); // Repeat vmaxu until a single register is remaining with maximum values @@ -159,3 +173,1236 @@ uint8_t AVX_Chunking_Technique::find_maximum_avx512(char *buff, uint64_t start_p return max_val; } #endif + +#if defined(__ARM_NEON) +uint8_t AVX_Chunking_Technique::find_maximum_neon(char *buff, uint64_t start_pos, uint64_t end_pos, uint8x16_t *neon_array){ + + // Assume window_size is a multiple of NEON_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / NEON_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into uint8x16_t structures + // Could be optimized later as only 16 mm registers are avaiable per CPU in 64-bit + for(uint64_t i = 0; i < num_vectors; i++) + neon_array[i] = vld1q_u8((uint8_t const *)(buff + start_pos + (NEON_REGISTER_SIZE_BYTES * i))); + + // Repeat vmaxu until a single register is remaining with maximum values + // Each iteration calculates maximums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the maximum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + neon_array[i] = vmaxq_u8(neon_array[i], neon_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the NEON into local memory + uint8_t result_store[NEON_REGISTER_SIZE_BYTES] = {0}; + vst1q_u8((uint8_t *)&result_store, neon_array[0]); + + // Sequentially scan the last remaining bytes (16 in this case) to find the max value + uint8_t max_val = 0; + for(uint64_t i = 0; i < NEON_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] > max_val) + max_val = result_store[i]; + } + // Return maximum value + return max_val; + +} +#endif + +#if defined(__ALTIVEC__) +uint8_t AVX_Chunking_Technique::find_maximum_altivec(char *buff, uint64_t start_pos, uint64_t end_pos, __vector unsigned char *vec_array){ + + // Assume window_size is a multiple of ALTIVEC_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / ALTIVEC_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into vector unsigned char structures + // Could be optimized later as only 32 vec registers are available per CPU + for(uint64_t i = 0; i < num_vectors; i++) + vec_array[i] = vec_xl(0, (unsigned char const *)(buff + start_pos + (ALTIVEC_REGISTER_SIZE_BYTES * i))); + + // Repeat vmaxu until a single register is remaining with maximum values + // Each iteration calculates maximums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the maximum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + vec_array[i] = vec_max(vec_array[i], vec_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the vec into local memory + unsigned char result_store[ALTIVEC_REGISTER_SIZE_BYTES] = {0}; + + vec_st(vec_array[0], 0, (unsigned char *)&result_store); + + // Sequentially scan the last remaining bytes (16 in this case) to find the max value + uint8_t max_val = 0; + for(uint64_t i = 0; i < ALTIVEC_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] > max_val) + max_val = result_store[i]; + } + + // Return maximum value + return max_val; +} +#endif + +/** + * @brief: Helper functions to find minimum value in data stream + * * @param buff Data Stream + * * @param start_pos Starting position of scanned region + * * @param end_pos Ending position of scanned region + * * @param xmm_array Array of SIMD registers to store intermediate results + * * @return uint8_t Minimum Value + */ + + #if defined(__SSE3__) + uint8_t AVX_Chunking_Technique::find_minimum_sse128(char *buff, uint64_t start_pos, uint64_t end_pos, __m128i *xmm_array){ + + // Assume window_size is a multiple of SSE_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / SSE_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into __m128i structures + // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit + for(uint64_t i = 0; i < num_vectors; i++) + xmm_array[i] = _mm_loadu_si128((__m128i const *)(buff + start_pos + (SSE_REGISTER_SIZE_BYTES * i))); + + // Repeat vminu until a single register is remaining with minimum values + // Each iteration calculates minimums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the minimum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + xmm_array[i] = _mm_min_epu8(xmm_array[i], xmm_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the xmm into local memory + uint8_t result_store[SSE_REGISTER_SIZE_BYTES] = {0}; + + _mm_storeu_si128((__m128i *)&result_store, xmm_array[0]); + + // Sequentially scan the last remaining bytes (128 in this case) to find the min value + uint8_t min_val = UINT8_MAX; + for(uint64_t i = 0; i < SSE_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] < min_val) + min_val = result_store[i]; + } + + // Return minimum value + return min_val; +} +#endif + +#if defined(__AVX2__) +uint8_t AVX_Chunking_Technique::find_minimum_avx256(char *buff, uint64_t start_pos, uint64_t end_pos, __m256i *xmm_array){ + + // Assume window_size is a multiple of AVX256_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / AVX256_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into __m256i structures + // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit + for(uint64_t i = 0; i < num_vectors; i++) + xmm_array[i] = _mm256_loadu_si256((__m256i const *)(buff + start_pos + (AVX256_REGISTER_SIZE_BYTES * i))); + + // Repeat vminu until a single register is remaining with minimum values + // Each iteration calculates minimums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the minimum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + xmm_array[i] = _mm256_min_epu8(xmm_array[i], xmm_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the xmm into local memory + uint8_t result_store[AVX256_REGISTER_SIZE_BYTES] = {0}; + + _mm256_storeu_si256((__m256i *)&result_store, xmm_array[0]); + + // Sequentially scan the last remaining bytes (256 in this case) to find the min value + uint8_t min_val = UINT8_MAX; + for(uint64_t i = 0; i < AVX256_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] < min_val) + min_val = result_store[i]; + } + + // Return minimum value + return min_val; +} +#endif + +#if defined(__AVX512F__) +uint8_t AVX_Chunking_Technique::find_minimum_avx512(char *buff, uint64_t start_pos, uint64_t end_pos, __m512i *xmm_array){ + + // Assume window_size is a multiple of AVX512_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / AVX512_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into __m512i structures + // Could be optimized later as only 16 xmm registers are avaiable per CPU in 64-bit + for(uint64_t i = 0; i < num_vectors; i++) + xmm_array[i] = _mm512_loadu_si512((__m512i const *)(buff + start_pos + (AVX512_REGISTER_SIZE_BYTES * i))); + + // Repeat vminu until a single register is remaining with minimum values + // Each iteration calculates minimums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the minimum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + xmm_array[i] = _mm512_maskz_min_epu8(UINT64_MAX, xmm_array[i], xmm_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the xmm into local memory + uint8_t result_store[AVX512_REGISTER_SIZE_BYTES] = {0}; + + _mm512_storeu_si512((__m512i *)&result_store, xmm_array[0]); + + // Sequentially scan the last remaining bytes (512 in this case) to find the min value + uint8_t min_val = UINT8_MAX; + for(uint64_t i = 0; i < AVX512_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] < min_val) + min_val = result_store[i]; + } + + // Return minimum value + return min_val; +} +#endif + +#if defined(__ARM_NEON) +uint8_t AVX_Chunking_Technique::find_minimum_neon(char *buff, uint64_t start_pos, uint64_t end_pos, uint8x16_t *neon_array){ + + // Assume window_size is a multiple of NEON_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / NEON_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into uint8x16_t structures + // Could be optimized later as only 16 mm registers are avaiable per CPU in 64-bit + for(uint64_t i = 0; i < num_vectors; i++) + neon_array[i] = vld1q_u8((uint8_t const *)(buff + start_pos + (NEON_REGISTER_SIZE_BYTES * i))); + + // Repeat vminu until a single register is remaining with minimum values + // Each iteration calculates minimums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the minimum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + neon_array[i] = vminq_u8(neon_array[i], neon_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the NEON into local memory + uint8_t result_store[NEON_REGISTER_SIZE_BYTES] = {0}; + vst1q_u8((uint8_t *)&result_store, neon_array[0]); + + // Sequentially scan the last remaining bytes (16 in this case) to find the min value + uint8_t min_val = UINT8_MAX; + for(uint64_t i = 0; i < NEON_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] < min_val) + min_val = result_store[i]; + } + + // Return minimum value + return min_val; + +} +#endif + +#if defined(__ALTIVEC__) +uint8_t AVX_Chunking_Technique::find_minimum_altivec(char *buff, uint64_t start_pos, uint64_t end_pos, __vector unsigned char *vec_array){ + // Assume window_size is a multiple of ALTIVEC_REGISTER_SIZE_BYTES for now + // Assume num_vectors is even for now - True for most common window sizes. Can fix later via specific check + + uint64_t num_vectors = (end_pos - start_pos) / ALTIVEC_REGISTER_SIZE_BYTES; + + uint64_t step = 2; + uint64_t half_step = 1; + + // Load contents into vector unsigned char structures + // Could be optimized later as only 32 vec registers are available per CPU + for(uint64_t i = 0; i < num_vectors; i++) + vec_array[i] = vec_xl(0, (unsigned char const *)(buff + start_pos + (ALTIVEC_REGISTER_SIZE_BYTES * i))); + + // Repeat vminu until a single register is remaining with minimum values + // Each iteration calculates minimums between a pair of registers and moves it into the first register in the pair + // Finally, only one will be left with the minimum values from all pairs + while(step <= num_vectors){ + + for(uint64_t i = 0; i < num_vectors; i+=step) + vec_array[i] = vec_min(vec_array[i], vec_array[i+half_step]); + + // Multiply step by 2 + half_step = step; + step = step << 1; + + } + + // Move the final set of values from the vec into local memory + unsigned char result_store[ALTIVEC_REGISTER_SIZE_BYTES] = {0}; + + vec_st(vec_array[0], 0, (unsigned char *)&result_store); + + // Sequentially scan the last remaining bytes (16 in this case) to find the min value + uint8_t min_val = UINT8_MAX; + for(uint64_t i = 0; i < ALTIVEC_REGISTER_SIZE_BYTES; i++){ + if(result_store[i] < min_val) + min_val = result_store[i]; + } + + // Return minimum value + return min_val; +} +#endif + +/** + * Range scan functions for greater-than-or-equal-to (geq) comparisons + * These functions are used to find the first position in the data stream matching a target comparison + * The functions have been implemented for SSE, AVX, AVX512, NEON and AltiVec instruction sets + */ + +#if defined(__SSE3__) +uint64_t AVX_Chunking_Technique::range_scan_geq_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / SSE_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 128-bit SSE format + __m128i xmm_array, cmp_array; + int cmp_mask; + + // Load max_value into xmm-format + __m128i max_val_xmm = _mm_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * SSE_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm_loadu_si128((__m128i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm_cmpge_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = GreaterOrEqual8uSSE(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm_movemask_epi8(cmp_array); + #endif + + + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__AVX2__) +uint64_t AVX_Chunking_Technique::range_scan_geq_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX256_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 256-bit AVX format + __m256i xmm_array, cmp_array; + uint32_t cmp_mask; + + // Load max_value into xmm-format + __m256i max_val_xmm = _mm256_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX256_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm256_loadu_si256((__m256i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm256_cmpge_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = GreaterOrEqual8uAVX256(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm256_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + } + } + + return end_position; +} +#endif + +#if defined(__AVX512F__) +uint64_t AVX_Chunking_Technique::range_scan_geq_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX512_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 512-bit AVX format + __m512i xmm_array; + __mmask64 cmp_mask; + + // Load max_value into xmm-format + __m512i max_val_xmm = _mm512_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX512_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm512_loadu_si512((__m512i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_mask = _mm512_cmpge_epu8_mask(xmm_array, max_val_xmm); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffsll(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__ARM_NEON) +uint64_t AVX_Chunking_Technique::range_scan_geq_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / NEON_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit NEON format + uint8x16_t xmm_array, cmp_array; + uint8x16_t max_val_xmm = vdupq_n_u8((uint8_t)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * NEON_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = vld1q_u8((uint8_t const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vcgeq_u8(xmm_array, max_val_xmm); + + // ARM doesn't have native MOVEMASK instructions like x86 does. + // Following advice from an ARM community blog post to implement mask creation instead + // https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + + // Create a mask using the most-significant bit of each byte value in cmp_array + + uint64_t cmp_mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_array), 4)), 0); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + ((__builtin_ffsll(cmp_mask) - 1) >> 2); // Shift right by 2 to account for 4 bits per byte + } + } + + return end_position; +} +#endif + +#ifdef __ALTIVEC__ +uint64_t AVX_Chunking_Technique::range_scan_geq_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / ALTIVEC_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit AltiVec format + __vector unsigned char vec_array; + __vector unsigned char perm_array; + __vector __bool char cmp_array; + __vector unsigned char max_val_vec = vec_splats((unsigned char)target_value); + __vector unsigned char bit_selector = {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}; // select most significant bit from each byte + + uint8_t cond_mask_1, cond_mask_2; + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * ALTIVEC_REGISTER_SIZE_BYTES); + + // Load data into vec register + // Use vec_xl instead of vec_ld because vec_ld truncates the load address to a 16 byte boundary + // vec_xl allows loading from any address, which is necessary for unaligned data + vec_array = vec_xl(0, (unsigned char const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in vec_array is geq max_val_vec, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vec_cmpge(vec_array, max_val_vec); + + // Create masks using the most-significant bit of each byte value in cmp_array + perm_array = vec_bperm((__vector unsigned char)cmp_array, bit_selector); + + // bperm stores masks in elements 8 and 9 of the vector + // Extract the masks from the perm_array + // cond_mask_2 corresponds to the first 8 bytes, cond_mask_1 corresponds to the next 8 bytes + // This is because vec_bperm rearranges the bits in little endian order + cond_mask_1 = vec_extract(perm_array, 8); + cond_mask_2 = vec_extract(perm_array, 9); + + // If cond_mask is not 0, it means at least one byte in cmp_array is non-zero + // Subtract 24 because clz implicitly converts to int with a 4-byte size + if (cond_mask_2){ + return curr_scan_start + __builtin_clz(cond_mask_2) - 24 ; + } + else if(cond_mask_1){ + return curr_scan_start + __builtin_clz(cond_mask_1) - 16; // Add 8 to the index to account for the first 8 bytes in the vector + } + + } + + return end_position; +} +#endif + +/** + * @brief: Range scan functions for strictly greater than comparison + * These functions are used to find the first position in the data stream matching a target comparison + * The functions are imlemented for SSE, AVX2, AVX512, ARM NEON and AltiVec instruction sets. + */ + + +#if defined(__SSE3__) +uint64_t AVX_Chunking_Technique::range_scan_gt_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / SSE_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 128-bit SSE format + __m128i xmm_array, cmp_array; + int cmp_mask; + + // Load max_value into xmm-format + __m128i max_val_xmm = _mm_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * SSE_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm_loadu_si128((__m128i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is gt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm_cmpgt_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = Greater8uSSE(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__AVX2__) +uint64_t AVX_Chunking_Technique::range_scan_gt_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX256_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 256-bit AVX format + __m256i xmm_array, cmp_array; + uint32_t cmp_mask; + + // Load max_value into xmm-format + __m256i max_val_xmm = _mm256_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX256_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm256_loadu_si256((__m256i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is gt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm256_cmpgt_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = Greater8uAVX256(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm256_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + } + } + + return end_position; +} +#endif + +#if defined(__AVX512F__) +uint64_t AVX_Chunking_Technique::range_scan_gt_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX512_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 512-bit AVX format + __m512i xmm_array; + __mmask64 cmp_mask; + + // Load max_value into xmm-format + __m512i max_val_xmm = _mm512_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX512_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm512_loadu_si512((__m512i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is gt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_mask = _mm512_cmpgt_epu8_mask(xmm_array, max_val_xmm); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffsll(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__ARM_NEON) +uint64_t AVX_Chunking_Technique::range_scan_gt_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / NEON_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit NEON format + uint8x16_t xmm_array, cmp_array; + uint8x16_t max_val_xmm = vdupq_n_u8((uint8_t)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * NEON_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = vld1q_u8((uint8_t const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is gt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vcgtq_u8(xmm_array, max_val_xmm); + + // ARM doesn't have native MOVEMASK instructions like x86 does. + // Following advice from an ARM community blog post to implement mask creation instead + // https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + + // Create a mask using the most-significant bit of each byte value in cmp_array + + uint64_t cmp_mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_array), 4)), 0); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + ((__builtin_ffsll(cmp_mask) - 1) >> 2); // Shift right by 2 to account for 4 bits per byte + } + } + + return end_position; +} +#endif + +#ifdef __ALTIVEC__ +uint64_t AVX_Chunking_Technique::range_scan_gt_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / ALTIVEC_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit AltiVec format + __vector unsigned char vec_array; + __vector unsigned char perm_array; + __vector __bool char cmp_array; + __vector unsigned char max_val_vec = vec_splats((unsigned char)target_value); + __vector unsigned char bit_selector = {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}; // select most significant bit from each byte + + uint8_t cond_mask_1, cond_mask_2; + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * ALTIVEC_REGISTER_SIZE_BYTES); + + // Load data into vec register + // Use vec_xl instead of vec_ld because vec_ld truncates the load address to a 16 byte boundary + // vec_xl allows loading from any address, which is necessary for unaligned data + vec_array = vec_xl(0, (unsigned char const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in vec_array is gt max_val_vec, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vec_cmpgt(vec_array, max_val_vec); + + // Create masks using the most-significant bit of each byte value in cmp_array + perm_array = vec_bperm((__vector unsigned char)cmp_array, bit_selector); + cond_mask_1 = vec_extract(perm_array, 8); + cond_mask_2 = vec_extract(perm_array, 9); + + // If cond_mask is not 0, it means at least one byte in cmp_array is non-zero + if (cond_mask_2){ + return curr_scan_start + __builtin_clz(cond_mask_2) - 24 ; + } + else if(cond_mask_1){ + return curr_scan_start + __builtin_clz(cond_mask_1) - 16; // Add 8 + } + } + return end_position; +} +#endif + + +/** + * @brief: Range scan functions for less than or equal to comparison + * These functions are used to find the first position in the data stream matching a target comparison + * The functions are implemented for SSE, AVX2, AVX512, NEON and AltiVec instruction sets. + */ + +#if defined(__SSE3__) +uint64_t AVX_Chunking_Technique::range_scan_leq_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / SSE_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 128-bit SSE format + __m128i xmm_array, cmp_array; + int cmp_mask; + + // Load max_value into xmm-format + __m128i max_val_xmm = _mm_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * SSE_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm_loadu_si128((__m128i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is leq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm_cmple_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = LesserOrEqual8uSSE(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__AVX2__) +uint64_t AVX_Chunking_Technique::range_scan_leq_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX256_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 256-bit AVX format + __m256i xmm_array, cmp_array; + uint32_t cmp_mask; + + // Load max_value into xmm-format + __m256i max_val_xmm = _mm256_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX256_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm256_loadu_si256((__m256i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is leq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm256_cmple_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = LesserOrEqual8uAVX256(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm256_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + } + } + + return end_position; +} +#endif + +#if defined(__AVX512F__) +uint64_t AVX_Chunking_Technique::range_scan_leq_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX512_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 512-bit AVX format + __m512i xmm_array; + __mmask64 cmp_mask; + + // Load max_value into xmm-format + __m512i max_val_xmm = _mm512_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX512_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm512_loadu_si512((__m512i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is leq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_mask = _mm512_cmple_epu8_mask(xmm_array, max_val_xmm); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffsll(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__ARM_NEON) +uint64_t AVX_Chunking_Technique::range_scan_leq_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / NEON_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit NEON format + uint8x16_t xmm_array, cmp_array; + uint8x16_t max_val_xmm = vdupq_n_u8((uint8_t)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * NEON_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = vld1q_u8((uint8_t const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is leq max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vcleq_u8(xmm_array, max_val_xmm); + + // ARM doesn't have native MOVEMASK instructions like x86 does. + // Following advice from an ARM community blog post to implement mask creation instead + // https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + + // Create a mask using the most-significant bit of each byte value in cmp_array + + uint64_t cmp_mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_array), 4)), 0); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + ((__builtin_ffsll(cmp_mask) - 1) >> 2); // Shift right by 2 to account for 4 bits per byte + } + } + + return end_position; +} +#endif + +#ifdef __ALTIVEC__ +uint64_t AVX_Chunking_Technique::range_scan_leq_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / ALTIVEC_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit AltiVec format + __vector unsigned char vec_array; + __vector unsigned char perm_array; + __vector __bool char cmp_array; + __vector unsigned char max_val_vec = vec_splats((unsigned char)target_value); + __vector unsigned char bit_selector = {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}; // select most significant bit from each byte + + uint8_t cond_mask_1, cond_mask_2; + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * ALTIVEC_REGISTER_SIZE_BYTES); + + // Load data into vec register + // Use vec_xl instead of vec_ld because vec_ld truncates the load address to a 16 byte boundary + // vec_xl allows loading from any address, which is necessary for unaligned data + vec_array = vec_xl(0, (unsigned char const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in vec_array is leq max_val_vec, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vec_cmple(vec_array, max_val_vec); + + // Create masks using the most-significant bit of each byte value in cmp_array + perm_array = vec_bperm((__vector unsigned char)cmp_array, bit_selector); + cond_mask_1 = vec_extract(perm_array, 8); + cond_mask_2 = vec_extract(perm_array, 9); + + // If cond_mask is not 0, it means at least one byte in cmp_array is non-zero + if (cond_mask_2){ + return curr_scan_start + __builtin_clz(cond_mask_2) - 24 ; + } + else if(cond_mask_1){ + return curr_scan_start + __builtin_clz(cond_mask_1) - 16; // Add 8 + } + } + return end_position; +} +#endif + +/** + * @brief: Range scan functions for strictly less than comparison + * These functions are used to find the first position in the data stream matching a target comparison + * The functions are implemented for SSE, AVX2, AVX512, NEON and AltiVec instruction sets. + */ + +#if defined(__SSE3__) +uint64_t AVX_Chunking_Technique::range_scan_lt_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / SSE_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 128-bit SSE format + __m128i xmm_array, cmp_array; + int cmp_mask; + + // Load max_value into xmm-format + __m128i max_val_xmm = _mm_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * SSE_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm_loadu_si128((__m128i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is lt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm_cmplt_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = Lesser8uSSE(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__AVX2__) +uint64_t AVX_Chunking_Technique::range_scan_lt_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX256_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 256-bit AVX format + __m256i xmm_array, cmp_array; + uint32_t cmp_mask; + + // Load max_value into xmm-format + __m256i max_val_xmm = _mm256_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX256_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm256_loadu_si256((__m256i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is lt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + #if defined(__AVX512F__) + cmp_mask = _mm256_cmplt_epu8_mask(xmm_array, max_val_xmm); + #else + cmp_array = Lesser8uAVX256(xmm_array, max_val_xmm); + + // Create a mask using the most-significant bit of each byte value in cmp_array + cmp_mask = _mm256_movemask_epi8(cmp_array); + #endif + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + } + } + + return end_position; +} +#endif + +#if defined(__AVX512F__) +uint64_t AVX_Chunking_Technique::range_scan_lt_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / AVX512_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + uint64_t return_pos; + + // Structures to store bytes from data stream and comparison results in 512-bit AVX format + __m512i xmm_array; + __mmask64 cmp_mask; + + // Load max_value into xmm-format + __m512i max_val_xmm = _mm512_set1_epi8((char)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * AVX512_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = _mm512_loadu_si512((__m512i const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is lt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_mask = _mm512_cmplt_epu8_mask(xmm_array, max_val_xmm); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return_pos = curr_scan_start + (__builtin_ffsll(cmp_mask) - 1); + return return_pos; + } + } + + return end_position; +} +#endif + +#if defined(__ARM_NEON) +uint64_t AVX_Chunking_Technique::range_scan_lt_neon(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + + uint64_t num_vectors = (end_position - start_position) / NEON_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit NEON format + uint8x16_t xmm_array, cmp_array; + uint8x16_t max_val_xmm = vdupq_n_u8((uint8_t)target_value); + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * NEON_REGISTER_SIZE_BYTES); + // Load data into xmm register + xmm_array = vld1q_u8((uint8_t const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in xmm_array is lt max_val_xmm, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vcltq_u8(xmm_array, max_val_xmm); + + // ARM doesn't have native MOVEMASK instructions like x86 does. + // Following advice from an ARM community blog post to implement mask creation instead + // https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + + // Create a mask using the most-significant bit of each byte value in cmp_array + + uint64_t cmp_mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_array), 4)), 0); + + // Return index of first non-zero bit in mask + // This corresponds to the first non-zero byte in cmp_array + if(cmp_mask){ + return curr_scan_start + ((__builtin_ffsll(cmp_mask) - 1) >> 2); // Shift right by 2 to account for 4 bits per byte + } + } + + return end_position; +} +#endif + +#ifdef __ALTIVEC__ +uint64_t AVX_Chunking_Technique::range_scan_lt_altivec(char *buff, uint64_t start_position, uint64_t end_position, uint8_t target_value){ + uint64_t num_vectors = (end_position - start_position) / ALTIVEC_REGISTER_SIZE_BYTES; + uint64_t curr_scan_start; + + // Structures to store bytes from data stream and comparison results in 128-bit AltiVec format + __vector unsigned char vec_array; + __vector unsigned char perm_array; + __vector __bool char cmp_array; + __vector unsigned char max_val_vec = vec_splats((unsigned char)target_value); + __vector unsigned char bit_selector = {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}; // select most significant bit from each byte + + uint8_t cond_mask_1, cond_mask_2; + + for(uint64_t i = 0; i < num_vectors; i++){ + curr_scan_start = start_position + (i * ALTIVEC_REGISTER_SIZE_BYTES); + + // Load data into vec register + // Use vec_xl instead of vec_ld because vec_ld truncates the load address to a 16 byte boundary + // vec_xl allows loading from any address, which is necessary for unaligned data + vec_array = vec_xl(0, (unsigned char const *)(buff + curr_scan_start)); + + /* + Compare values with max_value. If a byte in vec_array is lt max_val_vec, + ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. + */ + + cmp_array = vec_cmplt(vec_array, max_val_vec); + + // Create masks using the most-significant bit of each byte value in cmp_array + perm_array = vec_bperm((__vector unsigned char)cmp_array, bit_selector); + + // bperm stores masks in elements 8 and 9 of the vector + // Extract the masks from the perm_array + // cond_mask_2 corresponds to the first 8 bytes, cond_mask_1 corresponds to the next 8 bytes + // This is because vec_bperm rearranges the bits in little endian order + cond_mask_1 = vec_extract(perm_array, 8); + cond_mask_2 = vec_extract(perm_array, 9); + + // If cond_mask is not 0, it means at least one byte in cmp_array is non-zero + // Subtract 24 because clz implicitly converts to int with a 4-byte size + if (cond_mask_2){ + return curr_scan_start + __builtin_clz(cond_mask_2) - 24 ; + } + else if(cond_mask_1){ + return curr_scan_start + __builtin_clz(cond_mask_1) - 16; // Add 8 to the index to account for the first 8 bytes in the vector + } + } + return end_position; +} +#endif diff --git a/dedup/src/chunking/chunking_common.cpp b/dedup/src/chunking/chunking_common.cpp index 4ee5a02..0cad79b 100644 --- a/dedup/src/chunking/chunking_common.cpp +++ b/dedup/src/chunking/chunking_common.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/dedup/src/chunking/maxp_chunking.cpp b/dedup/src/chunking/maxp_chunking.cpp new file mode 100644 index 0000000..76f3892 --- /dev/null +++ b/dedup/src/chunking/maxp_chunking.cpp @@ -0,0 +1,519 @@ +/** + * @file maxp_chunking.cpp + * @author WASL + * @brief Implementations for MAXP chunking technique + * @version 0.1 + * @date 2025-3-26 + * + * @copyright Copyright (c) 2023 + * + */ + + #include + #include "maxp_chunking.hpp" + + MAXP_Chunking::MAXP_Chunking(){ + window_size = DEFAULT_MAXP_WINDOW_SIZE; + max_block_size = DEFAULT_MAXP_MAX_BLOCK_SIZE; + simd_mode = SIMD_Mode::NONE; + + + #ifdef __SSE3__ + xmm_array = nullptr; + #endif + + #ifdef __AVX2__ + ymm_array = nullptr; + #endif + + #if defined(__AVX512F__) + zmm_array = nullptr; + #endif + + #if defined(__ARM_NEON) + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif + + chunk_counter = 0; + } + + MAXP_Chunking::MAXP_Chunking(const Config &config){ + window_size = config.get_maxp_window_size(); + max_block_size = config.get_maxp_max_block_size(); + simd_mode = config.get_simd_mode(); + + #ifdef __SSE3__ + xmm_array = nullptr; + #endif + + #ifdef __AVX2__ + ymm_array = nullptr; + #endif + + #if defined(__AVX512F__) + zmm_array = nullptr; + #endif + + #if defined(__ARM_NEON) + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif + + chunk_counter = 0; + + if(simd_mode == SIMD_Mode::NONE) { + // No SIMD mode selected + // Pass + } + + #ifdef __SSE3__ + else if(simd_mode == SIMD_Mode::SSE128) { + if(window_size % SSE_REGISTER_SIZE_BYTES != 0 || (window_size / 16) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_sse128 + std::cout << "MAXP window size currently unsupported by SSE128. Please use an even multiple of SSE128_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + uint64_t num_vectors = window_size / SSE_REGISTER_SIZE_BYTES; + xmm_array = new __m128i[num_vectors](); + if(xmm_array == nullptr) { + std::cout << "Error allocating memory for 128-bit vectors (__m128i)" << std::endl; + exit(1); + } + } + #endif + + #ifdef __AVX2__ + else if(simd_mode == SIMD_Mode::AVX256) { + if(window_size % AVX256_REGISTER_SIZE_BYTES != 0 || (window_size / 32) % 2 != 0) { + // Check if window size is a multiple of 32 + // Check if window size makes an even number of vectors for find_maximum_AVX256 + std::cout << "MAXP window size currently unsupported by AVX256. Please use an even multiple of AVX_REGISTER_SIZE_BYTES (default 32)." << std::endl; + exit(1); + } + uint64_t num_vectors = window_size / AVX256_REGISTER_SIZE_BYTES; + ymm_array = new __m256i[num_vectors](); + if(ymm_array == nullptr) { + std::cout << "Error allocating memory for 256-bit vectors(__m256i)" << std::endl; + exit(1); + } + } + #endif + + #if defined(__AVX512F__) + else if(simd_mode == SIMD_Mode::AVX512) { + if(window_size % AVX512_REGISTER_SIZE_BYTES != 0 || (window_size / 64) % 2 != 0) { + // Check if window size is a multiple of 64 + // Check if window size makes an even number of vectors for find_maximum_AVX512 + std::cout << "MAXP window size currently unsupported by AVX512. Please use an even multiple of AVX_REGISTER_SIZE_BYTES (default 64)." << std::endl; + exit(1); + } + uint64_t num_vectors = window_size / AVX512_REGISTER_SIZE_BYTES; + zmm_array = new __m512i[num_vectors](); + if(zmm_array == nullptr) { + std::cout << "Error allocating memory for 512-bit vectors(__m512i)" << std::endl; + exit(1); + } + } + #endif + + #if defined(__ARM_NEON) + else if(simd_mode == SIMD_Mode::NEON) { + if(window_size % NEON_REGISTER_SIZE_BYTES != 0 || (window_size / 16) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_neon + std::cout << "MAXP window size currently unsupported by NEON. Please use an even multiple of NEON_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + uint64_t num_vectors = window_size / NEON_REGISTER_SIZE_BYTES; + neon_array = new uint8x16_t[num_vectors](); + if(neon_array == nullptr) { + std::cout << "Error allocating memory for NEON vectors(uint8x16_t)" << std::endl; + exit(1); + } + } + #endif + + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC) { + if(window_size % ALTIVEC_REGISTER_SIZE_BYTES != 0 || (window_size / 16) % 2 != 0) { + // Check if window size is a multiple of 16 + // Check if window size makes an even number of vectors for find_maximum_altivec + std::cout << "MAXP window size currently unsupported by ALTIVEC. Please use an even multiple of ALTIVEC_REGISTER_SIZE_BYTES (default 16)." << std::endl; + exit(1); + } + uint64_t num_vectors = window_size / ALTIVEC_REGISTER_SIZE_BYTES; + altivec_array = new __vector unsigned char[num_vectors](); + if(altivec_array == nullptr) { + std::cout << "Error allocating memory for ALTIVEC vectors(__vector unsigned char)" << std::endl; + exit(1); + } + } + #endif + + else { + std::cout << "Unsupported SIMD Mode for MAXP" << std::endl; + exit(1); + } + } + + #ifdef __SSE3__ + uint64_t MAXP_Chunking::find_cutpoint_sse128(char *buff, uint64_t size){ + if(size < (2 * window_size) + 1) + return size; + + // Cap out max size + size = std::min(size, max_block_size); + + uint8_t max_value; + uint8_t backward_max = 0; + uint64_t max_pos = window_size; + + uint64_t return_pos_range_scan; + + while(max_pos < (size - window_size)){ + + max_value = (uint8_t)buff[max_pos]; + + return_pos_range_scan = range_scan_geq_sse128(buff, max_pos + 1, max_pos + 1 + window_size, max_value); + if(return_pos_range_scan == max_pos + window_size + 1){ + // No match found i.e. all bytes in range are less than max_value + // Start backward scan to verify if target is a local max + + backward_max = find_maximum_sse128(buff, max_pos - window_size, max_pos, xmm_array); + + // No bytes > max value in the backward region i.e. max_value is a local max + // Chunk boundary found + if(backward_max <= max_value){ + return max_pos; + } + else { + // Nothing until max_pos + window_size + 1 can be a chunk boundary now + max_pos += window_size + 1; + } + + } + else { + // Update maximum value + max_pos = return_pos_range_scan; + } + } + + // No chunk boundary found + return size; + +} +#endif + +#ifdef __AVX2__ +uint64_t MAXP_Chunking::find_cutpoint_avx256(char *buff, uint64_t size){ + if(size < (2 * window_size) + 1) + return size; + + // Cap out max size + size = std::min(size, max_block_size); + + uint8_t max_value; + uint8_t backward_max = 0; + uint64_t max_pos = window_size; + + uint64_t return_pos_range_scan; + + while(max_pos < (size - window_size)){ + + max_value = (uint8_t)buff[max_pos]; + + return_pos_range_scan = range_scan_geq_avx256(buff, max_pos + 1, max_pos + 1 + window_size, max_value); + if(return_pos_range_scan == max_pos + window_size + 1){ + // No match found i.e. all bytes in range are less than max_value + // Start backward scan to verify if target is a local max + + backward_max = find_maximum_avx256(buff, max_pos - window_size, max_pos, ymm_array); + + // No bytes > max value in the backward region i.e. max_value is a local max + // Chunk boundary found + if(backward_max <= max_value){ + return max_pos; + } + else { + // Nothing until max_pos + window_size + 1 can be a chunk boundary now + max_pos += window_size + 1; + } + + } + else { + // Update maximum value + max_pos = return_pos_range_scan; + } + } + + // No chunk boundary found + return size; +} +#endif + +#if defined(__AVX512F__) +uint64_t MAXP_Chunking::find_cutpoint_avx512(char *buff, uint64_t size){ + if(size < (2 * window_size) + 1) + return size; + + // Cap out max size + size = std::min(size, max_block_size); + + uint8_t max_value; + uint8_t backward_max = 0; + uint64_t max_pos = window_size; + + uint64_t return_pos_range_scan; + + while(max_pos < (size - window_size)){ + + max_value = (uint8_t)buff[max_pos]; + + return_pos_range_scan = range_scan_geq_avx512(buff, max_pos + 1, max_pos + 1 + window_size, max_value); + if(return_pos_range_scan == max_pos + window_size + 1){ + // No match found i.e. all bytes in range are less than max_value + // Start backward scan to verify if target is a local max + + backward_max = find_maximum_avx512(buff, max_pos - window_size, max_pos, zmm_array); + + // No bytes > max value in the backward region i.e. max_value is a local max + // Chunk boundary found + if(backward_max <= max_value){ + return max_pos; + } + else { + // Nothing until max_pos + window_size + 1 can be a chunk boundary now + max_pos += window_size + 1; + } + + } + else { + // Update maximum value + max_pos = return_pos_range_scan; + } + } + + // No chunk boundary found + return size; + +} +#endif + +#if defined(__ARM_NEON) +uint64_t MAXP_Chunking::find_cutpoint_neon(char *buff, uint64_t size){ + if(size < (2 * window_size) + 1) + return size; + + // Cap out max size + size = std::min(size, max_block_size); + + uint8_t max_value; + uint8_t backward_max = 0; + uint64_t max_pos = window_size; + + uint64_t return_pos_range_scan; + + while(max_pos < (size - window_size)){ + + max_value = (uint8_t)buff[max_pos]; + + return_pos_range_scan = range_scan_geq_neon(buff, max_pos + 1, max_pos + 1 + window_size, max_value); + if(return_pos_range_scan == max_pos + window_size + 1){ + // No match found i.e. all bytes in range are less than max_value + // Start backward scan to verify if target is a local max + + backward_max = find_maximum_neon(buff, max_pos - window_size, max_pos, neon_array); + + // No bytes > max value in the backward region i.e. max_value is a local max + // Chunk boundary found + if(backward_max <= max_value){ + return max_pos; + } + else { + // Nothing until max_pos + window_size + 1 can be a chunk boundary now + max_pos += window_size + 1; + } + + } + else { + // Update maximum value + max_pos = return_pos_range_scan; + } + } + + // No chunk boundary found + return size; + +} +#endif + +#ifdef __ALTIVEC__ +uint64_t MAXP_Chunking::find_cutpoint_altivec(char *buff, uint64_t size){ + if(size < (2 * window_size) + 1){ + return size; + } + + // Cap out max size + size = std::min(size, max_block_size); + uint8_t max_value; + uint8_t backward_max = 0; + uint64_t max_pos = window_size; + uint64_t return_pos_range_scan; + + while(max_pos < (size - window_size)){ + + max_value = (uint8_t)buff[max_pos]; + + return_pos_range_scan = range_scan_geq_altivec(buff, max_pos + 1, max_pos + 1 + window_size, max_value); + + if(return_pos_range_scan == max_pos + window_size + 1){ + // No match found i.e. all bytes in range are less than max_value + // Start backward scan to verify if target is a local max + + backward_max = find_maximum_altivec(buff, max_pos - window_size, max_pos, altivec_array); + + // No bytes > max value in the backward region i.e. max_value is a local max + // Chunk boundary found + if(backward_max <= max_value){ + return max_pos; + } + else { + // Nothing until max_pos + window_size + 1 can be a chunk boundary now + max_pos += window_size + 1; + } + + } + else { + // Update maximum value + max_pos = return_pos_range_scan; + } + } + + // N:wqo chunk boundary found + return size; +} +#endif + +uint64_t MAXP_Chunking::find_cutpoint_native(char *buff, uint64_t size){ + + if(size < (2 * window_size) + 1){ + return size; + } + + + size = std::min(size, max_block_size); + + uint64_t max_position = window_size; + uint8_t max_value = (uint8_t)buff[max_position]; + uint64_t j; + bool local_max_found = false; + + // Scan window from left to right looking for a byte with a value > all bytes in a "window_size" after it + for(uint64_t i = window_size; i < size - 1; i++){ + + if((uint8_t)buff[i] >= max_value){ + max_position = i; + max_value = (uint8_t)buff[i]; + } + else if(i == max_position + window_size){ + // Target byte found + // Initiate backward scanning to see if i is local maximum + local_max_found = true; + + for(j = max_position - window_size; j < max_position; j++){ + if((uint8_t)buff[j] > max_value){ + max_position = i+1; + max_value = (uint8_t)buff[i+1]; + local_max_found = false; + break; + } + } + + // Insert chunk boundary at max_position (not i) i.e. chunk boundary does not include right window + if(local_max_found == true){ + return max_position; + } + } + } + + // Returning maximum chunk size + return size; + } + + uint64_t MAXP_Chunking::find_cutpoint(char *buff, uint64_t size){ + chunk_counter++; + if(simd_mode == SIMD_Mode::NONE) { + return find_cutpoint_native(buff, size); + } + + #ifdef __SSE3__ + else if(simd_mode == SIMD_Mode::SSE128) { + return find_cutpoint_sse128(buff, size); + } + #endif + + #ifdef __AVX2__ + else if(simd_mode == SIMD_Mode::AVX256) { + return find_cutpoint_avx256(buff, size); + } + #endif + + #if defined(__AVX512F__) + else if(simd_mode == SIMD_Mode::AVX512) { + return find_cutpoint_avx512(buff, size); + } + #endif + + #if defined(__ARM_NEON) + else if(simd_mode == SIMD_Mode::NEON) { + return find_cutpoint_neon(buff, size); + } + #endif + + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC) { + return find_cutpoint_altivec(buff, size); + } + #endif + + else { + std::cout << "Unsupported SIMD Mode for MAXP" << std::endl; + exit(1); + } + } + + MAXP_Chunking::~MAXP_Chunking(){ + + #ifdef __SSE3__ + if(xmm_array != nullptr) + delete xmm_array; + #endif + + #ifdef __AVX2__ + if(ymm_array != nullptr) + delete ymm_array; + #endif + + #if defined(__AVX512F__) + if(zmm_array != nullptr) + delete zmm_array; + #endif + + #if defined(__ARM_NEON__) + if(neon_array != nullptr) + delete neon_array; + #endif + + #ifdef __ALTIVEC__ + if(altivec_array != nullptr) + delete altivec_array; + #endif + } \ No newline at end of file diff --git a/dedup/src/chunking/ram_chunking.cpp b/dedup/src/chunking/ram_chunking.cpp index 0ac0a86..834ee55 100644 --- a/dedup/src/chunking/ram_chunking.cpp +++ b/dedup/src/chunking/ram_chunking.cpp @@ -18,9 +18,25 @@ RAM_Chunking::RAM_Chunking() { window_size = avg_block_size - 256; + #ifdef __SSE3__ sse_array = nullptr; + #endif + + #ifdef __AVX2__ avx256_array = nullptr; - avx512_array = nullptr; + #endif + + #if defined(__AVX512F__) + avx512_array = nullptr; + #endif + + #ifdef __ARM_NEON + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif simd_mode = SIMD_Mode::NONE; @@ -32,40 +48,106 @@ RAM_Chunking::RAM_Chunking(const Config& config) { window_size = avg_block_size - 256; // window_size = avg_block_size / (exp(1) - 1); // avg_block size / e-1 - sse_array = nullptr; - avx256_array = nullptr; - avx512_array = nullptr; + #ifdef __SSE3__ + sse_array = nullptr; + #endif + + #ifdef __AVX2__ + avx256_array = nullptr; + #endif + + #if defined(__AVX512F__) + avx512_array = nullptr; + #endif + #ifdef __ARM_NEON + neon_array = nullptr; + #endif + + #ifdef __ALTIVEC__ + altivec_array = nullptr; + #endif + technique_name = "RAM Chunking"; simd_mode = config.get_simd_mode(); - if(simd_mode == SIMD_Mode::SSE128 || simd_mode == SIMD_Mode::SSE128_NOSLIDE){ + if(simd_mode == SIMD_Mode::NONE){ + + } + + #ifdef __SSE3__ + else if(simd_mode == SIMD_Mode::SSE128){ uint64_t num_vectors = window_size / SSE_REGISTER_SIZE_BYTES; sse_array = new __m128i[num_vectors](); } + #endif + + #ifdef __AVX2__ else if(simd_mode == SIMD_Mode::AVX256){ uint64_t num_vectors = window_size / AVX256_REGISTER_SIZE_BYTES; avx256_array = new __m256i[num_vectors](); } + #endif #if defined(__AVX512F__) else if(simd_mode == SIMD_Mode::AVX512){ uint64_t num_vectors = window_size / AVX512_REGISTER_SIZE_BYTES; avx512_array = new __m512i[num_vectors](); } - #endif + #endif + + #ifdef __ARM_NEON + else if(simd_mode == SIMD_Mode::NEON){ + uint64_t num_vectors = window_size / NEON_REGISTER_SIZE_BYTES; + neon_array = new uint8x16_t[num_vectors](); + } + #endif + + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC){ + uint64_t num_vectors = window_size / ALTIVEC_REGISTER_SIZE_BYTES; + altivec_array = new __vector unsigned char[num_vectors](); + } + #endif + + else { + std::cout << "Error: Unsupported SIMD mode" << std::endl; + exit(EXIT_FAILURE); + } } RAM_Chunking::~RAM_Chunking() { - if(simd_mode == SIMD_Mode::SSE128 || simd_mode == SIMD_Mode::SSE128_NOSLIDE) + + if(simd_mode == SIMD_Mode::NONE){ + // No SIMD mode, nothing to delete + return; + } + + #ifdef __SSE3__ + else if(simd_mode == SIMD_Mode::SSE128) delete sse_array; + #endif + + #ifdef __AVX2__ else if(simd_mode == SIMD_Mode::AVX256) delete avx256_array; + #endif + #if defined(__AVX512F__) else if(simd_mode == SIMD_Mode::AVX512) delete avx512_array; #endif + + #ifdef __ARM_NEON + else if(simd_mode == SIMD_Mode::NEON) + delete neon_array; + #endif + + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC) + delete altivec_array; + #endif } uint64_t RAM_Chunking::find_cutpoint(char* buff, uint64_t size) { @@ -77,159 +159,61 @@ uint64_t RAM_Chunking::find_cutpoint(char* buff, uint64_t size) { else if(size < window_size) return size; - // If SSE128 with normal sliding - if (simd_mode == SIMD_Mode::SSE128_NOSLIDE){ - max_value = find_maximum_sse128(buff, 0, window_size, sse_array); + if(simd_mode == SIMD_Mode::NONE){ + // If no SIMD enabled, execute basic find_maximum + for(i = 0; i < window_size; i++){ + if ((uint8_t)buff[i] >= max_value) + max_value = (uint8_t)buff[i]; + } + + for (i = window_size; i < size; i++) { + if ((uint8_t)buff[i] >= max_value) + return i; + } + } + + #ifdef __SSE3__ // If SIMD enabled, accelerate find_maximum() and slide depending on chosen SIMD mode else if(simd_mode == SIMD_Mode::SSE128){ max_value = find_maximum_sse128(buff, 0, window_size, sse_array); - return get_return_position_sse128(buff, window_size, size, max_value); + return range_scan_geq_sse128(buff, window_size, size, max_value); } + #endif + + #ifdef __AVX2__ else if(simd_mode == SIMD_Mode::AVX256){ max_value = find_maximum_avx256(buff, 0, window_size, avx256_array); - return get_return_position_avx256(buff, window_size, size, max_value); + return range_scan_geq_avx256(buff, window_size, size, max_value); } + #endif + #if defined(__AVX512F__) else if(simd_mode == SIMD_Mode::AVX512){ max_value = find_maximum_avx512(buff, 0, window_size, avx512_array); - return get_return_position_avx512(buff, window_size, size, max_value); + return range_scan_geq_avx512(buff, window_size, size, max_value); } #endif - - // Else execute basic find_maximum, same as native RAM - else { - for(i = 0; i < window_size; i++){ - if ((uint8_t)buff[i] >= max_value) - max_value = (uint8_t)buff[i]; - } - } - for (i = window_size; i < size; i++) { - if ((uint8_t)buff[i] >= max_value) - return i; + #if defined(__ARM_NEON) + else if(simd_mode == SIMD_Mode::NEON){ + max_value = find_maximum_neon(buff, 0, window_size, neon_array); + return range_scan_geq_neon(buff, window_size, size, max_value); } + #endif - return size; -} - -uint64_t RAM_Chunking::get_return_position_sse128(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value){ - - uint64_t num_vectors = (end_position - start_position) / SSE_REGISTER_SIZE_BYTES; - uint64_t curr_scan_start; - - // Structures to store bytes from data stream and comparison results in 128-bit SSE format - __m128i xmm_array, cmp_array; - int cmp_mask; - - // Load max_value into xmm-format - __m128i max_val_xmm = _mm_set1_epi8((char)max_value); - - for(uint64_t i = 0; i < num_vectors; i++){ - curr_scan_start = start_position + (i * SSE_REGISTER_SIZE_BYTES); - // Load data into xmm register - xmm_array = _mm_loadu_si128((__m128i const *)(buff + curr_scan_start)); - - /* - - Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, - ALL the corresponding bits of the corresponding byte in cmp_array are set to 1. - */ - - #if defined(__AVX512F__) - cmp_mask = _mm_cmpge_epu8_mask(xmm_array, max_val_xmm); - #else - cmp_array = GreaterOrEqual8uSSE(xmm_array, max_val_xmm); - - // Create a mask using the most-significant bit of each byte value in cmp_array - cmp_mask = _mm_movemask_epi8(cmp_array); - #endif - - - - // Return index of first non-zero bit in mask - // This corresponds to the first non-zero byte in cmp_array - if(cmp_mask) - return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + #ifdef __ALTIVEC__ + else if(simd_mode == SIMD_Mode::ALTIVEC){ + max_value = find_maximum_altivec(buff, 0, window_size, altivec_array); + return range_scan_geq_altivec(buff, window_size, size, max_value); } - - return end_position; -} - -uint64_t RAM_Chunking::get_return_position_avx256(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value){ - - uint64_t num_vectors = (end_position - start_position) / AVX256_REGISTER_SIZE_BYTES; - uint64_t curr_scan_start; - - // Structures to store bytes from data stream and comparison results in 128-bit SSE format - __m256i xmm_array, cmp_array; - uint32_t cmp_mask; - - // Load max_value into xmm-format - __m256i max_val_xmm = _mm256_set1_epi8((char)max_value); + #endif - for(uint64_t i = 0; i < num_vectors; i++){ - curr_scan_start = start_position + (i * AVX256_REGISTER_SIZE_BYTES); - // Load data into xmm register - xmm_array = _mm256_loadu_si256((__m256i const *)(buff + curr_scan_start)); - - /* - - Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, - the corresponding bit of cmp_mask is set to 1. - */ - #if defined(__AVX512F__) - __mmask32 all_ones = UINT32_MAX; - cmp_mask = _mm256_mask_cmpge_epu8_mask(all_ones, xmm_array, max_val_xmm); - #else - cmp_array = GreaterOrEqual8uAVX256(xmm_array, max_val_xmm); - - // Create a mask using the most-significant bit of each byte value in cmp_array - cmp_mask = _mm256_movemask_epi8(cmp_array); - #endif - - // Return index of first non-zero bit in mask - // This corresponds to the first non-zero byte in cmp_array - if(cmp_mask) - return curr_scan_start + (__builtin_ffs(cmp_mask) - 1); + else { + std::cout << "Error: Unsupported SIMD mode" << std::endl; + exit(EXIT_FAILURE); } - return end_position; - + return size; } -#if defined(__AVX512F__) - uint64_t RAM_Chunking::get_return_position_avx512(char *buff, uint64_t start_position, uint64_t end_position, uint8_t max_value){ - - uint64_t num_vectors = (end_position - start_position) / AVX512_REGISTER_SIZE_BYTES; - uint64_t curr_scan_start; - - // Structures to store bytes from data stream and comparison results in 128-bit SSE format - __m512i xmm_array; - uint64_t cmp_mask; - - // Load max_value into xmm-format - __m512i max_val_xmm = _mm512_set1_epi8((char)max_value); - - for(uint64_t i = 0; i < num_vectors; i++){ - curr_scan_start = start_position + (i * AVX512_REGISTER_SIZE_BYTES); - // Load data into xmm register - xmm_array = _mm512_loadu_si512((__m512i const *)(buff + curr_scan_start)); - - /* - Compare values with max_value. If a byte in xmm_array is geq max_val_xmm, - the corresponding bit of cmp_mask is set to 1. - */ - - // Return a mask with the most significant bit of GEQ comparison byte-wise - cmp_mask = _mm512_cmpge_epu8_mask(xmm_array, max_val_xmm); - - // Return index of first non-zero bit in mask - // This corresponds to the first non-zero byte in cmp_array - if(cmp_mask) - return curr_scan_start + (__builtin_ffsll(cmp_mask) - 1); - } - - return end_position; - } -#endif diff --git a/dedup/src/config/config.cpp b/dedup/src/config/config.cpp index 1463ef0..1f0bb31 100644 --- a/dedup/src/config/config.cpp +++ b/dedup/src/config/config.cpp @@ -23,8 +23,8 @@ ChunkingTech Config::get_chunking_tech() const { return ChunkingTech::FASTCDC; } else if (value == "ram") { return ChunkingTech::RAM; - } else if (value == "experiment") { - return ChunkingTech::EXPERIMENT; + } else if(value == "maxp"){ + return ChunkingTech::MAXP; } else if (value == "crc") { return ChunkingTech::CRC; } else if (value == "seq") { @@ -49,7 +49,11 @@ HashingTech Config::get_hashing_tech() const { return HashingTech::SHA512; } else if (value == "md5") { return HashingTech::MD5; - } + } else if (value == "xxhash128") { + return HashingTech::XXHASH128; + } else if (value == "murmurhash3") { + return HashingTech::MURMURHASH3; + } } catch (...) { } throw ConfigError( @@ -61,23 +65,44 @@ SIMD_Mode Config::get_simd_mode() const { std::string value = parser.get_property(SIMD_MODE_STRING); if (value == "none") { return SIMD_Mode::NONE; - } else if (value == "sse128") { - return SIMD_Mode::SSE128; - } else if (value == "avx256") { - return SIMD_Mode::AVX256; - } else if (value == "avx512") { - if(__builtin_cpu_supports("avx512f")) + } + #ifdef __SSE3__ + else if (value == "sse128") { + return SIMD_Mode::SSE128; + } + else if (value == "sse128_noslide"){ + return SIMD_Mode::SSE128_NOSLIDE; + } + #endif + + #ifdef __AVX2__ + else if (value == "avx256") { + return SIMD_Mode::AVX256; + } + #endif + + #ifdef __AVX512F__ + else if (value == "avx512") { return SIMD_Mode::AVX512; - else - throw ConfigError( - "Invalid SIMD Mode in configuration file: AVX-512 not supported by CPU"); - } else if (value == "sse128_noslide"){ - return SIMD_Mode::SSE128_NOSLIDE; + } + #endif + #ifdef __ARM_NEON + else if (value == "neon128") { + return SIMD_Mode::NEON; + } + #endif + #ifdef __ALTIVEC__ + else if (value == "altivec128") { + return SIMD_Mode::ALTIVEC; + } + #endif + else { + throw ConfigError("Unsupported SIMD mode"); } } catch (...) { } throw ConfigError( - "The configuration file does not specify a valid SIMD mode"); + "Unsupported SIMD mode. Please check compilation flags and configuration file."); } uint64_t Config::get_fc_size() const { @@ -109,7 +134,7 @@ uint64_t Config::get_rabinc_min_block_size() const { } catch (...) { } throw ConfigError( - "The configuration file does not specify a valid rabins minimum block size"); + "The configuration file does not specify a valid minimum block size"); } uint64_t Config::get_rabinc_avg_block_size() const { @@ -508,3 +533,25 @@ uint64_t Config::get_tttd_max_block_size() const { "The configuration file does not specify a valid TTTD max block " "size"); } + +uint64_t Config::get_maxp_window_size() const { + try { + std::string value = parser.get_property(MAXP_WINDOW_SIZE); + return std::stoull(value); + } catch (...) { + } + throw ConfigError( + "The configuration file does not specify a valid MAXP window " + "size"); +} + +uint64_t Config::get_maxp_max_block_size() const { + try { + std::string value = parser.get_property(MAXP_MAX_BLOCK_SIZE); + return std::stoull(value); + } catch (...) { + } + throw ConfigError( + "The configuration file does not specify a valid MAXP maximum chunk " + "size"); +} \ No newline at end of file diff --git a/dedup/src/driver.cpp b/dedup/src/driver.cpp index de92fe6..07ad3b9 100644 --- a/dedup/src/driver.cpp +++ b/dedup/src/driver.cpp @@ -32,24 +32,25 @@ #include "crc_chunking.hpp" #include "seq_chunking.hpp" #include "tttd_chunking.hpp" +#include "maxp_chunking.hpp" #include "md5_hashing.hpp" #include "sha1_hashing.hpp" #include "sha256_hashing.hpp" #include "sha512_hashing.hpp" +#include "xxhash_hashing.hpp" +#include "murmurhash3_hashing.hpp" bool disable_hashing = false; static void driver_function(const std::filesystem::path& dir_path, - std::unique_ptr& chunk_method, - const std::string& output_file) { + std::unique_ptr& chunk_method, const std::string& output_file) { /** * @brief Uses the specified chunking technique to chunk the file, hash it * using the specified hashing technique and print the hashes * @param chunk_method: Chunking Technique Object. Object from a class * inheriting the Chunking_Technique interface. - * @param hash_method: Hash Technique Object. Object from a class inheriting - * the Hashing_Technique interface. + * @param output_file: Output file path for writing hashes to * @return: void * */ @@ -109,8 +110,7 @@ static void driver_function(const std::filesystem::path& dir_path, std::cout << "Avg Chunk size: " << total_bytes / chunk_count << std::endl; - std::cout << "Chunking Throughput (MB/sec): " - << total_mb / total_seconds_chunking << std::endl; + std::cout << "Chunking Throughput (MB/sec): " << total_mb / total_seconds_chunking << std::endl; std::cout << "Hashing Throughput (MB/sec): " << total_mb / total_seconds_hashing << std::endl; } @@ -122,10 +122,10 @@ int main(int argc, char* argv[]) { * @todo: Add Config class which takes in parameters */ if (argc > 4 || argc < 3) { - std::cout << "Usage: ./dedup.exe [bool]" + std::cout << "Usage: ./dedup.exe [bool]" << std::endl; std::cout - << "\t : Path to directory to run chunking and hashing on." + << "\t : Path to file to run chunking and hashing on." << std::endl; std::cout << "\t : Path to the config file." << std::endl; @@ -181,6 +181,9 @@ int main(int argc, char* argv[]) { case ChunkingTech::CRC: chunk_method = std::make_unique(config); break; + case ChunkingTech::MAXP: + chunk_method = std::make_unique(config); + break; case ChunkingTech::SEQ: chunk_method = std::make_unique(config); break; @@ -205,6 +208,12 @@ int main(int argc, char* argv[]) { case HashingTech::SHA512: chunk_method -> hash_method = std::make_unique(); break; + case HashingTech::XXHASH128: + chunk_method -> hash_method = std::make_unique(); + break; + case HashingTech::MURMURHASH3: + chunk_method -> hash_method = std::make_unique(); + break; default: std::cerr << "Unimplemented hashing technique" << std::endl; exit(EXIT_FAILURE); diff --git a/dedup/src/hashing/murmurhash3_hashing.cpp b/dedup/src/hashing/murmurhash3_hashing.cpp new file mode 100644 index 0000000..b801fe8 --- /dev/null +++ b/dedup/src/hashing/murmurhash3_hashing.cpp @@ -0,0 +1,345 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "murmurhash3_hashing.hpp" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + +// Additions for dedup-bench + +void MurmurHash3_Hashing::hash_chunk(File_Chunk& file_chunk) { + + // Allocate output buffer for the hash + file_chunk.init_hash(HashingTech::MURMURHASH3, MURMURHASH3_DIGEST_LENGTH); + + // Call the appropriate MurmurHash3 function based on the chunk size + MurmurHash3_x64_128(file_chunk.get_data(), file_chunk.get_size(), 0, file_chunk.get_hash()); +} diff --git a/dedup/src/hashing/xxhash_hashing.cpp b/dedup/src/hashing/xxhash_hashing.cpp new file mode 100644 index 0000000..3cd672c --- /dev/null +++ b/dedup/src/hashing/xxhash_hashing.cpp @@ -0,0 +1,15 @@ +#include "xxhash_hashing.hpp" +#include "hash.hpp" +#include + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +void XXHash_Hashing::hash_chunk(File_Chunk& file_chunk) { + file_chunk.init_hash(HashingTech::XXHASH128, XXH128_DIGEST_LENGTH); + + XXH128_hash_t hash_value = XXH3_128bits((const unsigned char*)file_chunk.get_data(), file_chunk.get_size()); + BYTE* hash_ptr = file_chunk.get_hash(); + std::memcpy(hash_ptr, &hash_value, XXH128_DIGEST_LENGTH); + return; +}