diff --git a/README.md b/README.md index 946797c..0e2e896 100755 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ##### Efficient and fastest **Run Length Encoding** library - ARM NEON support + - RISC-V scalar fallback support - 100% C (C++ compatible headers), without inline assembly - Most efficient compression - No other RLE compress or decompress faster with better compression @@ -150,6 +151,7 @@ for more info, see also: [Entropy Coding Benchmark](https://sites.google.com/sit - Linux amd64: Clang (>=3.2) - Linux arm64: 64 bits aarch64 ARMv8: gcc (>=6.3) - Linux arm64: 64 bits aarch64 ARMv8: clang +- Linux riscv64: scalar fallback path with gcc or clang (requires GNU C statement expressions) - MaxOS: XCode (>=9) - PowerPC ppc64le (incl. SIMD): gcc (>=8.0) diff --git a/include_/conf.h b/include_/conf.h index 98c60d2..ca92470 100644 --- a/include_/conf.h +++ b/include_/conf.h @@ -181,6 +181,24 @@ static ALWAYS_INLINE void stof64( void *cp, double static ALWAYS_INLINE void ltou32(unsigned *x, const void *cp) { memcpy(x, cp, sizeof(*x)); } // ua read into ptr static ALWAYS_INLINE void ltou64(unsigned long long *x, const void *cp) { memcpy(x, cp, sizeof(*x)); } + #elif defined(__riscv) +#include <string.h> +#define ctou16(_cp_) ({ unsigned short _x; memcpy(&_x, (_cp_), sizeof(_x)); _x; }) +#define ctou32(_cp_) ({ unsigned _x; memcpy(&_x, (_cp_), sizeof(_x)); _x; }) +#define ctou64(_cp_) ({ uint64_t _x; memcpy(&_x, (_cp_), sizeof(_x)); _x; }) +#define ctof32(_cp_) ({ float _x; memcpy(&_x, (_cp_), sizeof(_x)); _x; }) +#define ctof64(_cp_) ({ double _x; memcpy(&_x, (_cp_), sizeof(_x)); _x; }) + +#define stou8(_cp_, _x_) (*((uint8_t *)(_cp_)) = (_x_)) +#define stou16(_cp_, _x_) do { unsigned short _v = (_x_); memcpy((_cp_), &_v, sizeof(_v)); } while(0) +#define stou32(_cp_, _x_) do { unsigned _v = (_x_); memcpy((_cp_), &_v, sizeof(_v)); } while(0) +#define stou64(_cp_, _x_) do { uint64_t _v = (_x_); memcpy((_cp_), &_v, sizeof(_v)); } while(0) +#define stof32(_cp_, _x_) do { float _v = (_x_); memcpy((_cp_), &_v, 
sizeof(_v)); } while(0) +#define stof64(_cp_, _x_) do { double _v = (_x_); memcpy((_cp_), &_v, sizeof(_v)); } while(0) + +#define ltou32(_px_, _cp_) do { memcpy((_px_), (_cp_), sizeof(*(_px_))); } while(0) +#define ltou64(_px_, _cp_) do { memcpy((_px_), (_cp_), sizeof(*(_px_))); } while(0) + #elif defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\ defined(__powerpc__) || defined(__s390__) ||\ @@ -251,6 +269,7 @@ struct _PACKED doubleu { double d; }; defined(__x86_64__) || defined(_M_X64) ||\ defined(__ia64) || defined(_M_IA64) ||\ defined(__aarch64__) ||\ + (defined(__riscv) && (__riscv_xlen == 64)) ||\ defined(__mips64) ||\ defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ defined(__s390x__) diff --git a/include_/time_.h b/include_/time_.h index 7735823..e24ca2d 100644 --- a/include_/time_.h +++ b/include_/time_.h @@ -24,7 +24,7 @@ // time_.h : parameter free high precision time/benchmark functions #include <time.h> #include <float.h> - #ifdef _WIN32 + #if defined(_WIN32) && !defined(__riscv) #include <windows.h> #ifndef sleep #define sleep(n) Sleep((n) * 1000) @@ -104,13 +104,19 @@ static int tmiszero(tm_t t) { return !t; } #define TM_MBS "MB/s" static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; } - #ifdef _WIN32 //-------- windows + #if defined(_WIN32) && !defined(__riscv) //-------- windows static LARGE_INTEGER tps; typedef unsigned __int64 tm_t; static tm_t tmtime() { LARGE_INTEGER tm; tm_t t; QueryPerformanceCounter(&tm); return tm.QuadPart; } static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; } static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; } +static int tmiszero(tm_t t) { return !t; } + #elif defined(__riscv) +typedef clock_t tm_t; +static tm_t tmtime() { return clock(); } +static tm_t tminit() { tm_t t0 = tmtime(), t; while((t = tmtime()) == t0) {}; return t; } +static double tmdiff(tm_t 
start, tm_t stop) { return (double)(stop - start) / CLOCKS_PER_SEC; } static int tmiszero(tm_t t) { return !t; } #else // Linux & compatible / MacOS #ifdef __APPLE__ diff --git a/makefile b/makefile index fbf3a98..75f8b99 100644 --- a/makefile +++ b/makefile @@ -27,6 +27,10 @@ else ifneq (,$(findstring aarch64,$(CC))) ARCH = aarch64 +else ifneq (,$(findstring riscv64,$(CC))) + ARCH = riscv64 +else ifneq (,$(findstring riscv32,$(CC))) + ARCH = riscv32 else ifneq (,$(findstring powerpc64le,$(CC))) ARCH = ppc64le endif @@ -43,6 +47,9 @@ else CFLAGS+=-march=armv8-a endif MSSE=-march=armv8-a +else ifneq ($(filter riscv%,$(ARCH)),) + MARCH= + MSSE= else ifeq ($(ARCH),$(filter $(ARCH),x86_64 ppc64le)) CFLAGS=-march=native MSSE=-mssse3 diff --git a/trle_.h b/trle_.h index 9111218..a1abe0c 100644 --- a/trle_.h +++ b/trle_.h @@ -40,9 +40,9 @@ #define _vlput32(_op_, _x_, _act_) {\ if(likely((_x_) < VL_OFS1)){ *_op_++ = (_x_); _act_;}\ - else if ((_x_) < VL_OFS2) { ctou16(_op_) = bswap16((VL_OFS1<<8)+((_x_)-VL_OFS1)); _op_ += 2; _act_;}\ - else if ((_x_) < VL_OFS3) { *_op_++ = VL_BA2 + (((_x_) -= VL_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\ - else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VL_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\ + else if ((_x_) < VL_OFS2) { stou16(_op_, bswap16((VL_OFS1<<8)+((_x_)-VL_OFS1))); _op_ += 2; _act_;}\ + else if ((_x_) < VL_OFS3) { *_op_++ = VL_BA2 + (((_x_) -= VL_OFS2) >> 16); stou16(_op_, (_x_)); _op_ += 2; _act_;}\ + else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VL_BA3 + (_b - 3); stou32(_op_, (_x_)); _op_ += _b; _act_;}\ } #define _vlget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ diff --git a/trled.c b/trled.c index 5af91c1..6d2dfc6 100644 --- a/trled.c +++ b/trled.c @@ -225,12 +225,12 @@ unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict ou while(op < out+(outlen-32)) { #if __WORDSIZE == 64 uint64_t z = (uint64_t)rmap[ip[7]]<<56 | (uint64_t)rmap[ip[6]] << 48 | 
(uint64_t)rmap[ip[5]] << 40 | (uint64_t)rmap[ip[4]] << 32 | (uint32_t)rmap[ip[3]] << 24 | (uint32_t)rmap[ip[2]] << 16| (uint32_t)rmap[ip[1]] << 8| rmap[ip[0]]; - ctou64(op) = ctou64(ip); if(z) goto a; ip += 8; op += 8; + stou64(op, ctou64(ip)); if(z) goto a; ip += 8; op += 8; continue; a: z = ctz64(z)>>3; #else uint32_t z = (uint32_t)rmap[ip[3]] << 24 | (uint32_t)rmap[ip[2]] << 16| (uint32_t)rmap[ip[1]] << 8| rmap[ip[0]]; - ctou32(op) = ctou32(ip); if(z) goto a; ip += 4; op += 4; + stou32(op, ctou32(ip)); if(z) goto a; ip += 4; op += 4; continue; a: z = ctz32(z)>>3; #endif @@ -310,8 +310,8 @@ unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char #define rmemset(_op_, _c_, _i_) do { uint64_t _cc; uint8_t *_up = (uint8_t *)_op_; _op_ +=_i_;\ T2(_cset, USIZE)(_cc,_c_);\ do {\ - T2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\ - T2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\ + T2(stou, USIZE)(_up, _c_); _up += USIZE/8;\ + T2(stou, USIZE)(_up, _c_); _up += USIZE/8;\ } while(_up < (uint8_t *)_op_);\ } while(0) #endif