Skip to content

Commit 7db5603

Browse files
feat: Enhance Huffman encoding in pzip for improved compression efficiency
- Added new methods for histogram calculation, dynamic header writing, and code generation in HuffmanBitWriter.
- Introduced a new writeBlockHuff method to handle Huffman-only compression for low token counts.
- Implemented quick entropy detection to optimize compression decisions based on data characteristics.
- Updated FastDeflate to utilize the new Huffman encoding features for better performance.

Log: Improve Huffman encoding and compression efficiency in pzip
bug: https://pms.uniontech.com/bug-view-346679.html
1 parent 37c12e0 commit 7db5603

4 files changed

Lines changed: 339 additions & 84 deletions

File tree

3rdparty/pzip/include/pzip/fast_deflate.h

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717

1818
#include <cstdint>
1919
#include <cstring>
20+
#include <functional>
2021
#include <vector>
2122
#include <array>
2223
#include <algorithm>
2324
#include <memory>
25+
#include <utility>
2426

2527
#if defined(__GNUC__) || defined(__clang__)
2628
#define PZIP_FORCE_INLINE __attribute__((always_inline)) inline
@@ -330,32 +332,46 @@ class HuffmanBitWriter {
330332

331333
void writeBlock(Tokens* tokens, bool eof, const uint8_t* input, size_t inputLen);
332334
void writeBlockDynamic(Tokens* tokens, bool eof, const uint8_t* input, size_t inputLen, bool sync);
335+
void writeBlockHuff(bool eof, const uint8_t* input, size_t inputLen, bool sync);
333336

334337
void writeTokens(const Token* tokens, size_t n, const HCode* leCodes, const HCode* oeCodes);
335338

336339
// Read-only and mutable accessors returning a reference to the writer's output_ byte buffer.
const std::vector<uint8_t>& data() const { return output_; }
337340
std::vector<uint8_t>& data() { return output_; }
338341

342+
// Stores the caller-supplied value in logNewTablePenalty_. How the penalty is consumed is not visible in this hunk.
void setLogNewTablePenalty(int penalty) { logNewTablePenalty_ = penalty; }
343+
339344
private:
340345
void writeOutBits();
341346
void indexTokens(Tokens* t, bool alwaysEOB);
342347
void generate();
343348
int extraBitSize();
344349
int fixedSize(int extraBits);
345350
int storedSize(const uint8_t* input, size_t len, bool* storable);
351+
void histogram(const uint8_t* input, size_t len);
352+
std::pair<int, int> headerSize();
353+
void generateCodegen(int numLiterals, int numOffsets, HuffmanEncoder* litEnc, HuffmanEncoder* offEnc);
354+
int codegens();
355+
void writeDynamicHeader(int numLiterals, int numOffsets, int numCodegens, bool isEof);
346356

347357
std::vector<uint8_t> output_;
348358
uint64_t bits_ = 0;
349359
uint8_t nbits_ = 0;
350360
uint8_t nbytes_ = 0;
351361
int lastHeader_ = 0;
362+
bool lastHuffMan_ = false;
363+
int logNewTablePenalty_ = 7;
352364

353365
std::array<uint8_t, 256 + 8> bytes_;
354366
std::array<uint16_t, LENGTH_CODES_START + 32> literalFreq_;
355367
std::array<uint16_t, 32> offsetFreq_;
368+
std::array<uint16_t, 19> codegenFreq_;
369+
std::array<uint8_t, LITERAL_COUNT + OFFSET_CODE_COUNT + 1> codegen_;
356370

357371
std::unique_ptr<HuffmanEncoder> literalEncoding_;
358372
std::unique_ptr<HuffmanEncoder> offsetEncoding_;
373+
std::unique_ptr<HuffmanEncoder> tmpLitEncoding_;
374+
std::unique_ptr<HuffmanEncoder> codegenEncoding_;
359375
};
360376

361377
// ============================================================================
@@ -448,22 +464,39 @@ size_t deflateCompress(const uint8_t* input, size_t inputSize,
448464
CompressionLevel level = CompressionLevel::DefaultCompression);
449465

450466
// ============================================================================
451-
// DeflateStream
467+
// FlateWriter - streaming compressor (modeled on Go klauspost/compress flate.Writer)
452468
// ============================================================================
453469

454-
class DeflateStream {
470+
// Output callback type (modeled on Go io.Writer)
471+
using WriteFunc = std::function<void(const uint8_t*, size_t)>;
472+
473+
class FlateWriter {
455474
public:
456-
explicit DeflateStream(CompressionLevel level = CompressionLevel::DefaultCompression);
457-
~DeflateStream();
475+
// Takes the output destination (modeled on Go flate.NewWriter(w io.Writer, level int))
476+
explicit FlateWriter(WriteFunc output, CompressionLevel level = CompressionLevel::BestSpeed);
477+
~FlateWriter() = default;
458478

479+
// Streaming write of input data (modeled on Go compressor.write)
459480
size_t write(const uint8_t* data, size_t size);
460-
size_t finish(std::vector<uint8_t>& output);
461-
void reset();
481+
482+
// Finish compression (modeled on Go compressor.Close)
483+
void close();
484+
485+
// Reset and set a new output destination (modeled on Go compressor.Reset)
486+
void reset(WriteFunc output);
462487

463488
private:
464-
std::unique_ptr<FastDeflate> deflate_;
465-
std::vector<uint8_t> buffer_;
466-
static constexpr size_t BUFFER_SIZE = 128 * 1024;
489+
void storeFast();
490+
size_t fillBlock(const uint8_t* data, size_t size);
491+
void flushOutput();
492+
493+
WriteFunc output_;
494+
std::vector<uint8_t> window_;
495+
size_t windowEnd_ = 0;
496+
497+
std::unique_ptr<FastEncL1> encoder_;
498+
std::unique_ptr<HuffmanBitWriter> writer_;
499+
Tokens tokens_;
467500
};
468501

469502
} // namespace pzip

3rdparty/pzip/src/archiver.cpp

Lines changed: 16 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -180,75 +180,35 @@ Error Archiver::compressFile(FileTask* task) {
180180
}
181181

182182
// Compresses the file described by `task`. Directories return an empty Error() immediately.
// Per the added (+) lines: the file is read in BUFFER_SIZE (32 * 1024 byte) chunks, each chunk is
// folded into a running ::crc32 value and passed to a FlateWriter whose output callback forwards
// to task->write; the final checksum is stored in task->header.crc32 before writer.close().
// The removed (-) lines show the previous whole-file libdeflate / FastDeflate implementation.
Error Archiver::compress(FileTask* task) {
183-
// Directories do not need compression
184183
if (fs::is_directory(task->status)) {
185184
return Error();
186185
}
187186

188-
// Open the source file
189187
std::ifstream file(task->path, std::ios::binary);
190188
if (!file.is_open()) {
191189
return Error(ErrorCode::FILE_OPEN_ERROR, "Cannot open file: " + task->path.string());
192190
}
193191

194-
// Read the whole file into memory
195-
std::vector<uint8_t> fileData(task->fileSize);
196-
file.read(reinterpret_cast<char*>(fileData.data()), task->fileSize);
197-
if (static_cast<size_t>(file.gcount()) != task->fileSize) {
198-
return Error(ErrorCode::FILE_READ_ERROR, "Failed to read file: " + task->path.string());
199-
}
200-
file.close();
201-
202-
#ifdef USE_LIBDEFLATE
203-
// Use libdeflate (high performance)
204-
// Note: libdeflate level 1 is the fastest, level 12 gives the highest compression ratio
205-
// Defaults to level 1 (fastest); users can adjust it with options such as -6
206-
task->header.crc32 = libdeflate_crc32(0, fileData.data(), fileData.size());
207-
208-
int level = options_.compressionLevel;
209-
if (level < 1 || level > 12) level = 1; // default to the fastest level
210-
211-
struct libdeflate_compressor* compressor = libdeflate_alloc_compressor(level);
212-
if (!compressor) {
213-
return Error(ErrorCode::COMPRESSION_ERROR, "Failed to create compressor");
214-
}
215-
216-
size_t maxCompressedSize = libdeflate_deflate_compress_bound(compressor, fileData.size());
217-
std::vector<uint8_t> compressed(maxCompressedSize);
218-
219-
size_t compressedSize = libdeflate_deflate_compress(
220-
compressor,
221-
fileData.data(), fileData.size(),
222-
compressed.data(), compressed.size()
223-
);
192+
FlateWriter writer([task](const uint8_t* data, size_t size) {
193+
task->write(data, size);
194+
});
224195

225-
libdeflate_free_compressor(compressor);
196+
constexpr size_t BUFFER_SIZE = 32 * 1024;
197+
std::vector<uint8_t> buf(BUFFER_SIZE);
198+
uint32_t crc = 0;
226199

227-
if (compressedSize == 0 && !fileData.empty()) {
228-
return Error(ErrorCode::COMPRESSION_ERROR, "Compression failed");
229-
}
230-
231-
task->write(compressed.data(), compressedSize);
232-
#else
233-
// Use the built-in compressor - thread_local avoids creating a new object each time
234-
task->header.crc32 = ::crc32(0L, fileData.data(), fileData.size());
235-
236-
// thread_local compressor (fastest level) and output buffer, to avoid repeated allocation
237-
thread_local FastDeflate deflate(CompressionLevel::BestSpeed);
238-
thread_local std::vector<uint8_t> compressed;
239-
240-
// Reset the compressor state and clear the buffer
241-
deflate.reset();
242-
compressed.clear();
243-
244-
size_t compressedSize = deflate.compress(fileData.data(), fileData.size(), compressed);
245-
246-
if (compressedSize == 0 && !fileData.empty()) {
247-
return Error(ErrorCode::COMPRESSION_ERROR, "Compression failed");
200+
// NOTE(review): this loop just stops once the stream is no longer good(); nothing after it
// checks for a read failure and the function still returns Error(), whereas the removed code
// returned FILE_READ_ERROR on a short read. Confirm that dropping the check is intended.
while (file.good() && !file.eof()) {
201+
file.read(reinterpret_cast<char*>(buf.data()), buf.size());
202+
// A final partial chunk is still processed: gcount() reports how many bytes this read() extracted.
auto bytesRead = file.gcount();
203+
if (bytesRead > 0) {
204+
crc = ::crc32(crc, buf.data(), bytesRead);
205+
writer.write(buf.data(), bytesRead);
206+
}
248207
}
208+
file.close();
249209

250-
task->write(compressed.data(), compressed.size());
251-
#endif
210+
task->header.crc32 = crc;
211+
writer.close();
252212

253213
return Error();
254214
}

0 commit comments

Comments
 (0)