GPAK  1.0.0
A general-purpose archive library
gpak_compressors.c
1 #include "gpak_compressors.h"
2 
3 #include "gpak_helper.h"
4 #include "filesystem_tree.h"
5 
6 #include <stdlib.h>
7 #include <assert.h>
8 
9 #ifdef _WIN32
10 #include <windows.h>
11 #else
12 #include <unistd.h>
13 #include <sys/sysinfo.h>
14 #endif
15 
16 // Zlib
17 #include <zlib.h>
18 
19 // Z-standard
20 #include <zstd.h>
21 #include <zdict.h>
22 
23 #define _DICTIONARY_SAMPLE_COUNT 200
24 
25 
/**
 * Returns the number of logical processors that can be used for worker threads.
 *
 * On Windows the value is taken from GetSystemInfo(); on other platforms from
 * sysconf(_SC_NPROCESSORS_ONLN).
 *
 * @return Processor count, never less than 1.
 */
int get_num_threads(void)
{
    long count;

#ifdef _WIN32
    SYSTEM_INFO sysinfo;
    GetSystemInfo(&sysinfo);
    count = (long)sysinfo.dwNumberOfProcessors;
#else
    /* _SC_THREAD_THREADS_MAX is the per-process thread limit (frequently -1,
       meaning "indeterminate"), not the number of CPUs. */
    count = sysconf(_SC_NPROCESSORS_ONLN);
#endif

    /* sysconf() returns -1 on failure; always hand back a usable worker count. */
    if (count < 1)
        count = 1;

    return (int)count;
}
40 
41 
42 uint32_t _gpak_compressor_none(gpak_t* _pak, FILE* _infile, FILE* _outfile)
43 {
44  char* _bufferIn = (char*)malloc(_DEFAULT_BLOCK_SIZE);
45  size_t _readed = 0ull;
46  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
47 
48  fseek(_infile, 0, SEEK_END);
49  size_t _total_size = ftell(_infile);
50  fseek(_infile, 0, SEEK_SET);
51 
52  size_t bytes_readed = 0ull;
53  do
54  {
55  _readed = _freadb(_bufferIn, 1ull, _DEFAULT_BLOCK_SIZE, _infile);
56  _crc32 = crc32(_crc32, _bufferIn, _readed);
57  bytes_readed += _readed;
58  _gpak_pass_progress(_pak, bytes_readed, _total_size, GPAK_STAGE_COMPRESSION);
59  _fwriteb(_bufferIn, 1ull, _readed, _outfile);
60  } while (_readed);
61 
62  free(_bufferIn);
63 
64  return _crc32;
65 }
66 
67 uint32_t _gpak_decompressor_none(gpak_t* _pak, FILE* _infile, FILE* _outfile, size_t _read_size)
68 {
69  char* _bufferIn = (char*)malloc(_DEFAULT_BLOCK_SIZE);
70  size_t bytesReaded = 0ull;
71  size_t nextBlockSize = _DEFAULT_BLOCK_SIZE;
72 
73  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
74 
75  do
76  {
77  size_t _readed = _freadb(_bufferIn, 1ull, nextBlockSize, _infile);
78  _crc32 = crc32(_crc32, _bufferIn, _readed);
79 
80  _fwriteb(_bufferIn, 1ull, _readed, _outfile);
81  bytesReaded += _readed;
82  _gpak_pass_progress(_pak, bytesReaded, _read_size, GPAK_STAGE_DECOMPRESSION);
83 
84  if (_read_size - bytesReaded < nextBlockSize)
85  nextBlockSize = _read_size - bytesReaded;
86  } while (bytesReaded != _read_size);
87 
88  if (bytesReaded != _read_size)
89  return _gpak_make_error(_pak, GPAK_ERROR_READ);
90 
91  free(_bufferIn);
92 
93  return _crc32;
94 }
95 
96 
97 uint32_t _gpak_compressor_deflate(gpak_t* _pak, FILE* _infile, FILE* _outfile)
98 {
99  z_stream strm;
100  strm.zalloc = Z_NULL;
101  strm.zfree = Z_NULL;
102  strm.opaque = Z_NULL;
103 
104  char* _bufferIn = (char*)malloc(_DEFAULT_BLOCK_SIZE);
105  char* _bufferOut = (char*)malloc(_DEFAULT_BLOCK_SIZE);
106 
107  fseek(_infile, 0, SEEK_END);
108  size_t _total_size = ftell(_infile);
109  fseek(_infile, 0, SEEK_SET);
110 
111  int ret = deflateInit(&strm, _pak->header_.compression_level_);
112  if (ret != Z_OK)
113  return _gpak_make_error(_pak, GPAK_ERROR_DEFLATE_INIT);
114 
115  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
116 
117  int flush;
118  unsigned have;
119  size_t _total_readed = 0ull;
120  do
121  {
122  strm.avail_in = _freadb(_bufferIn, 1ull, _DEFAULT_BLOCK_SIZE, _infile);
123  _total_readed += strm.avail_in;
124  _crc32 = crc32(_crc32, _bufferIn, strm.avail_in);
125  _gpak_pass_progress(_pak, _total_readed, _total_size, GPAK_STAGE_COMPRESSION);
126 
127  if (ferror(_infile))
128  {
129  _gpak_make_error(_pak, GPAK_ERROR_READ);
130  goto end;
131  }
132 
133  flush = feof(_infile) ? Z_FINISH : Z_NO_FLUSH;
134  strm.next_in = _bufferIn;
135 
136  do
137  {
138  strm.avail_out = _DEFAULT_BLOCK_SIZE;
139  strm.next_out = _bufferOut;
140  ret = deflate(&strm, flush);
141  assert(ret != Z_STREAM_ERROR);
142 
143  have = _DEFAULT_BLOCK_SIZE - strm.avail_out;
144  if (_fwriteb(_bufferOut, 1ull, have, _outfile) != have || ferror(_outfile))
145  {
146  _gpak_make_error(_pak, GPAK_ERROR_WRITE);
147  goto end;
148  }
149  } while (strm.avail_out == 0);
150  assert(strm.avail_in == 0);
151 
152  } while (flush != Z_FINISH);
153 
154 end:
155  deflateEnd(&strm);
156  free(_bufferIn);
157  free(_bufferOut);
158 
159  return _crc32;
160 }
161 
162 uint32_t _gpak_decompressor_inflate(gpak_t* _pak, FILE* _infile, FILE* _outfile, size_t _read_size)
163 {
164  z_stream strm;
165  strm.zalloc = Z_NULL;
166  strm.zfree = Z_NULL;
167  strm.opaque = Z_NULL;
168 
169  char* _bufferIn = (char*)malloc(_DEFAULT_BLOCK_SIZE);
170  char* _bufferOut = (char*)malloc(_DEFAULT_BLOCK_SIZE);
171 
172  int ret = inflateInit(&strm);
173  if (ret != Z_OK)
174  return _gpak_make_error(_pak, GPAK_ERROR_INFLATE_INIT);
175 
176  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
177 
178  unsigned have;
179  size_t total_read = 0ull;
180 
181  do
182  {
183  size_t bytes_to_read = _read_size - total_read < _DEFAULT_BLOCK_SIZE ? _read_size - total_read : _DEFAULT_BLOCK_SIZE;
184  strm.avail_in = _freadb(_bufferIn, 1, bytes_to_read, _infile);
185  total_read += strm.avail_in;
186  _gpak_pass_progress(_pak, total_read, _read_size, GPAK_STAGE_DECOMPRESSION);
187 
188  strm.next_in = _bufferIn;
189 
190  do
191  {
192  strm.avail_out = _DEFAULT_BLOCK_SIZE;
193  strm.next_out = _bufferOut;
194  ret = inflate(&strm, Z_NO_FLUSH);
195  assert(ret != Z_STREAM_ERROR);
196 
197  switch (ret) {
198  case Z_NEED_DICT:
199  case Z_DATA_ERROR:
200  case Z_MEM_ERROR:
201  goto end;
202  }
203 
204  have = _DEFAULT_BLOCK_SIZE - strm.avail_out;
205 
206  _crc32 = crc32(_crc32, _bufferOut, have);
207 
208  if (_fwriteb(_bufferOut, 1, have, _outfile) != have || ferror(_outfile))
209  {
210  _gpak_make_error(_pak, GPAK_ERROR_INFLATE_FAILED);
211  goto end;
212  }
213 
214  } while (strm.avail_out == 0);
215 
216  } while (ret != Z_STREAM_END);
217 
218 end:
219  free(_bufferIn);
220  free(_bufferOut);
221  (void)inflateEnd(&strm);
222 
223  return _crc32;
224 }
225 
226 
227 uint32_t _gpak_compressor_zstd(gpak_t* _pak, FILE* _infile, FILE* _outfile)
228 {
229  size_t const buffInSize = ZSTD_CStreamInSize();
230  void* const buffIn = malloc(buffInSize);
231  size_t const buffOutSize = ZSTD_CStreamOutSize();
232  void* const buffOut = malloc(buffOutSize);
233 
234  fseek(_infile, 0, SEEK_END);
235  size_t _total_size = ftell(_infile);
236  fseek(_infile, 0, SEEK_SET);
237 
238  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
239 
240  /* Create the context. */
241  ZSTD_CCtx* const cctx = ZSTD_createCCtx();
242 
243  uint32_t thread_count = get_num_threads();
244 
245  /* Set parameters. */
246  ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, _pak->header_.compression_level_);
247  ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
248  ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
249  ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 27);
250  ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, thread_count);
251  ZSTD_CCtx_setParameter(cctx, ZSTD_c_jobSize, buffInSize * thread_count);
252 
253  if (_pak->dictionary_ && _pak->header_.dictionary_size_ > 0)
254  ZSTD_CCtx_loadDictionary(cctx, _pak->dictionary_, _pak->header_.dictionary_size_);
255 
256  size_t _total_readed = 0ull;
257  size_t const toRead = buffInSize;
258  for (;;)
259  {
260  size_t read = _freadb(buffIn, 1, toRead, _infile);
261  _total_readed += read;
262  _crc32 = crc32(_crc32, buffIn, read);
263  _gpak_pass_progress(_pak, _total_readed, _total_size, GPAK_STAGE_COMPRESSION);
264 
265  int const lastChunk = (read < toRead);
266  ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
267 
268  ZSTD_inBuffer input = { buffIn, read, 0 };
269  int finished;
270  do
271  {
272  ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
273  size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
274  //CHECK_ZSTD(remaining);
275  _fwriteb(buffOut, 1, output.pos, _outfile);
276  finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
277  } while (!finished);
278  assert(input.pos == input.size && "Impossible: zstd only returns 0 when the input is completely consumed!");
279 
280  if (lastChunk)
281  break;
282  }
283 
284  ZSTD_freeCCtx(cctx);
285  free(buffIn);
286  free(buffOut);
287 
288  return _crc32;
289 }
290 
291 uint32_t _gpak_decompressor_zstd(gpak_t* _pak, FILE* _infile, FILE* _outfile, size_t _read_size)
292 {
293  size_t const buffInSize = ZSTD_DStreamInSize();
294  void* const buffIn = malloc(buffInSize);
295  size_t const buffOutSize = ZSTD_DStreamOutSize();
296  void* const buffOut = malloc(buffOutSize);
297 
298  uint32_t _crc32 = crc32(0L, Z_NULL, 0);
299 
300  ZSTD_DCtx* const dctx = ZSTD_createDCtx();
301  assert(dctx != NULL && "ZSTD_createDCtx() failed!");
302 
303  if (_pak->dictionary_ && _pak->header_.dictionary_size_ > 0)
304  ZSTD_DCtx_loadDictionary(dctx, _pak->dictionary_, _pak->header_.dictionary_size_);
305 
306  size_t bytesRead = 0;
307  size_t lastRet = 0;
308  int isEmpty = 1;
309 
310  while (bytesRead < _read_size)
311  {
312  size_t const toRead = (bytesRead + buffInSize <= _read_size) ? buffInSize : (_read_size - bytesRead);
313  size_t read = _freadb(buffIn, 1, toRead, _infile);
314 
315  if (!read)
316  {
317  break;
318  }
319 
320  bytesRead += read;
321 
322  _gpak_pass_progress(_pak, bytesRead, _read_size, GPAK_STAGE_DECOMPRESSION);
323 
324  isEmpty = 0;
325  ZSTD_inBuffer input = { buffIn, read, 0 };
326  while (input.pos < input.size)
327  {
328  ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
329  size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
330 
331  _crc32 = crc32(_crc32, buffOut, output.pos);
332 
333  _fwriteb(buffOut, 1, output.pos, _outfile);
334  lastRet = ret;
335  }
336  }
337 
338  if (isEmpty)
339  {
340  _gpak_make_error(_pak, GPAK_ERROR_EMPTY_INPUT);
341  goto clear;
342  }
343 
344  if (lastRet != 0)
345  {
346  _gpak_make_error(_pak, GPAK_ERROR_EOF_BEFORE_EOS);
347  goto clear;
348  }
349 
350 clear:
351  ZSTD_freeDCtx(dctx);
352  free(buffIn);
353  free(buffOut);
354 
355  return _crc32;
356 }
357 
359 {
360  char* samples = NULL;
361  size_t sample_sizes[_DICTIONARY_SAMPLE_COUNT];
362  size_t samples_count = 0ull;
363  size_t samples_capacity = 0ull;
364  size_t current_offset = 0ull;
365 
367  filesystem_tree_node_t* next_directory = _pak->root_;
368  do
369  {
370  if (samples_count >= _DICTIONARY_SAMPLE_COUNT - 1)
371  break;
372 
373  filesystem_tree_file_t* next_file = NULL;
374  while ((next_file = filesystem_iterator_next_file(iterator)))
375  {
376  FILE* _infile = fopen(next_file->path_, "rb");
377 
378  fseek(_infile, 0, SEEK_END);
379  sample_sizes[samples_count] = ftell(_infile);
380  fseek(_infile, 0, SEEK_SET);
381 
382  samples_capacity += sample_sizes[samples_count];
383  samples = (char*)realloc(samples, samples_capacity);
384 
385  _freadb(samples + current_offset, 1ull, sample_sizes[samples_count], _infile);
386 
387  current_offset += sample_sizes[samples_count];
388 
389  fclose(_infile);
390 
391  if (samples_count >= _DICTIONARY_SAMPLE_COUNT - 1)
392  break;
393 
394  ++samples_count;
395  }
396  } while ((next_directory = filesystem_iterator_next_directory(iterator)));
397 
398  filesystem_iterator_free(iterator);
399 
400  size_t average_file_size = 0ull;
401 
402  for (size_t idx = 0ull; idx < samples_count; ++idx)
403  average_file_size += sample_sizes[idx] / samples_count;
404 
405  size_t nearest_pow_of_2 = 1;
406  while (nearest_pow_of_2 < average_file_size)
407  nearest_pow_of_2 *= 2;
408 
409  _pak->header_.dictionary_size_ = nearest_pow_of_2;
410 
411  _pak->dictionary_ = (char*)malloc(_pak->header_.dictionary_size_);
412  _pak->header_.dictionary_size_ = ZDICT_trainFromBuffer(_pak->dictionary_, _pak->header_.dictionary_size_, samples, sample_sizes, samples_count);
413  _pak->dictionary_ = (char*)realloc(_pak->dictionary_, _pak->header_.dictionary_size_);
414 
415  fseek(_pak->stream_, sizeof(pak_header_t), SEEK_SET);
416 
417  _fwriteb(_pak->dictionary_, 1ull, _pak->header_.dictionary_size_, _pak->stream_);
418 
419  free(samples);
420 
421  return _gpak_make_error(_pak, GPAK_ERROR_OK);
422 }
GPAK_API filesystem_tree_node_t * filesystem_iterator_next_directory(filesystem_tree_iterator_t *_iterator)
GPAK_API void filesystem_iterator_free(filesystem_tree_iterator_t *_iterator)
GPAK_API filesystem_tree_iterator_t * filesystem_iterator_create(filesystem_tree_node_t *_root)
GPAK_API filesystem_tree_file_t * filesystem_iterator_next_file(filesystem_tree_iterator_t *_iterator)
GPAK_API uint32_t _gpak_compressor_deflate(gpak_t *_pak, FILE *_infile, FILE *_outfile)
GPAK_API uint32_t _gpak_compressor_none(gpak_t *_pak, FILE *_infile, FILE *_outfile)
GPAK_API uint32_t _gpak_decompressor_none(gpak_t *_pak, FILE *_infile, FILE *_outfile, size_t _read_size)
GPAK_API uint32_t _gpak_decompressor_inflate(gpak_t *_pak, FILE *_infile, FILE *_outfile, size_t _read_size)
GPAK_API uint32_t _gpak_decompressor_zstd(gpak_t *_pak, FILE *_infile, FILE *_outfile, size_t _read_size)
GPAK_API int32_t _gpak_compressor_generate_dictionary(gpak_t *_pak)
GPAK_API uint32_t _gpak_compressor_zstd(gpak_t *_pak, FILE *_infile, FILE *_outfile)
@ GPAK_ERROR_INFLATE_FAILED
Definition: gpak_data.h:169
@ GPAK_ERROR_INFLATE_INIT
Definition: gpak_data.h:168
@ GPAK_ERROR_WRITE
Definition: gpak_data.h:161
@ GPAK_ERROR_EMPTY_INPUT
Definition: gpak_data.h:181
@ GPAK_ERROR_READ
Definition: gpak_data.h:162
@ GPAK_ERROR_DEFLATE_INIT
Definition: gpak_data.h:166
@ GPAK_ERROR_EOF_BEFORE_EOS
Definition: gpak_data.h:182
@ GPAK_ERROR_OK
Definition: gpak_data.h:155
@ GPAK_STAGE_COMPRESSION
Definition: gpak_data.h:226
@ GPAK_STAGE_DECOMPRESSION
Definition: gpak_data.h:227
char compression_level_
Definition: gpak_data.h:114
uint32_t dictionary_size_
Definition: gpak_data.h:116
char * dictionary_
Definition: gpak_data.h:262
struct filesystem_tree_node * root_
Definition: gpak_data.h:260
FILE * stream_
Definition: gpak_data.h:259
pak_header_t header_
Definition: gpak_data.h:258