| 1 |
/* |
| 2 |
* Copyright (c) Yann Collet, Facebook, Inc. |
| 3 |
* All rights reserved. |
| 4 |
* |
| 5 |
* This source code is licensed under both the BSD-style license (found in the |
| 6 |
* LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 |
* in the COPYING file in the root directory of this source tree). |
| 8 |
* You may select, at your option, one of the above-listed licenses. |
| 9 |
*/ |
| 10 |
|
| 11 |
#ifndef DICTBUILDER_H_001 |
| 12 |
#define DICTBUILDER_H_001 |
| 13 |
|
| 14 |
#if defined (__cplusplus) |
| 15 |
extern "C" { |
| 16 |
#endif |
| 17 |
|
| 18 |
|
| 19 |
/*====== Dependencies ======*/ |
| 20 |
#include <stddef.h> /* size_t */ |
| 21 |
|
| 22 |
|
| 23 |
/* =====   ZDICTLIB_API : control library symbols visibility   ===== */

/* ZDICTLIB_VISIBILITY:
 * May be pre-defined by the build. Otherwise it expands to
 * `__attribute__ ((visibility ("default")))` when __GNUC__ >= 4,
 * and to nothing on any other compiler. */
#ifndef ZDICTLIB_VISIBILITY
#  if defined(__GNUC__) && (__GNUC__ >= 4)
#    define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
#  else
#    define ZDICTLIB_VISIBILITY
#  endif
#endif

/* ZDICTLIB_API:
 * Prefix applied to the public declarations of this header.
 *  - ZSTD_DLL_EXPORT==1 : adds __declspec(dllexport)
 *  - ZSTD_DLL_IMPORT==1 : adds __declspec(dllimport)
 *  - otherwise          : ZDICTLIB_VISIBILITY only */
#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
#  define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
   /* dllimport isn't required, but it allows generating better code,
    * saving a function pointer load from the IAT and an indirect jump. */
#  define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY
#else
#  define ZDICTLIB_API ZDICTLIB_VISIBILITY
#endif
| 38 |
|
| 39 |
/******************************************************************************* |
| 40 |
* Zstd dictionary builder |
| 41 |
* |
| 42 |
* FAQ |
| 43 |
* === |
| 44 |
* Why should I use a dictionary? |
| 45 |
* ------------------------------ |
| 46 |
* |
| 47 |
* Zstd can use dictionaries to improve compression ratio of small data.
* Traditionally small files don't compress well because there is very little
* repetition in a single sample, since it is small. But, if you are compressing
* many similar files, like a bunch of JSON records that share the same
* structure, you can train a dictionary ahead of time on some samples of
* these files. Then, zstd can use the dictionary to find repetitions that are
* present across samples. This can vastly improve compression ratio.
| 54 |
* |
| 55 |
* When is a dictionary useful? |
| 56 |
* ---------------------------- |
| 57 |
* |
| 58 |
* Dictionaries are useful when compressing many small files that are similar. |
| 59 |
* The larger a file is, the less benefit a dictionary will have. Generally, |
| 60 |
* we don't expect dictionary compression to be effective past 100KB. And the |
| 61 |
* smaller a file is, the more we would expect the dictionary to help. |
| 62 |
* |
| 63 |
* How do I use a dictionary? |
| 64 |
* -------------------------- |
| 65 |
* |
| 66 |
* Simply pass the dictionary to the zstd compressor with |
| 67 |
* `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to |
| 68 |
* the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other |
| 69 |
* more advanced functions that allow selecting some options, see zstd.h for |
| 70 |
* complete documentation. |
| 71 |
* |
| 72 |
* What is a zstd dictionary? |
| 73 |
* -------------------------- |
| 74 |
* |
| 75 |
* A zstd dictionary has two pieces: Its header, and its content. The header |
| 76 |
* contains a magic number, the dictionary ID, and entropy tables. These |
| 77 |
* entropy tables allow zstd to save on header costs in the compressed file, |
| 78 |
* which really matters for small data. The content is just bytes, which are |
| 79 |
* repeated content that is common across many samples. |
| 80 |
* |
| 81 |
* What is a raw content dictionary? |
| 82 |
* --------------------------------- |
| 83 |
* |
| 84 |
* A raw content dictionary is just bytes. It doesn't have a zstd dictionary |
| 85 |
* header, a dictionary ID, or entropy tables. Any buffer is a valid raw |
| 86 |
* content dictionary. |
| 87 |
* |
| 88 |
* How do I train a dictionary? |
| 89 |
* ---------------------------- |
| 90 |
* |
| 91 |
* Gather samples from your use case. These samples should be similar to each |
| 92 |
* other. If you have several use cases, you could try to train one dictionary |
| 93 |
* per use case. |
| 94 |
* |
| 95 |
* Pass those samples to `ZDICT_trainFromBuffer()` and that will train your |
| 96 |
* dictionary. There are a few advanced versions of this function, but this |
| 97 |
* is a great starting point. If you want to further tune your dictionary |
| 98 |
* you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow |
| 99 |
* you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`. |
| 100 |
* |
| 101 |
* If the dictionary training function fails, that is likely because you
* either passed too few samples, or a dictionary would not be effective
* for your data. Look at the messages that the dictionary trainer printed:
* if they don't say too few samples, then a dictionary would not be effective.
| 105 |
* |
| 106 |
* How large should my dictionary be? |
| 107 |
* ---------------------------------- |
| 108 |
* |
| 109 |
* A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
* The zstd CLI defaults to a 110KB dictionary. You likely don't need a
* dictionary larger than that. But, most use cases can get away with a
* smaller dictionary. The advanced dictionary builders can automatically
* shrink the dictionary for you, and select the smallest size that
* doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
* A smaller dictionary can save memory, and potentially speed up
* compression.
| 117 |
* |
| 118 |
* How many samples should I provide to the dictionary builder? |
| 119 |
* ------------------------------------------------------------ |
| 120 |
* |
| 121 |
* We generally recommend passing ~100x the size of the dictionary
* in samples. A few thousand should suffice. Having too few samples
* can hurt the dictionary's effectiveness. Having more samples will
* only improve the dictionary's effectiveness. But having too many
* samples can slow down the dictionary builder.
| 126 |
* |
| 127 |
* How do I determine if a dictionary will be effective? |
| 128 |
* ----------------------------------------------------- |
| 129 |
* |
| 130 |
* Simply train a dictionary and try it out. You can use zstd's built-in
* benchmarking tool to test the dictionary effectiveness.
*
*     # Benchmark levels 1-3 without a dictionary
*     zstd -b1e3 -r /path/to/my/files
*     # Benchmark levels 1-3 with a dictionary
*     zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
| 137 |
* |
| 138 |
* When should I retrain a dictionary? |
| 139 |
* ----------------------------------- |
| 140 |
* |
| 141 |
* You should retrain a dictionary when its effectiveness drops. Dictionary |
| 142 |
* effectiveness drops as the data you are compressing changes. Generally, we do |
| 143 |
* expect dictionaries to "decay" over time, as your data changes, but the rate |
| 144 |
* at which they decay depends on your use case. Internally, we regularly |
| 145 |
* retrain dictionaries, and if the new dictionary performs significantly |
| 146 |
* better than the old dictionary, we will ship the new dictionary. |
| 147 |
* |
| 148 |
* I have a raw content dictionary, how do I turn it into a zstd dictionary? |
| 149 |
* ------------------------------------------------------------------------- |
| 150 |
* |
| 151 |
* If you have a raw content dictionary, e.g. by manually constructing it, or |
| 152 |
* using a third-party dictionary builder, you can turn it into a zstd |
| 153 |
* dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to |
| 154 |
* provide some samples of the data. It will add the zstd header to the |
| 155 |
* raw content, which contains a dictionary ID and entropy tables, which |
| 156 |
* will improve compression ratio, and allow zstd to write the dictionary ID |
| 157 |
* into the frame, if you so choose. |
| 158 |
* |
| 159 |
* Do I have to use zstd's dictionary builder? |
| 160 |
* ------------------------------------------- |
| 161 |
* |
| 162 |
* No! You can construct dictionary content however you please, it is just |
| 163 |
* bytes. It will always be valid as a raw content dictionary. If you want |
| 164 |
* a zstd dictionary, which can improve compression ratio, use |
| 165 |
* `ZDICT_finalizeDictionary()`. |
| 166 |
* |
| 167 |
* What is the attack surface of a zstd dictionary? |
| 168 |
* ------------------------------------------------ |
| 169 |
* |
| 170 |
* Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so |
| 171 |
* zstd should never crash, or access out-of-bounds memory no matter what |
| 172 |
* the dictionary is. However, if an attacker can control the dictionary |
| 173 |
* during decompression, they can cause zstd to generate arbitrary bytes, |
| 174 |
* just like if they controlled the compressed data. |
| 175 |
* |
| 176 |
******************************************************************************/ |
| 177 |
|
| 178 |
|
| 179 |
/*! ZDICT_trainFromBuffer(): |
| 180 |
* Train a dictionary from an array of samples. |
| 181 |
* Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4, |
| 182 |
* f=20, and accel=1. |
| 183 |
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
| 184 |
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
| 185 |
* The resulting dictionary will be saved into `dictBuffer`. |
| 186 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 187 |
* or an error code, which can be tested with ZDICT_isError(). |
| 188 |
* Note: Dictionary training will fail if there are not enough samples to construct a |
| 189 |
* dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). |
| 190 |
* If dictionary training fails, you should use zstd without a dictionary, as the dictionary |
| 191 |
* would've been ineffective anyways. If you believe your samples would benefit from a dictionary |
| 192 |
* please open an issue with details, and we can look into it. |
| 193 |
* Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. |
| 194 |
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
| 195 |
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
| 196 |
* In general, it's recommended to provide a few thousands samples, though this can vary a lot. |
| 197 |
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. |
| 198 |
*/ |
| 199 |
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
| 200 |
const void* samplesBuffer, |
| 201 |
const size_t* samplesSizes, unsigned nbSamples); |
| 202 |
|
| 203 |
/* ZDICT_params_t:
 * Settings shared by the dictionary builders declared in this header. */
typedef struct {
    /* Optimize for a specific zstd compression level; 0 means default. */
    int      compressionLevel;

    /* Write log to stderr:
     * 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug. */
    unsigned notificationLevel;

    /* Force dictID value; 0 means auto mode (32-bits random value).
     * NOTE: The zstd format reserves some dictionary IDs for future use.
     *       You may use them in private settings, but be warned that they
     *       may be used by zstd in a public dictionary registry in the future.
     *       These dictionary IDs are:
     *         - low range  : <= 32767
     *         - high range : >= (2^31)
     */
    unsigned dictID;
} ZDICT_params_t;
| 215 |
|
| 216 |
/*! ZDICT_finalizeDictionary(): |
| 217 |
* Given a custom content as a basis for dictionary, and a set of samples, |
| 218 |
* finalize dictionary by adding headers and statistics according to the zstd |
| 219 |
* dictionary format. |
| 220 |
* |
| 221 |
* Samples must be stored concatenated in a flat buffer `samplesBuffer`, |
| 222 |
* supplied with an array of sizes `samplesSizes`, providing the size of each |
| 223 |
* sample in order. The samples are used to construct the statistics, so they |
| 224 |
* should be representative of what you will compress with this dictionary. |
| 225 |
* |
| 226 |
* The compression level can be set in `parameters`. You should pass the |
| 227 |
* compression level you expect to use in production. The statistics for each |
| 228 |
* compression level differ, so tuning the dictionary for the compression level |
| 229 |
* can help quite a bit. |
| 230 |
* |
| 231 |
* You can set an explicit dictionary ID in `parameters`, or allow us to pick |
| 232 |
* a random dictionary ID for you, but we can't guarantee no collisions. |
| 233 |
* |
| 234 |
* The dstDictBuffer and the dictContent may overlap, and the content will be |
| 235 |
* appended to the end of the header. If the header + the content doesn't fit in |
| 236 |
* maxDictSize the beginning of the content is truncated to make room, since it |
| 237 |
* is presumed that the most profitable content is at the end of the dictionary, |
| 238 |
* since that is the cheapest to reference. |
| 239 |
* |
| 240 |
* `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes. |
| 241 |
* `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN). |
| 242 |
* |
| 243 |
* @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`), |
| 244 |
* or an error code, which can be tested by ZDICT_isError(). |
| 245 |
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if |
| 246 |
* instructed to, using notificationLevel>0. |
| 247 |
* NOTE: This function currently may fail in several edge cases including: |
| 248 |
* * Not enough samples |
| 249 |
* * Samples are uncompressible |
| 250 |
* * Samples are all exactly the same |
| 251 |
*/ |
| 252 |
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize, |
| 253 |
const void* dictContent, size_t dictContentSize, |
| 254 |
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
| 255 |
ZDICT_params_t parameters); |
| 256 |
|
| 257 |
|
| 258 |
/*====== Helper functions ======*/ |
| 259 |
ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */ |
| 260 |
ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */ |
| 261 |
ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); |
| 262 |
ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); |
| 263 |
|
| 264 |
|
| 265 |
|
| 266 |
#ifdef ZDICT_STATIC_LINKING_ONLY |
| 267 |
|
| 268 |
/* ==================================================================================== |
| 269 |
* The definitions in this section are considered experimental. |
| 270 |
* They should never be used with a dynamic library, as they may change in the future. |
| 271 |
* They are provided for advanced usages. |
| 272 |
* Use them only in association with static linking. |
| 273 |
* ==================================================================================== */ |
| 274 |
|
| 275 |
/* Lower bounds, in bytes, referenced by the ZDICT_finalizeDictionary()
 * documentation for `dictContentSize` and `maxDictSize` respectively. */
#define ZDICT_CONTENTSIZE_MIN 128
#define ZDICT_DICTSIZE_MIN    256
| 277 |
|
| 278 |
/*! ZDICT_cover_params_t: |
| 279 |
* k and d are the only required parameters. |
| 280 |
* For others, value 0 means default. |
| 281 |
*/ |
| 282 |
typedef struct { |
| 283 |
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ |
| 284 |
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ |
| 285 |
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ |
| 286 |
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ |
| 287 |
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ |
| 288 |
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ |
| 289 |
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */ |
| 290 |
ZDICT_params_t zParams; |
| 291 |
} ZDICT_cover_params_t; |
| 292 |
|
| 293 |
typedef struct { |
| 294 |
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ |
| 295 |
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ |
| 296 |
unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20)*/ |
| 297 |
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ |
| 298 |
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ |
| 299 |
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */ |
| 300 |
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */ |
| 301 |
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ |
| 302 |
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */ |
| 303 |
|
| 304 |
ZDICT_params_t zParams; |
| 305 |
} ZDICT_fastCover_params_t; |
| 306 |
|
| 307 |
/*! ZDICT_trainFromBuffer_cover(): |
| 308 |
* Train a dictionary from an array of samples using the COVER algorithm. |
| 309 |
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
| 310 |
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
| 311 |
* The resulting dictionary will be saved into `dictBuffer`. |
| 312 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 313 |
* or an error code, which can be tested with ZDICT_isError(). |
| 314 |
* See ZDICT_trainFromBuffer() for details on failure modes. |
| 315 |
* Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. |
| 316 |
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
| 317 |
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
| 318 |
* In general, it's recommended to provide a few thousands samples, though this can vary a lot. |
| 319 |
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. |
| 320 |
*/ |
| 321 |
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( |
| 322 |
void *dictBuffer, size_t dictBufferCapacity, |
| 323 |
const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, |
| 324 |
ZDICT_cover_params_t parameters); |
| 325 |
|
| 326 |
/*! ZDICT_optimizeTrainFromBuffer_cover(): |
| 327 |
* The same requirements as above hold for all the parameters except `parameters`. |
| 328 |
* This function tries many parameter combinations and picks the best parameters. |
| 329 |
* `*parameters` is filled with the best parameters found, |
| 330 |
* dictionary constructed with those parameters is stored in `dictBuffer`. |
| 331 |
* |
| 332 |
* All of the parameters d, k, steps are optional. |
| 333 |
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}. |
| 334 |
* if steps is zero it defaults to its default value. |
| 335 |
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000]. |
| 336 |
* |
| 337 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 338 |
* or an error code, which can be tested with ZDICT_isError(). |
| 339 |
* On success `*parameters` contains the parameters selected. |
| 340 |
* See ZDICT_trainFromBuffer() for details on failure modes. |
| 341 |
* Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. |
| 342 |
*/ |
| 343 |
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( |
| 344 |
void* dictBuffer, size_t dictBufferCapacity, |
| 345 |
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
| 346 |
ZDICT_cover_params_t* parameters); |
| 347 |
|
| 348 |
/*! ZDICT_trainFromBuffer_fastCover(): |
| 349 |
* Train a dictionary from an array of samples using a modified version of COVER algorithm. |
| 350 |
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
| 351 |
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
| 352 |
* d and k are required. |
| 353 |
* All other parameters are optional, will use default values if not provided |
| 354 |
* The resulting dictionary will be saved into `dictBuffer`. |
| 355 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 356 |
* or an error code, which can be tested with ZDICT_isError(). |
| 357 |
* See ZDICT_trainFromBuffer() for details on failure modes. |
| 358 |
* Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory. |
| 359 |
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
| 360 |
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
| 361 |
* In general, it's recommended to provide a few thousands samples, though this can vary a lot. |
| 362 |
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. |
| 363 |
*/ |
| 364 |
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, |
| 365 |
size_t dictBufferCapacity, const void *samplesBuffer, |
| 366 |
const size_t *samplesSizes, unsigned nbSamples, |
| 367 |
ZDICT_fastCover_params_t parameters); |
| 368 |
|
| 369 |
/*! ZDICT_optimizeTrainFromBuffer_fastCover(): |
| 370 |
* The same requirements as above hold for all the parameters except `parameters`. |
| 371 |
* This function tries many parameter combinations (specifically, k and d combinations) |
| 372 |
* and picks the best parameters. `*parameters` is filled with the best parameters found, |
| 373 |
* dictionary constructed with those parameters is stored in `dictBuffer`. |
| 374 |
* All of the parameters d, k, steps, f, and accel are optional. |
| 375 |
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}. |
| 376 |
* if steps is zero it defaults to its default value. |
| 377 |
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000]. |
| 378 |
* If f is zero, default value of 20 is used. |
| 379 |
* If accel is zero, default value of 1 is used. |
| 380 |
* |
| 381 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 382 |
* or an error code, which can be tested with ZDICT_isError(). |
| 383 |
* On success `*parameters` contains the parameters selected. |
| 384 |
* See ZDICT_trainFromBuffer() for details on failure modes. |
| 385 |
* Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread. |
| 386 |
*/ |
| 387 |
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer, |
| 388 |
size_t dictBufferCapacity, const void* samplesBuffer, |
| 389 |
const size_t* samplesSizes, unsigned nbSamples, |
| 390 |
ZDICT_fastCover_params_t* parameters); |
| 391 |
|
| 392 |
typedef struct { |
| 393 |
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ |
| 394 |
ZDICT_params_t zParams; |
| 395 |
} ZDICT_legacy_params_t; |
| 396 |
|
| 397 |
/*! ZDICT_trainFromBuffer_legacy(): |
| 398 |
* Train a dictionary from an array of samples. |
| 399 |
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
| 400 |
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
| 401 |
* The resulting dictionary will be saved into `dictBuffer`. |
| 402 |
* `parameters` is optional and can be provided with values set to 0 to mean "default". |
| 403 |
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
| 404 |
* or an error code, which can be tested with ZDICT_isError(). |
| 405 |
* See ZDICT_trainFromBuffer() for details on failure modes. |
| 406 |
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
| 407 |
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
| 408 |
* In general, it's recommended to provide a few thousands samples, though this can vary a lot. |
| 409 |
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. |
| 410 |
* Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. |
| 411 |
*/ |
| 412 |
ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( |
| 413 |
void* dictBuffer, size_t dictBufferCapacity, |
| 414 |
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
| 415 |
ZDICT_legacy_params_t parameters); |
| 416 |
|
| 417 |
|
| 418 |
/* Deprecation warnings */
/* It is generally possible to disable deprecation warnings from compiler,
   for example with -Wno-deprecated-declarations for gcc
   or _CRT_SECURE_NO_WARNINGS in Visual.
   Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */

/* ZDICT_DEPRECATED(message):
 * Declaration prefix for deprecated functions. It always includes ZDICTLIB_API,
 * and adds whichever deprecation marker the compiler is detected to support,
 * unless ZDICT_DISABLE_DEPRECATE_WARNINGS is defined. */
#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
#  define ZDICT_DEPRECATED(message) ZDICTLIB_API   /* disable deprecation warnings */
#else
#  define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
#    define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
#  elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
#  elif (ZDICT_GCC_VERSION >= 301)
     /* this GCC range only gets the message-less form of the attribute */
#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
#  elif defined(_MSC_VER)
#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
#  else
#    pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
#    define ZDICT_DEPRECATED(message) ZDICTLIB_API
#  endif
#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
| 440 |
|
| 441 |
/* ZDICT_addEntropyTablesFromBuffer():
 * Deprecated; use ZDICT_finalizeDictionary() instead. */
ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
| 444 |
|
| 445 |
|
| 446 |
#endif /* ZDICT_STATIC_LINKING_ONLY */ |
| 447 |
|
| 448 |
#if defined (__cplusplus) |
| 449 |
} |
| 450 |
#endif |
| 451 |
|
| 452 |
#endif /* DICTBUILDER_H_001 */ |