Skip to content

Commit

Permalink
Merge pull request #4180 from facebook/split_param
Browse files Browse the repository at this point in the history
Block splitter control parameter
  • Loading branch information
Cyan4973 authored Oct 31, 2024
2 parents 5bae43b + bbaba45 commit 15c2916
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 46 deletions.
86 changes: 59 additions & 27 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
assert(cctxParams.ldmParams.hashRateLog < 32);
}
cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams);
cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
Expand Down Expand Up @@ -391,13 +391,13 @@ ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
*/
cctxParams->compressionLevel = compressionLevel;
cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, &params->cParams);
cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm);
}

size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
Expand Down Expand Up @@ -598,11 +598,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = 1;
return bounds;

case ZSTD_c_useBlockSplitter:
case ZSTD_c_splitAfterSequences:
bounds.lowerBound = (int)ZSTD_ps_auto;
bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;

case ZSTD_c_blockSplitterLevel:
bounds.lowerBound = 0;
bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX;
return bounds;

case ZSTD_c_useRowMatchFinder:
bounds.lowerBound = (int)ZSTD_ps_auto;
bounds.upperBound = (int)ZSTD_ps_disable;
Expand Down Expand Up @@ -669,6 +674,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_minMatch:
case ZSTD_c_targetLength:
case ZSTD_c_strategy:
case ZSTD_c_blockSplitterLevel:
return 1;

case ZSTD_c_format:
Expand All @@ -695,7 +701,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_stableOutBuffer:
case ZSTD_c_blockDelimiters:
case ZSTD_c_validateSequences:
case ZSTD_c_useBlockSplitter:
case ZSTD_c_splitAfterSequences:
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
case ZSTD_c_prefetchCDictTables:
Expand Down Expand Up @@ -754,7 +760,8 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_stableOutBuffer:
case ZSTD_c_blockDelimiters:
case ZSTD_c_validateSequences:
case ZSTD_c_useBlockSplitter:
case ZSTD_c_splitAfterSequences:
case ZSTD_c_blockSplitterLevel:
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
case ZSTD_c_prefetchCDictTables:
Expand Down Expand Up @@ -975,10 +982,15 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->validateSequences = value;
return (size_t)CCtxParams->validateSequences;

case ZSTD_c_useBlockSplitter:
BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
return CCtxParams->useBlockSplitter;
case ZSTD_c_splitAfterSequences:
BOUNDCHECK(ZSTD_c_splitAfterSequences, value);
CCtxParams->postBlockSplitter = (ZSTD_paramSwitch_e)value;
return CCtxParams->postBlockSplitter;

case ZSTD_c_blockSplitterLevel:
BOUNDCHECK(ZSTD_c_blockSplitterLevel, value);
CCtxParams->preBlockSplitter_level = value;
return (size_t)CCtxParams->preBlockSplitter_level;

case ZSTD_c_useRowMatchFinder:
BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
Expand Down Expand Up @@ -1135,8 +1147,11 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_validateSequences :
*value = (int)CCtxParams->validateSequences;
break;
case ZSTD_c_useBlockSplitter :
*value = (int)CCtxParams->useBlockSplitter;
case ZSTD_c_splitAfterSequences :
*value = (int)CCtxParams->postBlockSplitter;
break;
case ZSTD_c_blockSplitterLevel :
*value = CCtxParams->preBlockSplitter_level;
break;
case ZSTD_c_useRowMatchFinder :
*value = (int)CCtxParams->useRowMatchFinder;
Expand Down Expand Up @@ -2099,7 +2114,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
{
ZSTD_cwksp* const ws = &zc->workspace;
DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
(U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
(U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter);
assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));

zc->isFirstBlock = 1;
Expand All @@ -2111,7 +2126,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
params = &zc->appliedParams;

assert(params->useRowMatchFinder != ZSTD_ps_auto);
assert(params->useBlockSplitter != ZSTD_ps_auto);
assert(params->postBlockSplitter != ZSTD_ps_auto);
assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
assert(params->maxBlockSize != 0);
if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
Expand Down Expand Up @@ -2517,10 +2532,10 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
/* Copy only compression parameters related to tables. */
params.cParams = srcCCtx->appliedParams.cParams;
assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto);
assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter;
params.ldmParams = srcCCtx->appliedParams.ldmParams;
params.fParams = fParams;
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
Expand Down Expand Up @@ -2728,9 +2743,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
* Returns 1 if true, 0 otherwise. */
static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
{
DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter);
assert(cctxParams->postBlockSplitter != ZSTD_ps_auto);
return (cctxParams->postBlockSplitter == ZSTD_ps_enable);
}

/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
Expand Down Expand Up @@ -4300,7 +4315,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
U32 nbSeq;
size_t cSize;
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable);
assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable);

{ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
Expand Down Expand Up @@ -4491,7 +4506,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,

#include "zstd_preSplit.h"

static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, ZSTD_strategy strat, S64 savings)
static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings)
{
/* split level based on compression strategy, from `fast` to `btultra2` */
static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 };
Expand All @@ -4505,10 +4520,22 @@ static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t src
* require verified savings to allow pre-splitting.
* Note: as a consequence, the first full block is not split.
*/
if (savings < 3) return 128 KB;
/* dynamic splitting has a cpu cost for analysis,
* select a variant among multiple gradual speed/accuracy tradeoffs */
return ZSTD_splitBlock(src, blockSizeMax, splitLevels[strat], cctx->tmpWorkspace, cctx->tmpWkspSize);
if (savings < 3) {
DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings);
return 128 KB;
}
/* apply @splitLevel, or use default value (which depends on @strat).
* note that splitting heuristic is still conditioned by @savings >= 3,
* so the first block will not reach this code path */
if (splitLevel == 1) return 128 KB;
if (splitLevel == 0) {
assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2);
splitLevel = splitLevels[strat];
} else {
assert(2 <= splitLevel && splitLevel <= 6);
splitLevel -= 2;
}
return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize);
}

/*! ZSTD_compress_frameChunk() :
Expand Down Expand Up @@ -4539,7 +4566,12 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,

while (remaining) {
ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
size_t const blockSize = ZSTD_optimalBlockSize(cctx, ip, remaining, blockSizeMax, cctx->appliedParams.cParams.strategy, savings);
size_t const blockSize = ZSTD_optimalBlockSize(cctx,
ip, remaining,
blockSizeMax,
cctx->appliedParams.preBlockSplitter_level,
cctx->appliedParams.cParams.strategy,
savings);
U32 const lastBlock = lastFrameChunk & (blockSize == remaining);
assert(blockSize <= remaining);

Expand Down Expand Up @@ -6286,7 +6318,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
dictSize, mode);
}

params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, &params.cParams);
params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
Expand Down
20 changes: 15 additions & 5 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,21 @@ struct ZSTD_CCtx_params_s {
ZSTD_sequenceFormat_e blockDelimiters;
int validateSequences;

/* Block splitting */
ZSTD_paramSwitch_e useBlockSplitter;
/* Block splitting
* @postBlockSplitter executes split analysis after sequences are produced,
* it's more accurate but consumes more resources.
* @preBlockSplitter_level splits before knowing sequences,
* it's more approximate but also cheaper.
* Valid @preBlockSplitter_level values range from 0 to 6 (inclusive).
* 0 means auto, 1 means do not split,
* then levels are sorted by increasing cpu budget, from 2 (fastest) to 6 (slowest).
* Highest @preBlockSplitter_level combines well with @postBlockSplitter.
*/
ZSTD_paramSwitch_e postBlockSplitter;
int preBlockSplitter_level;

/* Adjust the max block size*/
size_t maxBlockSize;

/* Param for deciding whether to use row-based matchfinder */
ZSTD_paramSwitch_e useRowMatchFinder;
Expand All @@ -368,9 +381,6 @@ struct ZSTD_CCtx_params_s {
void* extSeqProdState;
ZSTD_sequenceProducer_F extSeqProdFunc;

/* Adjust the max block size*/
size_t maxBlockSize;

/* Controls repcode search in external sequence parsing */
ZSTD_paramSwitch_e searchForExternalRepcodes;
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
Expand Down
1 change: 1 addition & 0 deletions lib/compress/zstd_preSplit.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize,
int level,
void* workspace, size_t wkspSize)
{
DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level);
assert(0<=level && level<=4);
if (level == 0)
return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize);
Expand Down
38 changes: 32 additions & 6 deletions lib/zstd.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,8 @@ typedef enum {
* ZSTD_c_stableOutBuffer
* ZSTD_c_blockDelimiters
* ZSTD_c_validateSequences
* ZSTD_c_useBlockSplitter
* ZSTD_c_blockSplitterLevel
* ZSTD_c_splitAfterSequences
* ZSTD_c_useRowMatchFinder
* ZSTD_c_prefetchCDictTables
* ZSTD_c_enableSeqProducerFallback
Expand All @@ -518,7 +519,8 @@ typedef enum {
ZSTD_c_experimentalParam16=1013,
ZSTD_c_experimentalParam17=1014,
ZSTD_c_experimentalParam18=1015,
ZSTD_c_experimentalParam19=1016
ZSTD_c_experimentalParam19=1016,
ZSTD_c_experimentalParam20=1017
} ZSTD_cParameter;

typedef struct {
Expand Down Expand Up @@ -2148,16 +2150,40 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
*/
#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12

/* ZSTD_c_useBlockSplitter
* Controlled with ZSTD_paramSwitch_e enum.
/* ZSTD_c_blockSplitterLevel
* note: this parameter only influences the first splitter stage,
* which is active before producing the sequences.
* ZSTD_c_splitAfterSequences controls the next splitter stage,
* which is active after sequence production.
* Note that both can be combined.
* Allowed values range from 0 to ZSTD_BLOCKSPLITTER_LEVEL_MAX (inclusive).
* 0 means "auto", which will select a value depending on the current ZSTD_c_strategy.
* 1 means no splitting.
* Then, values from 2 to 6 are sorted by increasing cpu load.
*
* Note that currently the first block is never split,
* to ensure expansion guarantees in presence of incompressible data.
*/
#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6
#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20

/* ZSTD_c_splitAfterSequences
* This is a stronger splitter algorithm,
* based on actual sequences previously produced by the selected parser.
* It's also slower, and as a consequence, mostly used for high compression levels.
* While the post-splitter does overlap with the pre-splitter,
* both can nonetheless be combined,
* notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX,
* resulting in a higher compression ratio than either one alone.
*
* Default is ZSTD_ps_auto.
* Set to ZSTD_ps_disable to never use block splitter.
* Set to ZSTD_ps_enable to always use block splitter.
*
* By default, in ZSTD_ps_auto, the library will decide at runtime whether to use
* block splitting based on the compression parameters.
*/
#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13
#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13

/* ZSTD_c_useRowMatchFinder
* Controlled with ZSTD_paramSwitch_e enum.
Expand Down Expand Up @@ -2236,7 +2262,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
* that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
* bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
* compressBound() inaccurate). Currently only meant to be used for testing.
*
*/
#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18

Expand Down Expand Up @@ -2264,6 +2289,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
*/
#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19


/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.
Expand Down
3 changes: 2 additions & 1 deletion tests/fuzz/zstd_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, producer);
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, producer);
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, producer);
setRand(cctx, ZSTD_c_useBlockSplitter, 0, 2, producer);
setRand(cctx, ZSTD_c_blockSplitterLevel, 0, ZSTD_BLOCKSPLITTER_LEVEL_MAX, producer);
setRand(cctx, ZSTD_c_splitAfterSequences, 0, 2, producer);
setRand(cctx, ZSTD_c_deterministicRefPrefix, 0, 1, producer);
setRand(cctx, ZSTD_c_prefetchCDictTables, 0, 2, producer);
setRand(cctx, ZSTD_c_maxBlockSize, ZSTD_BLOCKSIZE_MAX_MIN, ZSTD_BLOCKSIZE_MAX, producer);
Expand Down
Loading

0 comments on commit 15c2916

Please sign in to comment.