Skip to content

Commit

Permalink
Merge pull request #129 from Cyan4973/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
Cyan4973 committed Feb 18, 2016
2 parents 201433a + 4c64d51 commit c580b75
Show file tree
Hide file tree
Showing 65 changed files with 4,900 additions and 4,861 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,9 @@ ipch/

# Other files
.directory
_codelite
_zstdbench

lib/zstd_opt_LZ5.c
lib/zstd_opt_llen.c
lib/zstd_opt_nollen.c
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ language: c

before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gcc-arm-linux-gnueabi
- sudo apt-get install -qq clang
- sudo apt-get install -qq g++-multilib
- sudo apt-get install -qq gcc-multilib
Expand All @@ -13,7 +12,7 @@ env:
- ZSTD_TRAVIS_CI_ENV=cmaketest
- ZSTD_TRAVIS_CI_ENV=clangtest
- ZSTD_TRAVIS_CI_ENV=gpptest
- ZSTD_TRAVIS_CI_ENV=armtest
- ZSTD_TRAVIS_CI_ENV=armtest-w-install
- ZSTD_TRAVIS_CI_ENV=test
- ZSTD_TRAVIS_CI_ENV="-C programs test32"
- ZSTD_TRAVIS_CI_ENV="-C programs test-zstd_nolegacy"
Expand Down
41 changes: 31 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ################################################################
# zstd - Makefile
# Copyright (C) Yann Collet 2014-2015
# Copyright (C) Yann Collet 2014-2016
# All rights reserved.
#
# BSD license
Expand All @@ -27,16 +27,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You can contact the author at :
# - zstd source repository : https://github.com/Cyan4973/zstd
# - Public forum : https://groups.google.com/forum/#!forum/lz4c
# - zstd homepage : http://www.zstd.net/
# ################################################################

# force a version number : uncomment below export (otherwise, default to the one declared into zstd.h)
#export VERSION := 0.4.6
#export VERSION := 0.5.1

PRGDIR = programs
ZSTDDIR = lib
DICTDIR = dictBuilder

# Define nul output
ifneq (,$(filter Windows%,$(OS)))
Expand All @@ -52,15 +50,13 @@ default: zstdprogram
all:
$(MAKE) -C $(ZSTDDIR) $@
$(MAKE) -C $(PRGDIR) $@
$(MAKE) -C $(DICTDIR) $@

zstdprogram:
$(MAKE) -C $(PRGDIR)

clean:
@$(MAKE) -C $(ZSTDDIR) $@ > $(VOID)
@$(MAKE) -C $(PRGDIR) $@ > $(VOID)
@$(MAKE) -C $(DICTDIR) $@ > $(VOID)
@echo Cleaning completed


Expand All @@ -81,7 +77,6 @@ travis-install:

test:
$(MAKE) -C $(PRGDIR) $@
$(MAKE) -C $(DICTDIR) $@

cmaketest:
cd contrib/cmake ; cmake . ; $(MAKE)
Expand All @@ -94,8 +89,34 @@ gpptest: clean
$(MAKE) all CC=g++ CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror"

armtest: clean
$(MAKE) -C $(ZSTDDIR) all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
$(MAKE) -C $(PRGDIR) CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static"
# $(MAKE) -C $(ZSTDDIR) all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
$(MAKE) -C $(PRGDIR) datagen # use native, faster
$(MAKE) -C $(PRGDIR) test CC=arm-linux-gnueabi-gcc ZSTDRTTEST= MOREFLAGS=-static # MOREFLAGS="-Werror -static"

# for Travis CI
arminstall: clean
sudo apt-get install -q qemu
sudo apt-get install -q binfmt-support
sudo apt-get install -q qemu-user-static
sudo apt-get install -q gcc-arm-linux-gnueabi

# for Travis CI
armtest-w-install: clean arminstall armtest

ppctest: clean
$(MAKE) -C $(PRGDIR) datagen # use native, faster
$(MAKE) -C $(PRGDIR) test CC=powerpc-linux-gnu-gcc ZSTDRTTEST= MOREFLAGS=-static # MOREFLAGS="-Werror -static"

# for Travis CI
ppcinstall: clean
sudo apt-get install -q qemu
sudo apt-get install -q binfmt-support
sudo apt-get install -q qemu-user-static
sudo apt-get update -q
sudo apt-get install -q gcc-powerpc-linux-gnu # unfortunately, doesn't work on Travis CI (package not available)

# for Travis CI
ppctest-w-install: clean ppcinstall ppctest

usan: clean
$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=undefined"
Expand Down
7 changes: 7 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
v0.5.1
New : Optimal parsing => Very high compression modes, thanks to Przemyslaw Skibinski
Changed : Dictionary builder integrated into libzstd and zstd cli
Changed (!) : zstd cli now uses "multiple input files" as default mode. See `zstd -h`.
Fix : high compression modes for big-endian platforms
New : zstd cli : `-t` | `--test` command

v0.5.0
New : dictionary builder utility
Changed : streaming & dictionary API
Expand Down
80 changes: 65 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ As a reference, several fast compression algorithms were tested and compared to
|Name | Ratio | C.speed | D.speed |
|-----------------|-------|--------:|--------:|
| | | MB/s | MB/s |
|**zstd 0.4.7 -1**|**2.875**|**330**| **890** |
|**zstd 0.5.1 -1**|**2.876**|**330**| **890** |
| [zlib] 1.2.8 -1 | 2.730 | 95 | 360 |
| brotli -0 | 2.708 | 220 | 430 |
| QuickLZ 1.5 | 2.237 | 510 | 605 |
Expand All @@ -35,38 +35,88 @@ The following test is run on a Core i7-3930K CPU @ 4.5GHz, using [lzbench], an o

Compression Speed vs Ratio | Decompression Speed
---------------------------|--------------------
![Compression Speed vs Ratio](images/CSpeed.png "Compression Speed vs Ratio") | ![Decompression Speed](images/DSpeed.png "Decompression Speed")
![Compression Speed vs Ratio](images/Cspeed4.png "Compression Speed vs Ratio") | ![Decompression Speed](images/Dspeed4.png "Decompression Speed")

Several algorithms can produce higher compression ratio at slower speed, falling outside of the graph.
For a larger picture including very slow modes, [click on this link](images/DCspeed5.png).


### The case for Small Data compression

The above chart is applicable to large files or large streams scenarios (200 MB in this case).
The above chart provides results applicable to large-file or large-stream scenarios (200 MB in this case).
Small data (< 64 KB) come with different perspectives.
The smaller the amount of data to compress, the more difficult it is to achieve any significant compression.
On reaching the 1 KB region, it becomes almost impossible to compress anything.
This problem is common to all compression algorithms, and throwing CPU power at it achieves no significant gains.
This problem is common to all compression algorithms, and throwing CPU power at it achieves little gain.

The reason is, compression algorithms learn from past data how to compress future data.
But at the beginning of a new file, there is no "past" to build upon.

[Starting with 0.5](https://github.com/Cyan4973/zstd/releases), Zstd now offers [a _Dictionary Builder_ tool](https://github.com/Cyan4973/zstd/tree/master/dictBuilder).
It can be used to train the algorithm to fit a selected type of data, by providing it with some samples.
The result is a file (or a byte buffer) called "dictionary", which can be loaded before compression and decompression.
By using this dictionary, the compression ratio achievable on small data improves dramatically :
To solve this situation, Zstd now offers a __training mode__,
which can be used to make the algorithm fit a selected type of data, by providing it with some samples.
The result of the training is a file called "dictionary", which can be loaded before compression and decompression.
Using this dictionary, the compression ratio achievable on small data improves dramatically :

| Collection Name | Direct compression | Dictionary Compression | Gains | Average unit | Range |
| --------------- | ------------------ | ---------------------- | ----- | ------------:| ----- |
| Small JSON records | x1.331 - x1.366 | x5.860 - x6.830 | ~ x4.7 | 300 | 200 - 400 |
| Mercurial events | x2.322 - x2.538 | x3.377 - x4.462 | ~ x1.5 | 1.5 KB | 20 - 200 KB |
| Large JSON docs | x3.813 - x4.043 | x8.935 - x13.366 | ~ x2.8 | 6 KB | 800 - 20 KB |
| Collection Name | Direct compression | Dictionary Compression | Gains | Average unit | Range |
| --------------- | ------------------ | ---------------------- | --------- | ------------:| ----- |
| Small JSON records | x1.331 - x1.366 | x5.860 - x6.830 | ~ __x4.7__ | 300 | 200 - 400 |
| Mercurial events | x2.322 - x2.538 | x3.377 - x4.462 | ~ __x1.5__ | 1.5 KB | 20 - 200 KB |
| Large JSON docs | x3.813 - x4.043 | x8.935 - x13.366 | ~ __x2.8__ | 6 KB | 800 - 20 KB |

It has to be noted that these compression gains are achieved without any speed loss, and even some faster decompression processing.
These compression gains come without any speed loss; in general, compressing and decompressing with a dictionary even proves a bit faster.

Dictionaries work if there is some correlation within a family of small data (there is no _universal dictionary_).
Hence, deploying one dictionary per type of data will provide the greatest benefits.

Large documents will benefit proportionally less, since dictionary gains are mostly effective in the first few KB.
Then there is enough history to build upon, and the compression algorithm can rely on it to compress the rest of the file.
Then, the compression algorithm will rely more and more on already decoded content to compress the rest of the file.

#### Dictionary compression How To :

##### _Using the Command Line Utility_ :

1) Create the dictionary

`zstd --train FullPathToTrainingSet/* -o dictionaryName`

2) Compression with dictionary

`zstd FILE -D dictionaryName`

3) Decompress with dictionary

`zstd --decompress FILE.zst -D dictionaryName`

##### _Using API_ :

1) Create dictionary

```
#include "zdict.h"
(...)
/* Train a dictionary from a memory buffer `samplesBuffer`,
where `nbSamples` samples have been stored concatenated. */
size_t dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
samplesBuffer, samplesSizes, nbSamples);
```

2) Compression with dictionary

```
#include "zstd.h"
(...)
ZSTD_CCtx* context = ZSTD_createCCtx();
size_t compressedSize = ZSTD_compress_usingDict(context, dst, dstCapacity, src, srcSize, dict, dictSize, compressionLevel);
```

3) Decompress with dictionary

```
#include "zstd.h"
(...)
ZSTD_DCtx* context = ZSTD_createDCtx();
size_t regeneratedSize = ZSTD_decompress_usingDict(context, dst, dstCapacity, cSrc, cSrcSize, dict, dictSize);
```


### Status
Expand Down
2 changes: 1 addition & 1 deletion contrib/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ################################################################
# zstd - Makefile
# Copyright (C) Yann Collet 2014-2015
# Copyright (C) Yann Collet 2014-2016
# All rights reserved.
#
# BSD license
Expand Down
23 changes: 14 additions & 9 deletions contrib/cmake/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ################################################################
# zstd - Makefile
# Copyright (C) Yann Collet 2014-2015
# Copyright (C) Yann Collet 2014-2016
# All rights reserved.
#
# BSD license
Expand All @@ -27,8 +27,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You can contact the author at :
# - zstd source repository : https://github.com/Cyan4973/zstd
# - Public forum : https://groups.google.com/forum/#!forum/lz4c
# - zstd homepage : http://www.zstd.net/
# ################################################################

# Get library version based on information from input content (use regular exp)
Expand Down Expand Up @@ -58,9 +57,11 @@ GetLibraryVersion("${HEADER_CONTENT}" LIBVER_MAJOR LIBVER_MINOR LIBVER_RELEASE)
MESSAGE("ZSTD VERSION ${LIBVER_MAJOR}.${LIBVER_MINOR}.${LIBVER_RELEASE}")

SET(Sources
${LIBRARY_DIR}/divsufsort.c
${LIBRARY_DIR}/fse.c
${LIBRARY_DIR}/huff0.c
${LIBRARY_DIR}/zstd_buffered.c
${LIBRARY_DIR}/zbuff.c
${LIBRARY_DIR}/zdict.c
${LIBRARY_DIR}/zstd_compress.c
${LIBRARY_DIR}/zstd_decompress.c)

Expand All @@ -73,8 +74,10 @@ SET(Headers
${LIBRARY_DIR}/huff0.h
${LIBRARY_DIR}/huff0_static.h
${LIBRARY_DIR}/mem.h
${LIBRARY_DIR}/zstd_buffered_static.h
${LIBRARY_DIR}/zstd_buffered.h
${LIBRARY_DIR}/zbuff.h
${LIBRARY_DIR}/zbuff_static.h
${LIBRARY_DIR}/zdict.h
${LIBRARY_DIR}/zdict_static.h
${LIBRARY_DIR}/zstd_internal.h
${LIBRARY_DIR}/zstd_static.h
${LIBRARY_DIR}/zstd.h)
Expand All @@ -86,13 +89,15 @@ IF (ZSTD_LEGACY_SUPPORT)
SET(Sources ${Sources}
${LIBRARY_LEGACY_DIR}/zstd_v01.c
${LIBRARY_LEGACY_DIR}/zstd_v02.c
${LIBRARY_LEGACY_DIR}/zstd_v03.c)
${LIBRARY_LEGACY_DIR}/zstd_v03.c
${LIBRARY_LEGACY_DIR}/zstd_v04.c)

SET(Headers ${Headers}
${LIBRARY_LEGACY_DIR}/zstd_legacy.h
${LIBRARY_LEGACY_DIR}/zstd_v01.h
${LIBRARY_LEGACY_DIR}/zstd_v02.h
${LIBRARY_LEGACY_DIR}/zstd_v03.h)
${LIBRARY_LEGACY_DIR}/zstd_v03.h
${LIBRARY_LEGACY_DIR}/zstd_v04.h)
ENDIF (ZSTD_LEGACY_SUPPORT)

IF (MSVC)
Expand Down Expand Up @@ -161,7 +166,7 @@ IF (UNIX)
SET(INSTALL_INCLUDE_DIR ${PREFIX}/include)

# install target
INSTALL(FILES ${LIBRARY_DIR}/zstd.h DESTINATION ${INSTALL_INCLUDE_DIR})
# Install the public headers into ${INSTALL_INCLUDE_DIR}.
# The buffered-streaming and dictionary-builder APIs are shipped as zbuff.h and
# zdict.h (the names used in the Headers list of this file); the former
# zstd_buffered.h / dictBuilder.h names do not exist in ${LIBRARY_DIR}, so
# referencing them makes the install step fail with "file not found".
INSTALL(FILES ${LIBRARY_DIR}/zstd.h ${LIBRARY_DIR}/zbuff.h ${LIBRARY_DIR}/zdict.h DESTINATION ${INSTALL_INCLUDE_DIR})
INSTALL(TARGETS libzstd_static DESTINATION ${INSTALL_LIBRARY_DIR})
INSTALL(TARGETS libzstd_shared LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR})

Expand Down
13 changes: 6 additions & 7 deletions contrib/cmake/programs/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ################################################################
# zstd - Makefile
# Copyright (C) Yann Collet 2014-2015
# Copyright (C) Yann Collet 2014-2016
# All rights reserved.
#
# BSD license
Expand All @@ -27,8 +27,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You can contact the author at :
# - zstd source repository : https://github.com/Cyan4973/zstd
# - Public forum : https://groups.google.com/forum/#!forum/lz4c
# - zstd homepage : http://www.zstd.net/
# ################################################################

PROJECT(programs)
Expand Down Expand Up @@ -59,7 +58,7 @@ IF (ZSTD_LEGACY_SUPPORT)
SET(ZSTD_FILEIO_LEGACY ${PROGRAMS_LEGACY_DIR}/fileio_legacy.c)
ENDIF (ZSTD_LEGACY_SUPPORT)

ADD_EXECUTABLE(zstd ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/bench.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/datagen.c ${ZSTD_FILEIO_LEGACY})
ADD_EXECUTABLE(zstd ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/bench.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/dibio.c ${ZSTD_FILEIO_LEGACY})
TARGET_LINK_LIBRARIES(zstd libzstd_static)

ADD_EXECUTABLE(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/fullbench.c)
Expand All @@ -69,9 +68,9 @@ ADD_EXECUTABLE(fuzzer ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/xxhash.c ${PROGR
TARGET_LINK_LIBRARIES(fuzzer libzstd_static)

IF (UNIX)
ADD_EXECUTABLE(zstd-noBench ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${ZSTD_FILEIO_LEGACY})
TARGET_LINK_LIBRARIES(zstd-noBench libzstd_static)
SET_TARGET_PROPERTIES(zstd-noBench PROPERTIES COMPILE_DEFINITIONS "ZSTD_NOBENCH")
ADD_EXECUTABLE(zstd-frugal ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c)
TARGET_LINK_LIBRARIES(zstd-frugal libzstd_static)
SET_TARGET_PROPERTIES(zstd-frugal PROPERTIES COMPILE_DEFINITIONS "ZSTD_NOBENCH;ZSTD_NODICT")

ADD_EXECUTABLE(zbufftest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/zbufftest.c)
TARGET_LINK_LIBRARIES(zbufftest libzstd_static)
Expand Down
Loading

0 comments on commit c580b75

Please sign in to comment.