Code Examples
package main
import (
"bytes"
"compress/flate"
"fmt"
"io"
"log"
"os"
"strings"
)
func main() {
// The dictionary is a string of bytes. When compressing some input data,
// the compressor will attempt to substitute substrings with matches found
// in the dictionary. As such, the dictionary should only contain substrings
// that are expected to be found in the actual data stream.
const dict = `<?xml version="1.0"?>` + `<book>` + `<data>` + `<meta name="` + `" content="`
// The data to compress should (but is not required to) contain frequent
// substrings that match those in the dictionary.
const data = `<?xml version="1.0"?>
<book>
	<meta name="title" content="The Go Programming Language"/>
	<meta name="authors" content="Alan Donovan and Brian Kernighan"/>
	<meta name="published" content="2015-10-26"/>
	<meta name="isbn" content="978-0134190440"/>
	<data>...</data>
</book>`
var b bytes.Buffer
// Compress the data using the specially crafted dictionary.
zw, err := flate.NewWriterDict(&b, flate.DefaultCompression, []byte(dict))
if err != nil {
log.Fatal(err)
}
if _, err := io.Copy(zw, strings.NewReader(data)); err != nil {
log.Fatal(err)
}
if err := zw.Close(); err != nil {
log.Fatal(err)
}
// The decompressor must use the same dictionary as the compressor.
// Otherwise, the input may appear as corrupted.
fmt.Println("Decompressed output using the dictionary:")
zr := flate.NewReaderDict(bytes.NewReader(b.Bytes()), []byte(dict))
if _, err := io.Copy(os.Stdout, zr); err != nil {
log.Fatal(err)
}
if err := zr.Close(); err != nil {
log.Fatal(err)
}
fmt.Println()
// Substitute all of the bytes in the dictionary with a '#' to visually
// demonstrate the approximate effectiveness of using a preset dictionary.
fmt.Println("Substrings matched by the dictionary are marked with #:")
hashDict := []byte(dict)
for i := range hashDict {
hashDict[i] = '#'
}
zr = flate.NewReaderDict(&b, hashDict)
if _, err := io.Copy(os.Stdout, zr); err != nil {
log.Fatal(err)
}
if err := zr.Close(); err != nil {
log.Fatal(err)
}
}
package main
import (
"bytes"
"compress/flate"
"io"
"log"
"os"
"strings"
)
func main() {
proverbs := []string{
"Don't communicate by sharing memory, share memory by communicating.\n",
"Concurrency is not parallelism.\n",
"The bigger the interface, the weaker the abstraction.\n",
"Documentation is for users.\n",
}
var r strings.Reader
var b bytes.Buffer
buf := make([]byte, 32<<10)
zw, err := flate.NewWriter(nil, flate.DefaultCompression)
if err != nil {
log.Fatal(err)
}
zr := flate.NewReader(nil)
for _, s := range proverbs {
r.Reset(s)
b.Reset()
// Reset the compressor and encode from some input stream.
zw.Reset(&b)
if _, err := io.CopyBuffer(zw, &r, buf); err != nil {
log.Fatal(err)
}
if err := zw.Close(); err != nil {
log.Fatal(err)
}
// Reset the decompressor and decode to some output stream.
if err := zr.(flate.Resetter).Reset(&b, nil); err != nil {
log.Fatal(err)
}
if _, err := io.CopyBuffer(os.Stdout, zr, buf); err != nil {
log.Fatal(err)
}
if err := zr.Close(); err != nil {
log.Fatal(err)
}
}
}
package main
import (
"compress/flate"
"fmt"
"io"
"log"
"strings"
"sync"
)
func main() {
var wg sync.WaitGroup
defer wg.Wait()
// Use io.Pipe to simulate a network connection.
// A real network application should take care to properly close the
// underlying connection.
rp, wp := io.Pipe()
// Start a goroutine to act as the transmitter.
wg.Add(1)
go func() {
defer wg.Done()
zw, err := flate.NewWriter(wp, flate.BestSpeed)
if err != nil {
log.Fatal(err)
}
b := make([]byte, 256)
for _, m := range strings.Fields("A long time ago in a galaxy far, far away...") {
// We use a simple framing format where the first byte is the
// message length, followed by the message itself.
b[0] = uint8(copy(b[1:], m))
if _, err := zw.Write(b[:1+len(m)]); err != nil {
log.Fatal(err)
}
// Flush ensures that the receiver can read all data sent so far.
if err := zw.Flush(); err != nil {
log.Fatal(err)
}
}
if err := zw.Close(); err != nil {
log.Fatal(err)
}
}()
// Start a goroutine to act as the receiver.
wg.Add(1)
go func() {
defer wg.Done()
zr := flate.NewReader(rp)
b := make([]byte, 256)
for {
// Read the message length.
// This is guaranteed to return for every corresponding
// Flush and Close on the transmitter side.
if _, err := io.ReadFull(zr, b[:1]); err != nil {
if err == io.EOF {
break // The transmitter closed the stream
}
log.Fatal(err)
}
// Read the message content.
n := int(b[0])
if _, err := io.ReadFull(zr, b[:n]); err != nil {
log.Fatal(err)
}
fmt.Printf("Received %d bytes: %s\n", n, b[:n])
}
fmt.Println()
if err := zr.Close(); err != nil {
log.Fatal(err)
}
}()
}
Package-Level Type Names (total 23, in which 7 are exported)
A CorruptInputError reports the presence of corrupt input at a given offset.
( T) Error() string
T : error
An InternalError reports an error in the flate code itself.
( T) Error() string
T : error
A ReadError reports an error encountered while reading input.
Deprecated: No longer returned.
Err error // error returned by underlying Read
Offset int64 // byte offset where error occurred
(*T) Error() string
*T : error
Resetter resets a ReadCloser returned by NewReader or NewReaderDict
to switch to a new underlying Reader. This permits reusing a ReadCloser
instead of allocating a new one.
( T) Reset(r io.Reader, dict []byte) error
Reset discards any buffered data and resets the Resetter as if it was
newly initialized with the given reader.
*decompressor
A WriteError reports an error encountered while writing output.
Deprecated: No longer returned.
Err error // error returned by underlying Write
Offset int64 // byte offset where error occurred
(*T) Error() string
*T : error
A Writer takes data written to it and writes the compressed
form of that data to an underlying writer (see NewWriter).
d compressor
dict []byte
(*T) Close() error
Close flushes and closes the writer.
(*T) Flush() error
Flush flushes any pending data to the underlying writer.
It is useful mainly in compressed network protocols, to ensure that
a remote reader has enough data to reconstruct a packet.
Flush does not return until the data has been written.
Calling Flush when there is no pending data still causes the Writer
to emit a sync marker of at least 4 bytes.
If the underlying writer returns an error, Flush returns that error.
In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
(*T) Reset(dst io.Writer)
Reset discards the writer's state and makes it equivalent to
the result of NewWriter or NewWriterDict called with dst
and w's level and dictionary.
(*T) Write(data []byte) (n int, err error)
Write writes data to w, which will eventually write the
compressed form of data to its underlying writer.
*T : io.Closer
*T : io.WriteCloser
*T : io.Writer
func NewWriter(w io.Writer, level int) (*Writer, error)
func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error)
Decompress state.
b uint32 // Input bits, in top of b.
bits *[316]int // Length arrays used to define Huffman codes.
buf [4]byte // Temporary buffer (avoids repeated allocation).
codebits *[19]int
copyDist int
copyLen int
dict dictDecoder // Output history, buffer.
err error
final bool
h1 huffmanDecoder // Huffman decoders for literal/length, distance.
h2 huffmanDecoder // Huffman decoders for literal/length, distance.
hd *huffmanDecoder
hl *huffmanDecoder
nb uint
r io.Reader // Input source.
roffset int64
step func(*decompressor) // Next step in the decompression, and decompression state.
stepState int
toRead []byte
(*T) Close() error
(*T) Read(b []byte) (int, error)
(*T) Reset(r io.Reader, dict []byte) error
(*T) copyData()
copyData copies f.copyLen bytes from the underlying reader into f.hist.
It pauses for reads when f.hist is full.
(*T) dataBlock()
Copy a single uncompressed data block from input to output.
(*T) finishBlock()
(*T) huffSym(h *huffmanDecoder) (int, error)
Read the next Huffman-encoded symbol from f according to h.
(*T) huffmanBlock()
Decode a single Huffman block from f.
hl and hd are the Huffman states for the lit/length values
and the distance values, respectively. If hd == nil, the fixed distance
encoding associated with fixed Huffman blocks is used.
(*T) moreBits() error
(*T) nextBlock()
(*T) readHuffman() error
*T : Resetter
*T : io.Closer
*T : io.ReadCloser
*T : io.Reader
deflateFast maintains the table for matches,
and the previous byte block for cross block matching.
cur int32 // Current match offset.
prev []byte // Previous block, zero length if unknown.
table [16384]tableEntry
(*T) encode(dst []token, src []byte) []token
encode encodes a block given in src and appends tokens
to dst and returns the result.
(*T) matchLen(s, t int32, src []byte) int32
matchLen returns the match length between src[s:] and src[t:].
t can be negative to indicate the match is starting in e.prev.
We assume that src[s-4:s] and src[t-4:t] already match.
(*T) reset()
reset resets the encoding history.
This ensures that no matches are made to the previous block.
(*T) shiftOffsets()
shiftOffsets will shift down all match offsets.
This is only called in rare situations to prevent integer overflow.
See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121.
func newDeflateFast() *deflateFast
dictDecoder implements the LZ77 sliding dictionary as used in decompression.
LZ77 decompresses data through sequences of two forms of commands:
* Literal insertions: Runs of one or more symbols are inserted into the data
stream as is. This is accomplished through the writeByte method for a
single symbol, or combinations of writeSlice/writeMark for multiple symbols.
Any valid stream must start with a literal insertion if no preset dictionary
is used.
* Backward copies: Runs of one or more symbols are copied from previously
emitted data. Backward copies come as the tuple (dist, length) where dist
determines how far back in the stream to copy from and length determines how
many bytes to copy. Note that it is valid for the length to be greater than
the distance. Since LZ77 uses forward copies, that situation is used to
perform a form of run-length encoding on repeated runs of symbols.
The writeCopy and tryWriteCopy methods are used to implement this command.
For performance reasons, this implementation performs little to no sanity
checks about the arguments. As such, the invariants documented for each
method call must be respected.
full bool // Has a full window length been written yet?
hist []byte // Sliding window history
rdPos int // Have emitted hist[:rdPos] already
// Invariant: 0 <= rdPos <= wrPos <= len(hist)
wrPos int // Current output position in buffer
(*T) availRead() int
availRead reports the number of bytes that can be flushed by readFlush.
(*T) availWrite() int
availWrite reports the available amount of output buffer space.
(*T) histSize() int
histSize reports the total amount of historical data in the dictionary.
(*T) init(size int, dict []byte)
init initializes dictDecoder to have a sliding window dictionary of the given
size. If a preset dict is provided, it will initialize the dictionary with
the contents of dict.
(*T) readFlush() []byte
readFlush returns a slice of the historical buffer that is ready to be
emitted to the user. The data returned by readFlush must be fully consumed
before calling any other dictDecoder methods.
(*T) tryWriteCopy(dist, length int) int
tryWriteCopy tries to copy a string at a given (distance, length) to the
output. This specialized version is optimized for short distances.
This method is designed to be inlined for performance reasons.
This invariant must be kept: 0 < dist <= histSize()
(*T) writeByte(c byte)
writeByte writes a single byte to the dictionary.
This invariant must be kept: 0 < availWrite()
(*T) writeCopy(dist, length int) int
writeCopy copies a string at a given (dist, length) to the output.
This returns the number of bytes copied and may be less than the requested
length if the available space in the output buffer is too small.
This invariant must be kept: 0 < dist <= histSize()
(*T) writeMark(cnt int)
writeMark advances the writer pointer by cnt.
This invariant must be kept: 0 <= cnt <= availWrite()
(*T) writeSlice() []byte
writeSlice returns a slice of the available buffer to write data to.
This invariant will be kept: len(s) <= availWrite()
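To make the backward-copy command concrete, here is a minimal runnable sketch (a hypothetical helper, not this package's dictDecoder code) showing how a copy whose length exceeds its distance performs run-length expansion when bytes are copied forward one at a time:

package main

import "fmt"

// expandCopy appends the bytes selected by an LZ77 backward copy of
// (dist, length) to hist. Copying forward one byte at a time makes the
// length > dist case well defined: freshly written bytes are reused.
func expandCopy(hist []byte, dist, length int) []byte {
	start := len(hist) - dist
	for i := 0; i < length; i++ {
		hist = append(hist, hist[start+i])
	}
	return hist
}

func main() {
	// A copy with dist=1, length=5 repeats the last byte five times.
	fmt.Println(string(expandCopy([]byte("ab"), 1, 5))) // abbbbbb
}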
hcode is a huffman code with a bit code and bit length.
code uint16
len uint16
(*T) set(code uint16, length uint16)
set sets the code and length of an hcode.
Data waiting to be written is bytes[0:nbytes]
and then the low nbits of bits. Data is always written
sequentially into the bytes array.
bits uint64
bytes [248]byte
codegen []uint8
codegenEncoding *huffmanEncoder
codegenFreq [19]int32
err error
literalEncoding *huffmanEncoder
literalFreq []int32
nbits uint
nbytes int
offsetEncoding *huffmanEncoder
offsetFreq []int32
writer io.Writer
writer is the underlying writer.
Do not use it directly; use the write method, which ensures
that Write errors are sticky.
(*T) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int)
dynamicSize returns the size of dynamically encoded data in bits.
(*T) fixedSize(extraBits int) int
fixedSize returns the size of the data in bits when encoded with the
fixed Huffman codes.
(*T) flush()
(*T) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder)
RFC 1951 3.2.7 specifies a special run-length encoding for specifying
the literal and offset lengths arrays (which are concatenated into a single
array). This method generates that run-length encoding.
The result is written into the codegen array, and the frequencies
of each code are written into the codegenFreq array.
Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
information. Code badCode is an end marker.
numLiterals The number of literals in literalEncoding
numOffsets The number of offsets in offsetEncoding
litEnc, offEnc The literal and offset encoder to use
(*T) indexTokens(tokens []token) (numLiterals, numOffsets int)
indexTokens indexes a slice of tokens, updates
literalFreq and offsetFreq, and generates literalEncoding
and offsetEncoding.
The number of literal and offset tokens is returned.
(*T) reset(writer io.Writer)
(*T) storedSize(in []byte) (int, bool)
storedSize calculates the stored size, including header.
The function returns the size in bits and whether the block
fits inside a single block.
(*T) write(b []byte)
(*T) writeBits(b int32, nb uint)
(*T) writeBlock(tokens []token, eof bool, input []byte)
writeBlock will write a block of tokens with the smallest encoding.
The original input can be supplied, and if the Huffman encoded data
is larger than the original bytes, the data will be written as a
stored block.
If the input is nil, the tokens will always be Huffman encoded.
(*T) writeBlockDynamic(tokens []token, eof bool, input []byte)
writeBlockDynamic encodes a block using a dynamic Huffman table.
This should be used if the symbols used have a disproportionate
histogram distribution.
If input is supplied and the compression savings are below 1/16th of the
input size, the block is stored.
(*T) writeBlockHuff(eof bool, input []byte)
writeBlockHuff encodes a block of bytes as either
Huffman encoded literals or uncompressed bytes if the
result gains very little from compression.
(*T) writeBytes(bytes []byte)
(*T) writeCode(c hcode)
(*T) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool)
Write the header of a dynamic Huffman block to the output stream.
numLiterals The number of literals specified in codegen
numOffsets The number of offsets specified in codegen
numCodegens The number of codegens used in codegen
(*T) writeFixedHeader(isEof bool)
(*T) writeStoredHeader(length int, isEof bool)
(*T) writeTokens(tokens []token, leCodes, oeCodes []hcode)
writeTokens writes a slice of tokens to the output.
Codes for literal and offset encoding must be supplied.
func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter
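To illustrate the run-length encoding of code lengths described above for generateCodegen, here is a simplified runnable sketch (a hypothetical helper; the real method also emits code 16 for repeats of a previous non-zero length, updates codegenFreq, and terminates the array with badCode):

package main

import "fmt"

// rleCodeLengths run-length encodes code lengths with a subset of the
// RFC 1951 3.2.7 codes: 0-15 stand for themselves, 17 repeats zero
// 3-10 times, and 18 repeats zero 11-138 times. Each 17 or 18 is
// followed here by the value its extra bits would carry.
func rleCodeLengths(lengths []int) []int {
	var out []int
	for i := 0; i < len(lengths); {
		run := 1
		for i+run < len(lengths) && lengths[i+run] == lengths[i] {
			run++
		}
		switch {
		case lengths[i] == 0 && run >= 11:
			if run > 138 {
				run = 138
			}
			out = append(out, 18, run-11) // code 18 + 7 extra bits
		case lengths[i] == 0 && run >= 3:
			out = append(out, 17, run-3) // code 17 + 3 extra bits
		default:
			run = 1
			out = append(out, lengths[i]) // literal code length 0-15
		}
		i += run
	}
	return out
}

func main() {
	// Twelve zeros collapse into a single (18, 1) pair: [5 18 1 4].
	fmt.Println(rleCodeLengths([]int{5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4}))
}

The codegen sequence produced by the real method is then itself Huffman coded with the 19-symbol meta-code, whose frequencies are tracked in codegenFreq.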
chunks [512]uint32 // chunks as described above
linkMask uint32 // mask the width of the link table
links [][]uint32 // overflow links
min int // the minimum code length
(*T) init(lengths []int) bool
Initialize Huffman decoding tables from array of code lengths.
Following this function, h is guaranteed to be initialized into a complete
tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
degenerate case where the tree has only a single symbol with length 1. Empty
trees are permitted.
var fixedHuffmanDecoder huffmanDecoder
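The completeness property that init enforces corresponds to a Kraft sum of exactly one over the non-zero code lengths. Here is a minimal sketch of that check (a hypothetical helper, not the package's code), using DEFLATE's maximum code length of 15:

package main

import "fmt"

// isComplete reports whether code lengths form a complete prefix code:
// the Kraft sum of all non-zero lengths, in units of 1/2^15, equals 1.
func isComplete(lengths []int) bool {
	sum := 0
	for _, n := range lengths {
		if n > 0 {
			sum += 1 << (15 - n)
		}
	}
	return sum == 1<<15
}

func main() {
	fmt.Println(isComplete([]int{1, 2, 2})) // true: 1/2 + 1/4 + 1/4 == 1
	fmt.Println(isComplete([]int{1}))       // false: degenerate single-symbol case
}

Note that the degenerate single-symbol tree mentioned above fails this strict check, which is why init must treat it as a special case.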
bitCount [17]int32
codes []hcode
freqcache []literalNode
lfs byFreq // stored to avoid repeated allocation in generate
lns byLiteral // stored to avoid repeated allocation in generate
(*T) assignEncodingAndSize(bitCount []int32, list []literalNode)
Look at the leaves and assign them a bit count and an encoding as specified
in RFC 1951 3.2.2
(*T) bitCounts(list []literalNode, maxBits int32) []int32
Return the number of literals assigned to each bit size in the Huffman encoding.
This method is only called when list.length >= 3.
The cases of 0, 1, and 2 literals are handled by special case code.
list An array of the literals with non-zero frequencies
and their associated frequencies. The array is in order of increasing
frequency, and has as its last element a special element with frequency
MaxInt32
maxBits The maximum number of bits that should be used to encode any literal.
Must be less than 16.
return An integer array in which array[i] indicates the number of literals
that should be encoded in i bits.
(*T) bitLength(freq []int32) int
(*T) generate(freq []int32, maxBits int32)
Update this Huffman Code object to be the minimum code for the specified frequency count.
freq An array of frequencies, in which frequency[i] gives the frequency of literal i.
maxBits The maximum number of bits to use for any literal.
func generateFixedLiteralEncoding() *huffmanEncoder
func generateFixedOffsetEncoding() *huffmanEncoder
func newHuffmanEncoder(size int) *huffmanEncoder
var fixedLiteralEncoding *huffmanEncoder
var fixedOffsetEncoding *huffmanEncoder
var huffOffset *huffmanEncoder
A levelInfo describes the state of the constructed tree for a given depth.
lastFreq int32 // The frequency of the last node at this level
level int32 // Our level. For better printing
needed int32 // The number of chains remaining to generate for this level before moving up to the next level
nextCharFreq int32 // The frequency of the next character to add to this level
nextPairFreq int32 // The frequency of the next pair (from level below) to add to this level. Only valid if the "needed" value of the next lower level is 0.
Package-Level Functions (total 27, in which 4 are exported)
NewReader returns a new ReadCloser that can be used
to read the uncompressed version of r.
If r does not also implement io.ByteReader,
the decompressor may read more data than necessary from r.
It is the caller's responsibility to call Close on the ReadCloser
when finished reading.
The ReadCloser returned by NewReader also implements Resetter.
NewReaderDict is like NewReader but initializes the reader
with a preset dictionary. The returned Reader behaves as if
the uncompressed data stream started with the given dictionary,
which has already been read. NewReaderDict is typically used
to read data compressed by NewWriterDict.
The ReadCloser returned by NewReaderDict also implements Resetter.
NewWriter returns a new Writer compressing data at the given level.
Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
higher levels typically run slower but compress more. Level 0
(NoCompression) does not attempt any compression; it only adds the
necessary DEFLATE framing.
Level -1 (DefaultCompression) uses the default compression level.
Level -2 (HuffmanOnly) will use Huffman compression only, giving
a very fast compression for all types of input, but sacrificing considerable
compression efficiency.
If level is in the range [-2, 9] then the error returned will be nil.
Otherwise the error returned will be non-nil.
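For example, a minimal sketch of the level check, using io.Discard as a stand-in destination:

package main

import (
	"compress/flate"
	"fmt"
	"io"
)

func main() {
	// Levels in [-2, 9] construct a Writer with a nil error.
	zw, err := flate.NewWriter(io.Discard, flate.BestCompression)
	if err != nil {
		panic(err)
	}
	zw.Close()

	// Any other level returns a non-nil error.
	if _, err := flate.NewWriter(io.Discard, 42); err != nil {
		fmt.Println(err)
	}
}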
NewWriterDict is like NewWriter but initializes the new
Writer with a preset dictionary. The returned Writer behaves
as if the dictionary had been written to it without producing
any compressed output. The compressed data written to w
can only be decompressed by a Reader initialized with the
same dictionary.
bulkHash4 will compute hashes using the same
algorithm as hash4.
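In outline, the shared algorithm looks like the following sketch; the multiplier constant and byte-packing order are assumptions recalled from the package source, not guaranteed by its API:

package sketch

const hashBits = 17 // matches the constant documented below; assumed

// hash4 packs four bytes big-endian into a uint32, multiplies by a
// fixed odd constant, and keeps the top hashBits bits. bulkHash4 is
// assumed to apply the same function at every position of a slice.
func hash4(b []byte) uint32 {
	u := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
	return (u * 0x1e35a7bd) >> (32 - hashBits)
}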
Package-Level Constants
HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
entropy encoding. This mode is useful in compressing data that has
already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
that lacks an entropy encoder. Compression gains are achieved when
certain bytes in the input stream occur more frequently than others.
Note that HuffmanOnly produces a compressed output that is
RFC 1951 compliant. That is, any valid DEFLATE decompressor will
continue to be able to decompress this output.
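A minimal usage sketch for this mode (the payload string is hypothetical):

package main

import (
	"bytes"
	"compress/flate"
	"log"
)

func main() {
	var b bytes.Buffer
	// HuffmanOnly skips match searching entirely; the output is still
	// valid DEFLATE and can be read back with flate.NewReader.
	zw, err := flate.NewWriter(&b, flate.HuffmanOnly)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := zw.Write([]byte("already LZ-compressed payload")); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
}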
The LZ77 step produces a sequence of literal tokens and <length, offset>
pair tokens. The offset is also known as distance. The underlying wire
format limits the range of lengths and offsets. For example, there are
256 legitimate lengths: those in the range [3, 258]. This package's
compressor uses a higher minimum match length, enabling optimizations
such as finding matches via 32-bit loads and compares.
const baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5
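The 32-bit load-and-compare optimization enabled by a 4-byte minimum match looks roughly like this sketch (a hypothetical helper, not the package's actual code):

package sketch

import "encoding/binary"

// match4 reports whether the 4 bytes at src[s:] equal the 4 bytes at
// src[t:], using two 32-bit loads instead of four byte comparisons.
func match4(src []byte, s, t int) bool {
	return binary.LittleEndian.Uint32(src[s:]) == binary.LittleEndian.Uint32(src[t:])
}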
bufferFlushSize indicates the buffer size
after which bytes are flushed to the writer.
Should preferably be a multiple of 6, since
we accumulate 6 bytes between writes to the buffer.
const bufferFlushSize = 240
Reset the buffer offset when reaching this.
Offsets are stored between blocks as int32 values.
Since the offset we are checking against is at the beginning
of the buffer, we need to subtract the current and input
buffer to not risk overflowing the int32.
const bufferReset = math.MaxInt32 - maxStoreBlockSize*2
bufferSize is the actual output byte buffer size.
It must have additional headroom for a flush
which can contain up to 8 bytes.
const bufferSize = 248
const codegenCodeCount = 19 // The number of codegen codes.
const endBlockMarker = 256 // The special code used to mark the end of a block.
const hashBits = 17 // After 17 performance degrades
These constants are defined by the Snappy implementation so that its
assembly implementation can fast-path some 16-bytes-at-a-time copies. They
aren't necessary in the pure Go implementation, as we don't use those same
optimizations, but using the same thresholds doesn't really hurt.
const inputMargin = 15
The next three numbers come from the RFC section 3.2.7, with the
additional proviso in section 3.2.5 which implies that distance codes
30 and 31 should never occur in compressed data.
const maxNumDist = 30
const maxNumLit = 286
const minMatchLength = 4 // The smallest match length that the compressor actually emits
These constants are defined by the Snappy implementation so that its
assembly implementation can fast-path some 16-bytes-at-a-time copies. They
aren't necessary in the pure Go implementation, as we don't use those same
optimizations, but using the same thresholds doesn't really hurt.
const minNonLiteralBlockSize = 17
const numCodes = 19 // number of codes in Huffman meta-code