Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
374 lines
10 KiB
374 lines
10 KiB
// Copyright 2015, Joe Tsai. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE.md file. |
|
|
|
package bzip2 |
|
|
|
import ( |
|
"io" |
|
|
|
"github.com/dsnet/compress/internal" |
|
"github.com/dsnet/compress/internal/errors" |
|
"github.com/dsnet/compress/internal/prefix" |
|
) |
|
|
|
// Bounds on the number of prefix trees.
const (
	minNumTrees = 2
	maxNumTrees = 6
)

// Limits on prefix codes and symbol grouping.
const (
	// maxPrefixBits is the maximum bit-width of a prefix code.
	maxPrefixBits = 20

	// maxNumSyms is the maximum number of symbols in the alphabet.
	maxNumSyms = 256 + 2

	// numBlockSyms is the number of bytes in a block.
	// NOTE(review): the name suggests this counts symbols rather than
	// bytes; confirm against the users of this constant.
	numBlockSyms = 50
)
|
|
|
// encSel and decSel are used to handle the prefix encoding for tree selectors. |
|
// The prefix encoding is as follows: |
|
// |
|
// Code TreeIdx |
|
// 0 <=> 0 |
|
// 10 <=> 1 |
|
// 110 <=> 2 |
|
// 1110 <=> 3 |
|
// 11110 <=> 4 |
|
// 111110 <=> 5 |
|
// 111111 <=> 6 Invalid tree index, so should fail |
|
// |
|
var encSel, decSel = func() (e prefix.Encoder, d prefix.Decoder) { |
|
var selCodes [maxNumTrees + 1]prefix.PrefixCode |
|
for i := range selCodes { |
|
selCodes[i] = prefix.PrefixCode{Sym: uint32(i), Len: uint32(i + 1)} |
|
} |
|
selCodes[maxNumTrees] = prefix.PrefixCode{Sym: maxNumTrees, Len: maxNumTrees} |
|
prefix.GeneratePrefixes(selCodes[:]) |
|
e.Init(selCodes[:]) |
|
d.Init(selCodes[:]) |
|
return |
|
}() |
|
|
|
type prefixReader struct{ prefix.Reader } |
|
|
|
func (pr *prefixReader) Init(r io.Reader) { |
|
pr.Reader.Init(r, true) |
|
} |
|
|
|
func (pr *prefixReader) ReadBitsBE64(nb uint) uint64 { |
|
if nb <= 32 { |
|
v := uint32(pr.ReadBits(nb)) |
|
return uint64(internal.ReverseUint32N(v, nb)) |
|
} |
|
v0 := internal.ReverseUint32(uint32(pr.ReadBits(32))) |
|
v1 := internal.ReverseUint32(uint32(pr.ReadBits(nb - 32))) |
|
v := uint64(v0)<<32 | uint64(v1) |
|
return v >> (64 - nb) |
|
} |
|
|
|
func (pr *prefixReader) ReadPrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Decoder) { |
|
for i, pc := range codes { |
|
clen := int(pr.ReadBitsBE64(5)) |
|
sum := 1 << maxPrefixBits |
|
for sym := range pc { |
|
for { |
|
if clen < 1 || clen > maxPrefixBits { |
|
panicf(errors.Corrupted, "invalid prefix bit-length: %d", clen) |
|
} |
|
|
|
b, ok := pr.TryReadBits(1) |
|
if !ok { |
|
b = pr.ReadBits(1) |
|
} |
|
if b == 0 { |
|
break |
|
} |
|
|
|
b, ok = pr.TryReadBits(1) |
|
if !ok { |
|
b = pr.ReadBits(1) |
|
} |
|
clen -= int(b*2) - 1 // +1 or -1 |
|
} |
|
pc[sym] = prefix.PrefixCode{Sym: uint32(sym), Len: uint32(clen)} |
|
sum -= (1 << maxPrefixBits) >> uint(clen) |
|
} |
|
|
|
if sum == 0 { |
|
// Fast path, but only handles complete trees. |
|
if err := prefix.GeneratePrefixes(pc); err != nil { |
|
errors.Panic(err) // Using complete trees; should never fail |
|
} |
|
} else { |
|
// Slow path, but handles anything. |
|
pc = handleDegenerateCodes(pc) // Never fails, but may fail later |
|
codes[i] = pc |
|
} |
|
trees[i].Init(pc) |
|
} |
|
} |
|
|
|
type prefixWriter struct{ prefix.Writer } |
|
|
|
func (pw *prefixWriter) Init(w io.Writer) { |
|
pw.Writer.Init(w, true) |
|
} |
|
|
|
func (pw *prefixWriter) WriteBitsBE64(v uint64, nb uint) { |
|
if nb <= 32 { |
|
v := internal.ReverseUint32N(uint32(v), nb) |
|
pw.WriteBits(uint(v), nb) |
|
return |
|
} |
|
v <<= (64 - nb) |
|
v0 := internal.ReverseUint32(uint32(v >> 32)) |
|
v1 := internal.ReverseUint32(uint32(v)) |
|
pw.WriteBits(uint(v0), 32) |
|
pw.WriteBits(uint(v1), nb-32) |
|
return |
|
} |
|
|
|
func (pw *prefixWriter) WritePrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Encoder) { |
|
for i, pc := range codes { |
|
if err := prefix.GeneratePrefixes(pc); err != nil { |
|
errors.Panic(err) // Using complete trees; should never fail |
|
} |
|
trees[i].Init(pc) |
|
|
|
clen := int(pc[0].Len) |
|
pw.WriteBitsBE64(uint64(clen), 5) |
|
for _, c := range pc { |
|
for int(c.Len) < clen { |
|
pw.WriteBits(3, 2) // 11 |
|
clen-- |
|
} |
|
for int(c.Len) > clen { |
|
pw.WriteBits(1, 2) // 10 |
|
clen++ |
|
} |
|
pw.WriteBits(0, 1) |
|
} |
|
} |
|
} |
|
|
|
// handleDegenerateCodes converts a degenerate tree into a canonical tree. |
|
// |
|
// For example, when the input is an under-subscribed tree: |
|
// input: []PrefixCode{ |
|
// {Sym: 0, Len: 3}, |
|
// {Sym: 1, Len: 4}, |
|
// {Sym: 2, Len: 3}, |
|
// } |
|
// output: []PrefixCode{ |
|
// {Sym: 0, Len: 3, Val: 0}, // 000 |
|
// {Sym: 1, Len: 4, Val: 2}, // 0010 |
|
// {Sym: 2, Len: 3, Val: 4}, // 100 |
|
// {Sym: 258, Len: 4, Val: 10}, // 1010 |
|
// {Sym: 259, Len: 3, Val: 6}, // 110 |
|
// {Sym: 260, Len: 1, Val: 1}, // 1 |
|
// } |
|
// |
|
// For example, when the input is an over-subscribed tree: |
|
// input: []PrefixCode{ |
|
// {Sym: 0, Len: 1}, |
|
// {Sym: 1, Len: 3}, |
|
// {Sym: 2, Len: 4}, |
|
// {Sym: 3, Len: 3}, |
|
// {Sym: 4, Len: 2}, |
|
// } |
|
// output: []PrefixCode{ |
|
// {Sym: 0, Len: 1, Val: 0}, // 0 |
|
// {Sym: 1, Len: 3, Val: 3}, // 011 |
|
// {Sym: 3, Len: 3, Val: 7}, // 111 |
|
// {Sym: 4, Len: 2, Val: 1}, // 01 |
|
// } |
|
func handleDegenerateCodes(codes prefix.PrefixCodes) prefix.PrefixCodes { |
|
// Since there is no formal definition for the BZip2 format, there is no |
|
// specification that says that the code lengths must form a complete |
|
// prefix tree (IE: it is neither over-subscribed nor under-subscribed). |
|
// Thus, the original C implementation becomes the reference for how prefix |
|
// decoding is done in these edge cases. Unfortunately, the C version does |
|
// not error when an invalid tree is used, but rather allows decoding to |
|
// continue and only errors if some bit pattern happens to cause an error. |
|
// Thus, it is possible for an invalid tree to end up decoding an input |
|
// "properly" so long as invalid bit patterns are not present. In order to |
|
// replicate this non-specified behavior, we use a ported version of the |
|
// C code to generate the codes as a valid canonical tree by substituting |
|
// invalid nodes with invalid symbols. |
|
// |
|
// ==================================================== |
|
// This program, "bzip2", the associated library "libbzip2", and all |
|
// documentation, are copyright (C) 1996-2010 Julian R Seward. All |
|
// rights reserved. |
|
// |
|
// Redistribution and use in source and binary forms, with or without |
|
// modification, are permitted provided that the following conditions |
|
// are met: |
|
// |
|
// 1. Redistributions of source code must retain the above copyright |
|
// notice, this list of conditions and the following disclaimer. |
|
// |
|
// 2. The origin of this software must not be misrepresented; you must |
|
// not claim that you wrote the original software. If you use this |
|
// software in a product, an acknowledgment in the product |
|
// documentation would be appreciated but is not required. |
|
// |
|
// 3. Altered source versions must be plainly marked as such, and must |
|
// not be misrepresented as being the original software. |
|
// |
|
// 4. The name of the author may not be used to endorse or promote |
|
// products derived from this software without specific prior written |
|
// permission. |
|
// |
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS |
|
// OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
// |
|
// Julian Seward, jseward@bzip.org |
|
// bzip2/libbzip2 version 1.0.6 of 6 September 2010 |
|
// ==================================================== |
|
var ( |
|
limits [maxPrefixBits + 2]int32 |
|
bases [maxPrefixBits + 2]int32 |
|
perms [maxNumSyms]int32 |
|
|
|
minLen = uint32(maxPrefixBits) |
|
maxLen = uint32(0) |
|
) |
|
|
|
const ( |
|
statusOkay = iota |
|
statusInvalid |
|
statusNeedBits |
|
statusMaxBits |
|
) |
|
|
|
// createTables is the BZ2_hbCreateDecodeTables function from the C code. |
|
createTables := func(codes []prefix.PrefixCode) { |
|
for _, c := range codes { |
|
if c.Len > maxLen { |
|
maxLen = c.Len |
|
} |
|
if c.Len < minLen { |
|
minLen = c.Len |
|
} |
|
} |
|
|
|
var pp int |
|
for i := minLen; i <= maxLen; i++ { |
|
for j, c := range codes { |
|
if c.Len == i { |
|
perms[pp] = int32(j) |
|
pp++ |
|
} |
|
} |
|
} |
|
|
|
var vec int32 |
|
for _, c := range codes { |
|
bases[c.Len+1]++ |
|
} |
|
for i := 1; i < len(bases); i++ { |
|
bases[i] += bases[i-1] |
|
} |
|
for i := minLen; i <= maxLen; i++ { |
|
vec += bases[i+1] - bases[i] |
|
limits[i] = vec - 1 |
|
vec <<= 1 |
|
} |
|
for i := minLen + 1; i <= maxLen; i++ { |
|
bases[i] = ((limits[i-1] + 1) << 1) - bases[i] |
|
} |
|
} |
|
|
|
// getSymbol is the GET_MTF_VAL macro from the C code. |
|
getSymbol := func(c prefix.PrefixCode) (uint32, int) { |
|
v := internal.ReverseUint32(c.Val) |
|
n := c.Len |
|
|
|
zn := minLen |
|
if zn > n { |
|
return 0, statusNeedBits |
|
} |
|
zvec := int32(v >> (32 - zn)) |
|
v <<= zn |
|
for { |
|
if zn > maxLen { |
|
return 0, statusMaxBits |
|
} |
|
if zvec <= limits[zn] { |
|
break |
|
} |
|
zn++ |
|
if zn > n { |
|
return 0, statusNeedBits |
|
} |
|
zvec = (zvec << 1) | int32(v>>31) |
|
v <<= 1 |
|
} |
|
if zvec-bases[zn] < 0 || zvec-bases[zn] >= maxNumSyms { |
|
return 0, statusInvalid |
|
} |
|
return uint32(perms[zvec-bases[zn]]), statusOkay |
|
} |
|
|
|
// Step 1: Create the prefix trees using the C algorithm. |
|
createTables(codes) |
|
|
|
// Step 2: Starting with the shortest bit pattern, explore the whole tree. |
|
// If tree is under-subscribed, the worst-case runtime is O(1<<maxLen). |
|
// If tree is over-subscribed, the worst-case runtime is O(maxNumSyms). |
|
var pcodesArr [2 * maxNumSyms]prefix.PrefixCode |
|
pcodes := pcodesArr[:maxNumSyms] |
|
var exploreCode func(prefix.PrefixCode) bool |
|
exploreCode = func(c prefix.PrefixCode) (term bool) { |
|
sym, status := getSymbol(c) |
|
switch status { |
|
case statusOkay: |
|
// This code is valid, so insert it. |
|
c.Sym = sym |
|
pcodes[sym] = c |
|
term = true |
|
case statusInvalid: |
|
// This code is invalid, so insert an invalid symbol. |
|
c.Sym = uint32(len(pcodes)) |
|
pcodes = append(pcodes, c) |
|
term = true |
|
case statusNeedBits: |
|
// This code is too short, so explore both children. |
|
c.Len++ |
|
c0, c1 := c, c |
|
c1.Val |= 1 << (c.Len - 1) |
|
|
|
b0 := exploreCode(c0) |
|
b1 := exploreCode(c1) |
|
switch { |
|
case !b0 && b1: |
|
c0.Sym = uint32(len(pcodes)) |
|
pcodes = append(pcodes, c0) |
|
case !b1 && b0: |
|
c1.Sym = uint32(len(pcodes)) |
|
pcodes = append(pcodes, c1) |
|
} |
|
term = b0 || b1 |
|
case statusMaxBits: |
|
// This code is too long, so report it upstream. |
|
term = false |
|
} |
|
return term // Did this code terminate? |
|
} |
|
exploreCode(prefix.PrefixCode{}) |
|
|
|
// Step 3: Copy new sparse codes to old output codes. |
|
codes = codes[:0] |
|
for _, c := range pcodes { |
|
if c.Len > 0 { |
|
codes = append(codes, c) |
|
} |
|
} |
|
return codes |
|
}
|
|
|