2020-06-05 13:47:39 -07:00
package archiver
import (
"archive/zip"
"bytes"
"compress/flate"
"fmt"
"io"
2020-10-15 22:06:27 -07:00
"io/ioutil"
2020-06-05 13:47:39 -07:00
"log"
"os"
"path"
"path/filepath"
"strings"
2020-10-15 22:06:27 -07:00
"github.com/dsnet/compress/bzip2"
"github.com/klauspost/compress/zstd"
"github.com/ulikunitz/xz"
)
// ZipCompressionMethod identifies the compression algorithm recorded in the
// header of a ZIP entry.
type ZipCompressionMethod uint16

// Compression method identifiers as assigned by the ZIP specification,
// see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT.
//
// Note LZMA: Disabled - because 7z isn't able to unpack ZIP+LZMA ZIP+LZMA2
// archives made this way - and vice versa. The identifier is declared, but
// this file registers neither a compressor nor a decompressor for it.
const (
	// Store keeps the entry uncompressed.
	Store ZipCompressionMethod = 0
	// Deflate is the classic ZIP compression method.
	Deflate ZipCompressionMethod = 8
	// BZIP2 selects bzip2 compression.
	BZIP2 ZipCompressionMethod = 12
	// LZMA is declared for completeness only (see the note above).
	LZMA ZipCompressionMethod = 14
	// ZSTD selects Zstandard compression.
	ZSTD ZipCompressionMethod = 93
	// XZ selects xz compression.
	XZ ZipCompressionMethod = 95
)
// Zip provides facilities for operating ZIP archives.
// See https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT.
type Zip struct {
// The compression level to use, as described
// in the compress/flate package.
CompressionLevel int
// Whether to overwrite existing files; if false,
// an error is returned if the file exists.
OverwriteExisting bool
// Whether to make all the directories necessary
// to create a zip archive in the desired path.
MkdirAll bool
// If enabled, selective compression will only
// compress files which are not already in a
// compressed format; this is decided based
// simply on file extension.
SelectiveCompression bool
// A single top-level folder can be implicitly
// created by the Archive or Unarchive methods
// if the files to be added to the archive
// or the files to be extracted from the archive
// do not all have a common root. This roughly
// mimics the behavior of archival tools integrated
// into OS file browsers which create a subfolder
// to avoid unexpectedly littering the destination
// folder with potentially many files, causing a
// problematic cleanup/organization situation.
// This feature is available for both creation
// and extraction of archives, but may be slightly
// inefficient with lots and lots of files,
// especially on extraction.
ImplicitTopLevelFolder bool
2020-11-06 11:41:42 -07:00
// Strip number of leading paths. This feature is available
// only during unpacking of the entire archive.
StripComponents int
2020-06-05 13:47:39 -07:00
// If true, errors encountered during reading
// or writing a single file will be logged and
// the operation will continue on remaining files.
ContinueOnError bool
2020-10-15 22:06:27 -07:00
// Compression algorithm
FileMethod ZipCompressionMethod
zw * zip . Writer
zr * zip . Reader
ridx int
//decinitialized bool
2020-06-05 13:47:39 -07:00
}
// CheckExt returns an error unless filename ends with ".zip".
// The suffix comparison is case-sensitive.
func (*Zip) CheckExt(filename string) error {
	if strings.HasSuffix(filename, ".zip") {
		return nil
	}
	return fmt.Errorf("filename must have a .zip extension")
}
2020-10-15 22:06:27 -07:00
// registerDecompressor installs the ZSTD, BZIP2 and XZ decompressors on zr so
// that entries stored with those methods can be opened.
//
// The decompressors are registered on the given reader rather than globally:
// registering a global decompressor is not reentrant and may panic.
//
// The decompressor callback signature has no error result, so a codec that
// fails to initialise is reported through a reader whose Read returns that
// error. Handing back a nil io.ReadCloser instead would leave archive/zip
// with nothing to read from and crash on the first Read of the entry.
func registerDecompressor(zr *zip.Reader) {
	// failed builds a ReadCloser whose Read reports err.
	failed := func(err error) io.ReadCloser {
		pr, pw := io.Pipe()
		_ = pw.CloseWithError(err) // CloseWithError always returns nil
		return pr
	}

	zr.RegisterDecompressor(uint16(ZSTD), func(r io.Reader) io.ReadCloser {
		dec, err := zstd.NewReader(r)
		if err != nil {
			return failed(err)
		}
		return dec.IOReadCloser()
	})

	zr.RegisterDecompressor(uint16(BZIP2), func(r io.Reader) io.ReadCloser {
		dec, err := bzip2.NewReader(r, nil)
		if err != nil {
			return failed(err)
		}
		return dec
	})

	zr.RegisterDecompressor(uint16(XZ), func(r io.Reader) io.ReadCloser {
		dec, err := xz.NewReader(r)
		if err != nil {
			return failed(err)
		}
		// the xz reader has no Close method of its own
		return ioutil.NopCloser(dec)
	})
}
// CheckPath ensures the file extension matches the format.
func ( * Zip ) CheckPath ( to , filename string ) error {
to , _ = filepath . Abs ( to ) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input
dest := filepath . Join ( to , filename )
//prevent path traversal attacks
if ! strings . HasPrefix ( dest , to ) {
2020-11-06 11:41:42 -07:00
return & IllegalPathError { AbsolutePath : dest , Filename : filename }
2020-10-15 22:06:27 -07:00
}
return nil
}
2020-06-05 13:47:39 -07:00
// Archive creates a .zip file at destination containing
// the files listed in sources. The destination must end
// with ".zip". File paths can be those of regular files
// or directories. Regular files are stored at the 'root'
// of the archive, and directories are recursively added.
//
// Errors from finishing the archive (closing the zip writer, which writes
// the central directory, and closing the output file) are returned when no
// earlier error occurred, so an incomplete archive is never reported as a
// success.
func (z *Zip) Archive(sources []string, destination string) (err error) {
	err = z.CheckExt(destination)
	if err != nil {
		return fmt.Errorf("checking extension: %v", err)
	}
	if !z.OverwriteExisting && fileExists(destination) {
		return fmt.Errorf("file already exists: %s", destination)
	}

	// make the folder to contain the resulting archive
	// if it does not already exist
	destDir := filepath.Dir(destination)
	if z.MkdirAll && !fileExists(destDir) {
		if mkErr := mkdir(destDir, 0755); mkErr != nil {
			return fmt.Errorf("making folder for destination: %v", mkErr)
		}
	}

	out, err := os.Create(destination)
	if err != nil {
		return fmt.Errorf("creating %s: %v", destination, err)
	}
	// Deferred calls run in reverse order: the zip writer is finished first,
	// then the underlying file is closed.
	defer func() {
		if closeErr := out.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("closing %s: %v", destination, closeErr)
		}
	}()

	err = z.Create(out)
	if err != nil {
		return fmt.Errorf("creating zip: %v", err)
	}
	defer func() {
		if closeErr := z.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("closing zip: %v", closeErr)
		}
	}()

	var topLevelFolder string
	if z.ImplicitTopLevelFolder && multipleTopLevels(sources) {
		topLevelFolder = folderNameFromFileName(destination)
	}

	for _, source := range sources {
		if walkErr := z.writeWalk(source, topLevelFolder, destination); walkErr != nil {
			return fmt.Errorf("walking %s: %v", source, walkErr)
		}
	}

	return nil
}
// Unarchive unpacks the .zip file at source to destination.
// Destination will be treated as a folder name.
func ( z * Zip ) Unarchive ( source , destination string ) error {
if ! fileExists ( destination ) && z . MkdirAll {
err := mkdir ( destination , 0755 )
if err != nil {
return fmt . Errorf ( "preparing destination: %v" , err )
}
}
file , err := os . Open ( source )
if err != nil {
return fmt . Errorf ( "opening source file: %v" , err )
}
defer file . Close ( )
fileInfo , err := file . Stat ( )
if err != nil {
return fmt . Errorf ( "statting source file: %v" , err )
}
err = z . Open ( file , fileInfo . Size ( ) )
if err != nil {
return fmt . Errorf ( "opening zip archive for reading: %v" , err )
}
defer z . Close ( )
// if the files in the archive do not all share a common
// root, then make sure we extract to a single subfolder
// rather than potentially littering the destination...
if z . ImplicitTopLevelFolder {
files := make ( [ ] string , len ( z . zr . File ) )
for i := range z . zr . File {
files [ i ] = z . zr . File [ i ] . Name
}
if multipleTopLevels ( files ) {
destination = filepath . Join ( destination , folderNameFromFileName ( source ) )
}
}
for {
err := z . extractNext ( destination )
if err == io . EOF {
break
}
if err != nil {
2020-11-06 11:41:42 -07:00
if z . ContinueOnError || IsIllegalPathError ( err ) {
2020-06-05 13:47:39 -07:00
log . Printf ( "[ERROR] Reading file in zip archive: %v" , err )
continue
}
return fmt . Errorf ( "reading file in zip archive: %v" , err )
}
}
return nil
}
func ( z * Zip ) extractNext ( to string ) error {
f , err := z . Read ( )
if err != nil {
return err // don't wrap error; calling loop must break on io.EOF
}
defer f . Close ( )
2020-10-15 22:06:27 -07:00
2020-11-06 11:41:42 -07:00
header , ok := f . Header . ( zip . FileHeader )
if ! ok {
return fmt . Errorf ( "expected header to be zip.FileHeader but was %T" , f . Header )
}
errPath := z . CheckPath ( to , header . Name )
2020-10-15 22:06:27 -07:00
if errPath != nil {
return fmt . Errorf ( "checking path traversal attempt: %v" , errPath )
}
2020-06-05 13:47:39 -07:00
2020-11-06 11:41:42 -07:00
if z . StripComponents > 0 {
if strings . Count ( header . Name , "/" ) < z . StripComponents {
return nil // skip path with fewer components
}
for i := 0 ; i < z . StripComponents ; i ++ {
slash := strings . Index ( header . Name , "/" )
header . Name = header . Name [ slash + 1 : ]
}
2020-06-05 13:47:39 -07:00
}
2020-11-06 11:41:42 -07:00
return z . extractFile ( f , to , & header )
}
2020-06-05 13:47:39 -07:00
2020-11-06 11:41:42 -07:00
func ( z * Zip ) extractFile ( f File , to string , header * zip . FileHeader ) error {
2020-06-05 13:47:39 -07:00
to = filepath . Join ( to , header . Name )
// if a directory, no content; simply make the directory and return
if f . IsDir ( ) {
return mkdir ( to , f . Mode ( ) )
}
// do not overwrite existing files, if configured
if ! z . OverwriteExisting && fileExists ( to ) {
return fmt . Errorf ( "file already exists: %s" , to )
}
// extract symbolic links as symbolic links
if isSymlink ( header . FileInfo ( ) ) {
// symlink target is the contents of the file
buf := new ( bytes . Buffer )
_ , err := io . Copy ( buf , f )
if err != nil {
return fmt . Errorf ( "%s: reading symlink target: %v" , header . Name , err )
}
return writeNewSymbolicLink ( to , strings . TrimSpace ( buf . String ( ) ) )
}
return writeNewFile ( to , f , f . Mode ( ) )
}
// writeWalk walks the file tree rooted at source and writes every visited
// item to the open archive. topLevelFolder is passed on to makeNameInArchive
// when building entry names, and destination is the archive being written,
// which is kept out of its own contents.
//
// With ContinueOnError set, a failure on a single item is logged and the
// walk goes on; otherwise the first failure ends the walk.
func (z *Zip) writeWalk(source, topLevelFolder, destination string) error {
	rootInfo, err := os.Stat(source)
	if err != nil {
		return fmt.Errorf("%s: stat: %v", source, err)
	}
	absDest, err := filepath.Abs(destination)
	if err != nil {
		return fmt.Errorf("%s: getting absolute path of destination %s: %v", source, destination, err)
	}

	visit := func(fpath string, info os.FileInfo, walkErr error) error {
		// fail either swallows the error (after logging it) or hands it
		// back to filepath.Walk, depending on ContinueOnError
		fail := func(e error) error {
			if !z.ContinueOnError {
				return e
			}
			log.Printf("[ERROR] Walking %s: %v", fpath, e)
			return nil
		}

		if walkErr != nil {
			return fail(fmt.Errorf("traversing %s: %v", fpath, walkErr))
		}
		if info == nil {
			return fail(fmt.Errorf("%s: no file info", fpath))
		}

		// never copy the output file into itself; that would loop
		// forever and exhaust the disk
		absPath, err := filepath.Abs(fpath)
		if err != nil {
			return fail(fmt.Errorf("%s: getting absolute path: %v", fpath, err))
		}
		if within(absPath, absDest) {
			return nil
		}

		// name under which the item is stored in the archive
		nameInArchive, err := makeNameInArchive(rootInfo, source, topLevelFolder, fpath)
		if err != nil {
			return fail(err)
		}

		// only regular files are opened for reading; for anything else
		// the entry is written with a nil ReadCloser
		var contents io.ReadCloser
		if info.Mode().IsRegular() {
			opened, err := os.Open(fpath)
			if err != nil {
				return fail(fmt.Errorf("%s: opening: %v", fpath, err))
			}
			defer opened.Close()
			contents = opened
		}

		entry := File{
			FileInfo: FileInfo{
				FileInfo:   info,
				CustomName: nameInArchive,
			},
			ReadCloser: contents,
		}
		if err := z.Write(entry); err != nil {
			return fail(fmt.Errorf("%s: writing: %s", fpath, err))
		}
		return nil
	}

	return filepath.Walk(source, visit)
}
// Create opens z for writing a ZIP archive to out.
func ( z * Zip ) Create ( out io . Writer ) error {
if z . zw != nil {
return fmt . Errorf ( "zip archive is already created for writing" )
}
z . zw = zip . NewWriter ( out )
if z . CompressionLevel != flate . DefaultCompression {
z . zw . RegisterCompressor ( zip . Deflate , func ( out io . Writer ) ( io . WriteCloser , error ) {
return flate . NewWriter ( out , z . CompressionLevel )
} )
}
2020-10-15 22:06:27 -07:00
switch z . FileMethod {
case BZIP2 :
z . zw . RegisterCompressor ( uint16 ( BZIP2 ) , func ( out io . Writer ) ( io . WriteCloser , error ) {
return bzip2 . NewWriter ( out , & bzip2 . WriterConfig { Level : z . CompressionLevel } )
} )
case ZSTD :
z . zw . RegisterCompressor ( uint16 ( ZSTD ) , func ( out io . Writer ) ( io . WriteCloser , error ) {
return zstd . NewWriter ( out )
} )
case XZ :
z . zw . RegisterCompressor ( uint16 ( XZ ) , func ( out io . Writer ) ( io . WriteCloser , error ) {
return xz . NewWriter ( out )
} )
}
2020-06-05 13:47:39 -07:00
return nil
}
// Write writes f to z, which must have been opened for writing first.
func ( z * Zip ) Write ( f File ) error {
if z . zw == nil {
return fmt . Errorf ( "zip archive was not created for writing first" )
}
if f . FileInfo == nil {
return fmt . Errorf ( "no file info" )
}
if f . FileInfo . Name ( ) == "" {
return fmt . Errorf ( "missing file name" )
}
header , err := zip . FileInfoHeader ( f )
if err != nil {
return fmt . Errorf ( "%s: getting header: %v" , f . Name ( ) , err )
}
if f . IsDir ( ) {
header . Name += "/" // required - strangely no mention of this in zip spec? but is in godoc...
header . Method = zip . Store
} else {
ext := strings . ToLower ( path . Ext ( header . Name ) )
if _ , ok := compressedFormats [ ext ] ; ok && z . SelectiveCompression {
header . Method = zip . Store
} else {
2020-10-15 22:06:27 -07:00
header . Method = uint16 ( z . FileMethod )
2020-06-05 13:47:39 -07:00
}
}
writer , err := z . zw . CreateHeader ( header )
if err != nil {
return fmt . Errorf ( "%s: making header: %v" , f . Name ( ) , err )
}
return z . writeFile ( f , writer )
}
// writeFile writes the body of f to writer. Directories have no body, a
// symbolic link's body is its link target (with forward slashes), and any
// other file is copied from f's ReadCloser.
func (z *Zip) writeFile(f File, writer io.Writer) error {
	switch {
	case f.IsDir():
		// directories have no contents
		return nil

	case isSymlink(f):
		// file body for symlinks is the symlink target
		target, err := os.Readlink(f.Name())
		if err != nil {
			return fmt.Errorf("%s: readlink: %v", f.Name(), err)
		}
		if _, err = writer.Write([]byte(filepath.ToSlash(target))); err != nil {
			return fmt.Errorf("%s: writing symlink target: %v", f.Name(), err)
		}
		return nil

	case f.ReadCloser == nil:
		return fmt.Errorf("%s: no way to read file contents", f.Name())
	}

	if _, err := io.Copy(writer, f); err != nil {
		return fmt.Errorf("%s: copying contents: %v", f.Name(), err)
	}
	return nil
}
// Open opens z for reading an archive from in,
// which is expected to have the given size and
// which must be an io.ReaderAt.
func ( z * Zip ) Open ( in io . Reader , size int64 ) error {
inRdrAt , ok := in . ( io . ReaderAt )
if ! ok {
return fmt . Errorf ( "reader must be io.ReaderAt" )
}
if z . zr != nil {
return fmt . Errorf ( "zip archive is already open for reading" )
}
var err error
z . zr , err = zip . NewReader ( inRdrAt , size )
if err != nil {
return fmt . Errorf ( "creating reader: %v" , err )
}
2020-10-15 22:06:27 -07:00
registerDecompressor ( z . zr )
2020-06-05 13:47:39 -07:00
z . ridx = 0
return nil
}
// Read returns the next entry of the archive opened with Open. When every
// entry has been returned, the error is io.EOF. The returned File must be
// closed by the caller once it has been read.
//
// If the entry cannot be opened, an error is returned together with a File
// that carries the entry's info and header but no ReadCloser.
func (z *Zip) Read() (File, error) {
	if z.zr == nil {
		return File{}, fmt.Errorf("zip archive is not open")
	}
	if z.ridx >= len(z.zr.File) {
		return File{}, io.EOF
	}

	// Advance before opening: should this entry fail, the next call
	// still moves on to the following one.
	entry := z.zr.File[z.ridx]
	z.ridx++

	next := File{
		FileInfo: entry.FileInfo(),
		Header:   entry.FileHeader,
	}

	body, err := entry.Open()
	if err != nil {
		return next, fmt.Errorf("%s: open compressed file: %v", entry.Name, err)
	}
	next.ReadCloser = body

	return next, nil
}
// Close releases whatever Create or Open set up. The reader is simply
// dropped; a writer is closed, and the result of that close is returned.
func (z *Zip) Close() error {
	z.zr = nil

	if z.zw == nil {
		return nil
	}
	// detach the writer first so z is reusable even if closing fails
	w := z.zw
	z.zw = nil
	return w.Close()
}
// Walk calls walkFn for each visited item in archive.
func ( z * Zip ) Walk ( archive string , walkFn WalkFunc ) error {
zr , err := zip . OpenReader ( archive )
if err != nil {
return fmt . Errorf ( "opening zip reader: %v" , err )
}
defer zr . Close ( )
2020-10-15 22:06:27 -07:00
registerDecompressor ( & zr . Reader )
2020-06-05 13:47:39 -07:00
for _ , zf := range zr . File {
zfrc , err := zf . Open ( )
if err != nil {
2020-10-15 22:06:27 -07:00
if zfrc != nil {
zfrc . Close ( )
}
2020-06-05 13:47:39 -07:00
if z . ContinueOnError {
log . Printf ( "[ERROR] Opening %s: %v" , zf . Name , err )
continue
}
return fmt . Errorf ( "opening %s: %v" , zf . Name , err )
}
err = walkFn ( File {
FileInfo : zf . FileInfo ( ) ,
Header : zf . FileHeader ,
ReadCloser : zfrc ,
} )
zfrc . Close ( )
if err != nil {
if err == ErrStopWalk {
break
}
if z . ContinueOnError {
log . Printf ( "[ERROR] Walking %s: %v" , zf . Name , err )
continue
}
return fmt . Errorf ( "walking %s: %v" , zf . Name , err )
}
}
return nil
}
// Extract extracts a single file from the zip archive.
// If the target is a directory, the entire folder will
// be extracted into destination.
func ( z * Zip ) Extract ( source , target , destination string ) error {
// target refers to a path inside the archive, which should be clean also
target = path . Clean ( target )
// if the target ends up being a directory, then
// we will continue walking and extracting files
// until we are no longer within that directory
var targetDirPath string
return z . Walk ( source , func ( f File ) error {
zfh , ok := f . Header . ( zip . FileHeader )
if ! ok {
return fmt . Errorf ( "expected header to be zip.FileHeader but was %T" , f . Header )
}
// importantly, cleaning the path strips tailing slash,
// which must be appended to folders within the archive
name := path . Clean ( zfh . Name )
if f . IsDir ( ) && target == name {
targetDirPath = path . Dir ( name )
}
if within ( target , zfh . Name ) {
// either this is the exact file we want, or is
// in the directory we want to extract
// build the filename we will extract to
end , err := filepath . Rel ( targetDirPath , zfh . Name )
if err != nil {
return fmt . Errorf ( "relativizing paths: %v" , err )
}
joined := filepath . Join ( destination , end )
2020-11-06 11:41:42 -07:00
err = z . extractFile ( f , joined , & zfh )
2020-06-05 13:47:39 -07:00
if err != nil {
return fmt . Errorf ( "extracting file %s: %v" , zfh . Name , err )
}
// if our target was not a directory, stop walk
if targetDirPath == "" {
return ErrStopWalk
}
} else if targetDirPath != "" {
// finished walking the entire directory
return ErrStopWalk
}
return nil
} )
}
// Match returns true if the format of file matches this
// type's format. It should not affect reader position.
func ( * Zip ) Match ( file io . ReadSeeker ) ( bool , error ) {
currentPos , err := file . Seek ( 0 , io . SeekCurrent )
if err != nil {
return false , err
}
_ , err = file . Seek ( 0 , 0 )
if err != nil {
return false , err
}
2020-10-15 22:06:27 -07:00
defer func ( ) {
_ , _ = file . Seek ( currentPos , io . SeekStart )
} ( )
2020-06-05 13:47:39 -07:00
buf := make ( [ ] byte , 4 )
if n , err := file . Read ( buf ) ; err != nil || n < 4 {
return false , nil
}
return bytes . Equal ( buf , [ ] byte ( "PK\x03\x04" ) ) , nil
}
// String returns the name of the archive format, "zip".
func (z *Zip) String() string {
	const formatName = "zip"
	return formatName
}
// NewZip returns a new, default instance ready to be customized and used.
func NewZip ( ) * Zip {
return & Zip {
CompressionLevel : flate . DefaultCompression ,
MkdirAll : true ,
SelectiveCompression : true ,
2020-10-15 22:06:27 -07:00
FileMethod : Deflate ,
2020-06-05 13:47:39 -07:00
}
}
// Compile-time checks to ensure type implements desired interfaces.
var (
_ = Reader ( new ( Zip ) )
_ = Writer ( new ( Zip ) )
_ = Archiver ( new ( Zip ) )
_ = Unarchiver ( new ( Zip ) )
_ = Walker ( new ( Zip ) )
_ = Extractor ( new ( Zip ) )
_ = Matcher ( new ( Zip ) )
_ = ExtensionChecker ( new ( Zip ) )
2020-10-15 22:06:27 -07:00
_ = FilenameChecker ( new ( Zip ) )
2020-06-05 13:47:39 -07:00
)
// compressedFormats is a (non-exhaustive) set of lowercased file extensions
// belonging to formats that are usually compressed already. Compressing such
// files again is inefficient, so this set is used to skip them.
var compressedFormats = map[string]struct{}{
	// archives and compressed streams
	".7z":   {},
	".br":   {},
	".bz2":  {},
	".cab":  {},
	".gz":   {},
	".jar":  {},
	".lz":   {},
	".lz4":  {},
	".lzma": {},
	".rar":  {},
	".sz":   {},
	".tbz2": {},
	".tgz":  {},
	".tsz":  {},
	".txz":  {},
	".xz":   {},
	".zip":  {},
	".zipx": {},

	// images
	".gif":  {},
	".jpeg": {},
	".jpg":  {},
	".png":  {},

	// audio and video
	".avi":  {},
	".m4v":  {},
	".mov":  {},
	".mp3":  {},
	".mp4":  {},
	".mpeg": {},
	".mpg":  {},

	// office documents
	".docx": {},
	".pptx": {},
	".xlsx": {},
}
// DefaultZip is a package-level instance created by NewZip, so it carries
// NewZip's default settings and is conveniently ready to use.
// NOTE(review): a Zip keeps the state of the currently open archive in its
// unexported fields, so this shared instance is presumably not meant for
// concurrent operations - confirm before using it from multiple goroutines.
var DefaultZip = NewZip ( )