Mercurial > hg > dml-open-cliopatria
diff cpack/dml/lib/compression.pl @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpack/dml/lib/compression.pl Tue Feb 09 21:05:06 2016 +0100 @@ -0,0 +1,464 @@ +/* Part of DML (Digital Music Laboratory) + Copyright 2014-2015 Samer Abdallah, University of London + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +:- module(compression, + [ with_transducer/3 + , stream_length/1 + , print_delimited_dcg/1 + , complexity_method/1 + , complexity/3 + , conditional_complexity/4 + , min_complexity/3 + , min_conditional_complexity/4 + , ncd/4 + , conditional_ncd/5 + , write_bytes/1 + , length_prob/3 + , with_stream_encoding/3 + , with_zlib_stream/1 + , zsync//1 + ]). + +/** <module> Tools for applying compression programs to abitrary streams. + + This module provides a framework for applying a range of compression and + delta compression programs to arbitrary streams generated by Prolog code. + In particular, the predicates complexity/3 and conditional_complexity/4 + use compression programs to estimate Kolmogorov complexity and conditional + complexity. + + Compression programs are invoked by a collection of shell scripts included + with this software, in the scripts/compression subdirectory of the dml cpack. + The method to be used is determined by a term of type compression_method: + == + compression_method ---> id % no compression + ; compress(c_method) + ; diff(d_method,pred). + c_method ---> lzma ; gz ; bz. + + d_method ---> bsdiff + ; xdelta % using xdelta3 program + ; vcdiff % using open-vcdiff + ; vczip % using vczip tools from AT&T + ; xdiff % using binary to text encoding and diff + ; zzd(c_method, d_method) + ; zzcd(c_method, d_method) + ; dz(d_method, c_method). + == + + Currently we do not subtract an known overheads associated with redundant + information in compression program outputs, such as 'magic bytes' in a header + used to enable tools to recognise which program was used to compress a given file. +*/ + +:- use_module(library(settings)). +:- use_module(library(memo)). +:- use_module(library(process)). +:- use_module(library(fileutils)). +:- use_module(library(swipe)). +:- use_module(library(dcg_core)). +:- use_module(library(dcg_codes)). +:- use_module(library(zlib)). + +user:file_search_path(compression,dml(scripts/compression)). + +:- initialization time(memo_attach(memo(compression),[])). + +%% print_delimited_dcg(+P:phrase(list(code))) is det. +% +% Runs the DCG phrase to generate a list of character codes, and then +% print the textually encoded length of the resulting list, followed by +% a carriage return, followed by the characters themselves. Thus, the resulting +% sequence of bytes is self-delimiting. +print_delimited_dcg(Phrase) :- + phrase(Phrase,Codes), + length(Codes,Length), + format('~d\n~s',[Length,Codes]). + +:- meta_predicate + complexity(+,0,-), + min_complexity(-,0,-), + conditional_complexity(+,0,0,-), + min_conditional_complexity(-,0,0,-), + conditional_ncd(+,0,0,0,-), + ncd(+,0,0,-). + +:- meta_predicate with_zlib_stream(//). + +%% complexity(+Method:compression_method, +Writer:pred, -K:nonneg) is det. +%% complexity(-Method:compression_method, +Writer:pred, -K:nonneg) is nondet. +%% complexity(-Method:compression_method, -Writer:pred, -K:nonneg) is nondet. +% +% Estimates the Kolmogorov complexity in bits of the sequence of bytes written to the +% current output stream by the goal Writer. If Writer is unbound on entry, then +% browses the set of previously computed results, which are stored in a volatile memo. +% +% Method can be =|id|= for no compression, or =|compress(M)|= for compression using +% program M. If Method is a ground term of the form =|diff(M,RefWriter)|=, then +% complexity is estimated using a delta compression with respect to the bytes written +% by RefWriter. Methods are described in the module header. Conditional complexity +% can more conveniently be estimated using conditional_complexity/4. +complexity(Method,Mod:Writer,K) :- + ( nonvar(Writer), complexity_method(Method), ground(Method) + *->complexity1(Method,Mod:Writer,K) + ; browse(complexity1(Method,Mod:Writer,K)) + ). + +execable(A) :- var(A), !, fail. +execable((A,B)) :- !, execable(A), execable(B). +execable(_). + +:- volatile_memo min_complexity(-ground,+callable,-number). +min_complexity(Best,Writer,KMin) :- + aggregate(min(K,Meth),complexity(Meth,Writer,K),min(KMin,Best)). + +%% conditional_complexity(+M:cc_method, +RefWriter:pred, +Writer:pred, -K:nonneg) is det. +% +% Estimate conditional Kolmogorov complexity of byte sequence produced by Writer +% given bit sequence produced by RefWriter, using one of several methods: +% == +% cc_method ---> id % no compression, complexity is length of data from Writer +% ; compress(c_method) % ignore reference sequence and compress +% ; delta(d_method) % do delta compression using given method +% ; naive(c_method). % use naive estimate using concatentation method +% == +% The naive estimate of K(x|y) is C(yx) - C(y), where C(.) is compressed length and +% xy is simply x concatenated with y. It may give nonsensical zero results. +% It is not the length of any concrete message that can produce x given y. +conditional_complexity(Method,Ref,Writer,K) :- + ( Method=delta(Diff), M=diff(Diff,Ref) +% ; Method=compress(C), M=compress(C) +% ; Method=compress(zlib), M=zlib + ; Method=delta(zlib), M=zlib(Ref) + ; Method=delta(dlzma), M=dlzma(Ref) +% ; Method=id, M=id + ), + complexity(M,Writer,K). + +conditional_complexity(naive(C),Ref,Writer,K) :- + complexity(compress(C),Ref,KY), + complexity(compress(C),(Ref,Writer),KXY), + K is KXY - KY. + +ncd(naive(C),Writer1,Writer2,D) :- + complexity(compress(C),Writer1,K1), + complexity(compress(C),Writer2,K2), + complexity(compress(C),(Writer1,Writer2),K12), + D is (K12 - min(K1,K2))/max(K1,K2). + +ncd(CM-DM,Writer1,Writer2,D) :- + complexity(CM,Writer1,K1), + complexity(CM,Writer2,K2), + conditional_complexity(DM,Writer1,Writer2,K2g1), + conditional_complexity(DM,Writer2,Writer1,K1g2), + D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2). + +ncd(opt,Writer1,Writer2,D) :- + min_complexity(_,Writer1,K1), + min_complexity(_,Writer2,K2), + min_conditional_complexity(_,Writer1,Writer2,K2g1), + min_conditional_complexity(_,Writer2,Writer1,K1g2), + D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2). + +conditional_ncd(naive(Method),Ref,Writer1,Writer2,D) :- + conditional_complexity(Method,Ref,Writer1,K1), + conditional_complexity(Method,Ref,Writer2,K2), + conditional_complexity(Method,Ref,(Writer1,Writer2),K12), + D is (K12 - min(K1,K2))/max(K1,K2). + +conditional_ncd(DM,Ref,Writer1,Writer2,D) :- + conditional_complexity(DM,Ref,Writer1,K1), + conditional_complexity(DM,Ref,Writer2,K2), + conditional_complexity(DM,(Ref,Writer1),Writer2,K2g1), + conditional_complexity(DM,(Ref,Writer2),Writer1,K1g2), + D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2). + +:- volatile_memo min_conditional_complexity(-ground,+callable,+callable,-number). +min_conditional_complexity(Best,Ref,Writer,KMin) :- + aggregate(min(K,Meth),(conditional_complexity(Meth,Ref,Writer,K),Meth\=naive(_)), min(KMin,Best)). + +%% complexity_method(-M:compression_method) is nondet. +complexity_method(diff(Diff,_)) :- command(diff0(Diff,library('swipe.pl')),_,_). +complexity_method(compress(Comp)) :- command(compress(Comp),_,_). +complexity_method(id). +complexity_method(zlib). +complexity_method(zlibi). +complexity_method(zlib(_)). +complexity_method(dlzma(_)). + +:- persistent_memo complexity1(+ground,+callable,-number). +complexity1(id,Writer,K) :- with_transducer(cat,Writer,stream_length(K)). +complexity1(compress(Meth),Writer,K) :- with_transducer(compress(Meth), Writer, stream_length(K)). +complexity1(diff(Diff,RefWriter),Writer,K) :- + with_temp_files([F1], + ( with_output_to_file(F1,RefWriter), + with_transducer(diff0(Diff,F1), Writer, stream_length(K)))). +complexity1(zlib,Writer,K) :- + with_zlib_stream(call_zsync(Writer,K)). + +complexity1(zlibi,Writer,K) :- + with_zlib_stream((call_zsync(write_bytes([255]),_), call_zsync(Writer,K))). + +complexity1(zlib(Ref),Writer,K) :- + with_zlib_stream((call_zsync(Ref,_), call_zsync(Writer,K))). + +complexity1(dlzma(RefWriter),Writer,K) :- + with_temp_files([F1], + ( with_output_to_file(F1,RefWriter), + with_transducer(dlzma(F1,['-b0']), Writer, read_line_to_string(current_input,KString)))), + number_string(K,KString). + + +subs(_,_,T1,T2) :- var(T1), !, T1=T2. +subs(Old,New,T1,T2) :- T1==Old, !, T2=New. +subs(Old,New,T1,T2) :- + T1 =.. [F | Args1], + maplist(subs(Old,New),Args1,Args2), + T2 =.. [F | Args2]. + + +% this is hopeless... +length_prob(Meth,L,Prob) :- + length(X,L), + aggregate( sum(2**(-8*B)), + X^(maplist(between(0,255),X), + with_transducer(compress(Meth), write_bytes(X), stream_length(B)), + format(user_error, '\r~w : ~w', [X,B]) + ), + Prob). + +%% write_bytes(+Bytes:list(between(0,255))) is det. +% Writes a sequence of bytes to current_output. +write_bytes(Bytes) :- with_stream_encoding(current_output,octet,maplist(put_code,Bytes)). + + +% -------------------------------- Using ZLIB ------------------------------ + + +with_zlib_stream(Phrase) :- + setup_call_cleanup( + open_null_stream(Out), + setup_call_cleanup( + zopen(Out,ZOut,[level(9),close_parent(false)]), + with_output_to(ZOut,call_dcg(Phrase,Out-0,_)), + close(ZOut)), + close(Out)). + +zpos(Pos,Out-Pos,Out-Pos). +zsync(Bits,Out-Pos0,Out-Pos1) :- + flush_output, + byte_count(Out,Pos1), + ( Pos1=Pos0 -> Bits=0 % no data since last sync + ; Bits is 8*((Pos1-Pos0)-4)-(3+3) % subtract estimate of final type 0 block. + ). + +call_zsync(Goal,Delta) --> {call(Goal)}, zsync(Delta). + + +% ------ Method comparison ----- +evaluate_delta(KRel,Method) :- + call_with_mode(browse, setof(K-Meth,conditional_complexity(Meth,_:_,_:_,K),KMeths)), + memberchk(_-delta(_),KMeths), + exclude(eval_exclude,KMeths,KMeths1), + KMeths1 = [KBest-_|_], + member(K-Method,KMeths1), + KRel is K/KBest. + +:- public method_rankings/2. +method_rankings(Method,Scores) :- + setof( Method-KRels, + bagof(KRel, evaluate_delta(KRel,Method), KRels), + AllScores), + member(Method-Scores,AllScores). + +eval_exclude(_-Method) :- exclude_method(Method). +exclude_method(naive(_)). +exclude_method(delta(DMethod)) :- exclude_dmethod(DMethod). +exclude_dmethod(xdiff). +exclude_dmethod(dz(DM,_)) :- exclude_dmethod(DM). +exclude_dmethod(zzcd(_,DM)) :- exclude_dmethod(DM). +exclude_dmethod(zzd(_,DM)) :- exclude_dmethod(DM). + +delta_method(compress(C),compress(C)). +delta_method(delta(M),delta(M)). +delta_method(id,id). + + +% ----------------------- Compression framework -------------------------------------- + + +swipe:def(Pipe,Def) :- def(Pipe,Def). + +def( findcat(Dir), sh(0 >> $_, '~s ~s',[compression(findcat)+execute, file(Dir,[file_type(directory)])])). +def( humdump(Dir), findcat(Dir) >> sh($hum >> $hum,'rid -G')). +def( length, sh($_ >> $number, 'wc -c')). +def( prepend(File^T), sh($T >> $T,'cat ~s -',[File+read])). +def( prepend(_<Pipe), Pipe * cat). % sh($T >> $T,'cat ~s -',[T<Pipe])). +def( unprepend(File^T), sh($T >> $T, 'tail -c +$((1+~s))',[$(File^T :> length)])). +def( unprepend(T<Pipe), sh($T >> $T, 'tail -c +$((1+~s))',[$(Pipe >> length)])). +def( compress(Method), sh( $X >> $z(Method,X), Cmd)) :- compressor(Method,Cmd,_). +def( decompress(Method), sh( $z(Method,X) >> $X, Cmd)) :- compressor(Method,_,Cmd). + +def( diff0(Diff,Ref), diff(Diff,Ref)). +% def( diff0(dz(Diff,Comp),Ref), diff(Diff,Ref) >> compress(Comp)). +def( diff(Diff,Ref), diff(Diff, encode, Ref+read)). +% def( diff(zzd(Comp,Diff),Ref), compress(Comp) >> buf(2,diff(Diff, encode, T<(Ref^T :> compress(Comp))))). +def( diff(zzcd(Comp,Diff),Ref), prepend(Ref^T) >> compress(Comp) >> buf(2,diff(Diff, encode, _<(Ref^T :> compress(Comp))))). + +def( patch(Method,Ref), diff(Method, decode, Ref+read)). +def( patch(zzd(Comp,Diff),Ref), buf(2,diff(Diff, decode, _<(Ref^_ :> compress(Comp)))) >> decompress(Comp)). +def( patch(zzcd(Comp,Diff),Ref), buf(2,diff(Diff, decode, _<(Ref^T :> compress(Comp)))) >> decompress(Comp) >> unprepend(Ref^T)). + +def( diff(Method,encode,RefSource), sh( $X >> $dz(Method,X), [compression(Script)+execute, encode, RefSource | Args])) :- differ(Method,Script,Args,_). +def( diff(Method,decode,RefSource), sh( $dz(Method,X) >> $X, [compression(Script)+execute, decode, RefSource | Args])) :- differ(Method,Script,_,Args), Method\=zvcz. +def( buf(N,Pipe), sh(Type,'~s ~d ~s',[compression(bufs)+execute, \N, \Cmd])) :- command(Pipe,Type,Cmd). + +def( diff1(Diff,Ref), diff2(Diff, Ref)). +def( diff1(zzd(Comp,Diff),Ref), compress(Comp) >> diff2(Diff, Ref >> compress(Comp))). +def( diff1(zzcd(Comp,Diff),Ref), prepend(_<Ref) >> compress(Comp) >> diff2(Diff, Ref >> compress(Comp))). +def( patch1(Diff,Ref), patch2(Diff, Ref)). +def( patch1(zzd(Comp,Diff),Ref), patch2(Diff, Ref >> compress(Comp)) >> decompress(Comp)). +def( patch1(zzcd(Comp,Diff),Ref), patch2(Diff, Ref >> compress(Comp)) >> decompress(Comp) >> unprepend(_<Ref)). + +def( diff2(Method,Ref), sh( $X >> $dz(Method,X), [compression(encode)+execute, Script, X<Ref | Args])) :- differ(Method,Script,Args,_). +def( patch2(Method,Ref), sh( $dz(Method,X) >> $X, [compression(decode)+execute, Script, X<Ref | Args])) :- differ(Method,Script,_,Args), Method\=zvcz. + +def( dlzma(Ref,Opts), sh($X >> $dz(dlzma,X), [compression(dlzma)+execute | Args])) :- + append(Opts,[Ref],Args). + +% compressor(xz,xz,xzcat). +compressor(lzma,lzma,lzcat). +compressor(xzraw,'xz -q -F raw','xzcat -F raw'). +compressor(gz,gzip,zcat). +compressor(bz,bzip2,bzcat). + +contains(Diff,Diff). +contains(dz(D1,_),D2) :- contains(D1,D2). +contains(zzd(_,D1),D2) :- contains(D1,D2). +contains(zzcd(_,D1),D2) :- contains(D1,D2). + + +% :- setting(max_vczip_chain_length, nonneg, 2, 'Maximum length of vczip processing chain'). + +differ(Method,Exec,[],[]) :- differ(Method,Exec). +differ(Method-Opts, Exec, EncArgs, []) :- differ(Method,Exec), options_for(Method,Opts,EncArgs). +differ(vczip(vcdiff),zvcz,['-Vcdiff'],[]). +differ(vczip(delta),zvcz,['-mdelta'],[]). +differ(vczip(sieve),zvcz,['-msieve.delta'],[]). +% differ(vczip(Delta,Chain,Encode),zvcz,[\Codes],[]) :- +% setting(max_vczip_chain_length,N), +% between(1,N,L), +% length(Chain,L), +% vczip_chain(Delta,Chain,Encode,Codes,[]). + +% differ(bsdiff,zbs). +% differ(xdelta,zxd). +differ(vcdiff,zvcd). +% differ(vczip,zvcz). +% differ(xdiff,zdiff). + +options_for(Method,Opts,Args) :- + Opts=[_|_], + setof(opt(Opt,Gen,A1,A2),method_option(Method,Opt,Gen,A1,A2),Possible), + seqmap(maybe_option,Possible,Opts-Args,[]-[]). + +maybe_option(_) --> []. +maybe_option(opt(Opt,Gen,A1,A2),[Opt|Opts]-A1,Opts-A2) :- call(Gen). + +method_option(xdelta, secondary(A), member(A,[djw,fgk])) --> ['-S',A]. +method_option(vcdiff, target_matches, true) --> ['-target_matches']. + +% vczip_chain(Delta,Chain,Encode) --> +% "-m", +% vczip_delta(Delta), ",", +% seqmap_with_sep(",",vczip_transform,Chain), ",", +% vczip_encode(Encode). + +% vczip_delta(delta) --> "delta". +% vczip_delta(sieve) --> "sieve.delta". + +% vczip_transform(bwt) --> "bwt". +% vczip_transform(mtf) --> "mtf". +% vczip_transform(rle) --> "rle". +% vczip_transform(rle(N)) --> "rle.", {between(0,1,N)}, at(N). + +% vczip_encode(huffman) --> "huffman". +% vczip_encode(huffgroup) --> "huffgroup". +% vczip_encode(huffpart) --> "huffpart". + + +compressor_overhead(gz,18). % see http://www.onicos.com/staff/iz/formats/gzip.html, https://tools.ietf.org/html/rfc1952 +compressor_overhead(bz,4). % http://en.wikipedia.org/wiki/Bzip2 (need to know number of blocks for better estimate) +compressor_overhead(lzma,0). % http://svn.python.org/projects/external/xz-5.0.3/doc/lzma-file-format.txt + +% ------------------------- Pipe and file tools ---------------------------------- + +splice_in(Pipe) :- with_pipe_input(S,Pipe,copy_stream_data(current_input,S)). +splice_out(Pipe) :- + with_pipe_output(S,Pipe, + ( with_stream_encoding(S,octet, + with_stream_encoding(current_output,octet, + copy_stream_data(S,current_output))))). + +:- op(1050,xfy,&). + + +%% with_transducer(+P:pipe, +Writer:pred, +Reader:pred) is det. +% Runs a pipeline defined using the framework provided by library(swipe), +% while concurrently sending data written by Writer to the process +% on its standard input and making data from the standard output of the +% process available to Reader on current_input. +:- meta_predicate with_transducer(+,0,0). +with_transducer(Pipe,Writer,Reader) :- + with_pipe_io( To-From, Pipe, + ( call_cleanup(with_output_to(To,Writer), close(To)) + & call_cleanup(with_input_from(From,Reader), close(From)) + )). + + +A & B :- concurrent(2, [A,B], []). + +%% with_stream_encoding(+S:stream,+E:encoding,+G:pred) is det. +% Call goall G with encoding of stream S temporarily set to E. Encoding +% is restored afterwards. +:- meta_predicate with_stream_encoding(+,+,0). +with_stream_encoding(S,Enc,Goal) :- + stream_property(S,encoding(Old)), + setup_call_cleanup(set_stream(S,encoding(Enc)), Goal, set_stream(S,encoding(Old))). + +%% stream_length(-L:natural) is det. +% Reads all the data available on Prolog stream current_input and returns the +% number of bits consumed. +stream_length(S,Bits) :- with_input_from(S,stream_length(Bits)). +stream_length(Bits) :- + with_stream_encoding(current_input,octet,accum_length(0,Bytes)), + Bits is 8*Bytes. + +accum_length(L,L) :- at_end_of_stream, !. +accum_length(L1,L3) :- + read_pending_input(current_input, Codes, []), + length(Codes,L), + L2 is L1 + L, + accum_length(L2,L3). + +with_temp_files(Files,Goal) :- + must_be(list(var),Files), + setup_call_cleanup( + maplist(tmp_file_stream(binary),Files,Streams), + (maplist(close,Streams), call(Goal)), + maplist(delete_file,Files)).