diff cpack/dml/lib/compression.pl @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpack/dml/lib/compression.pl	Tue Feb 09 21:05:06 2016 +0100
@@ -0,0 +1,464 @@
+/* Part of DML (Digital Music Laboratory)
+	Copyright 2014-2015 Samer Abdallah, University of London
+	 
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public
+	License along with this library; if not, write to the Free Software
+	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+:- module(compression, 
+	[	with_transducer/3
+   ,  stream_length/1
+   ,  print_delimited_dcg/1
+   ,  complexity_method/1
+   ,  complexity/3
+   ,  conditional_complexity/4
+   ,  min_complexity/3
+   ,  min_conditional_complexity/4
+   ,  ncd/4
+   ,  conditional_ncd/5
+   ,  write_bytes/1
+   ,  length_prob/3
+   ,  with_stream_encoding/3
+   ,  with_zlib_stream/1
+   ,  zsync//1
+	]).
+
+/** <module> Tools for applying compression programs to abitrary streams.
+ 
+   This module provides a framework for applying a range of compression and
+   delta compression programs to arbitrary streams generated by Prolog code.
+   In particular, the predicates complexity/3 and conditional_complexity/4
+   use compression programs to estimate Kolmogorov complexity and conditional
+   complexity.
+
+   Compression programs are invoked by a collection of shell scripts included
+   with this software, in the scripts/compression subdirectory of the dml cpack.
+   The method to be used is determined by a term of type compression_method:
+   ==
+   compression_method ---> id % no compression
+                         ; compress(c_method)
+                         ; diff(d_method,pred).
+   c_method ---> lzma ; gz ; bz.
+
+   d_method ---> bsdiff
+               ; xdelta % using xdelta3 program
+               ; vcdiff % using open-vcdiff
+               ; vczip  % using vczip tools from AT&T
+               ; xdiff  % using binary to text encoding and diff
+               ; zzd(c_method, d_method)
+               ; zzcd(c_method, d_method)
+               ; dz(d_method, c_method).
+   ==
+   
+   Currently we do not subtract an known overheads associated with redundant
+   information in compression program outputs, such as 'magic bytes' in a header
+   used to enable tools to recognise which program was used to compress a given file.
+*/
+
+:- use_module(library(settings)).
+:- use_module(library(memo)).
+:- use_module(library(process)).
+:- use_module(library(fileutils)).
+:- use_module(library(swipe)).
+:- use_module(library(dcg_core)).
+:- use_module(library(dcg_codes)).
+:- use_module(library(zlib)).
+
+user:file_search_path(compression,dml(scripts/compression)).
+
+:- initialization time(memo_attach(memo(compression),[])).
+
+%% print_delimited_dcg(+P:phrase(list(code))) is det.
+%
+%  Runs the DCG phrase to generate a list of character codes, and then
+%  print the textually encoded length of the resulting list, followed by
+%  a carriage return, followed by the characters themselves. Thus, the resulting
+%  sequence of bytes is self-delimiting.
+print_delimited_dcg(Phrase) :- 
+   phrase(Phrase,Codes),
+   length(Codes,Length),
+   format('~d\n~s',[Length,Codes]).
+
+:- meta_predicate 
+      complexity(+,0,-),
+      min_complexity(-,0,-),
+      conditional_complexity(+,0,0,-),
+      min_conditional_complexity(-,0,0,-),
+      conditional_ncd(+,0,0,0,-),
+      ncd(+,0,0,-).
+
+:- meta_predicate with_zlib_stream(//).
+
+%% complexity(+Method:compression_method, +Writer:pred, -K:nonneg) is det.
+%% complexity(-Method:compression_method, +Writer:pred, -K:nonneg) is nondet.
+%% complexity(-Method:compression_method, -Writer:pred, -K:nonneg) is nondet.
+%
+%  Estimates the Kolmogorov complexity in bits of the sequence of bytes written to the
+%  current output stream by the goal Writer. If Writer is unbound on entry, then
+%  browses the set of previously computed results, which are stored in a volatile memo.
+%
+%  Method can be =|id|= for no compression, or =|compress(M)|= for compression using
+%  program M. If Method is a ground term of the form =|diff(M,RefWriter)|=, then
+%  complexity is estimated using a delta compression with respect to the bytes written
+%  by RefWriter. Methods are described in the module header. Conditional complexity
+%  can more conveniently be estimated using conditional_complexity/4.
+complexity(Method,Mod:Writer,K) :- 
+   (  nonvar(Writer), complexity_method(Method), ground(Method)
+   *->complexity1(Method,Mod:Writer,K)
+   ;  browse(complexity1(Method,Mod:Writer,K))
+   ).
+
+execable(A) :- var(A), !, fail.
+execable((A,B)) :- !, execable(A), execable(B).
+execable(_).
+
+:- volatile_memo min_complexity(-ground,+callable,-number).
+min_complexity(Best,Writer,KMin) :- 
+   aggregate(min(K,Meth),complexity(Meth,Writer,K),min(KMin,Best)).
+
+%% conditional_complexity(+M:cc_method, +RefWriter:pred, +Writer:pred, -K:nonneg) is det.
+%
+%  Estimate conditional Kolmogorov complexity of byte sequence produced by Writer
+%  given bit sequence produced by RefWriter, using one of several methods:
+%  ==
+%  cc_method ---> id % no compression, complexity is length of data from Writer
+%               ; compress(c_method) % ignore reference sequence and compress
+%               ; delta(d_method)    % do delta compression using given method
+%               ; naive(c_method).   % use naive estimate using concatentation method
+%  ==
+%  The naive estimate of K(x|y) is C(yx) - C(y), where C(.) is compressed length and
+%  xy is simply x concatenated with y. It may give nonsensical zero results.
+%  It is not the length of any concrete message that can produce x given y.
+conditional_complexity(Method,Ref,Writer,K) :-
+   (  Method=delta(Diff), M=diff(Diff,Ref)
+%   ;  Method=compress(C), M=compress(C)
+%   ;  Method=compress(zlib), M=zlib
+   ;  Method=delta(zlib), M=zlib(Ref)
+   ;  Method=delta(dlzma), M=dlzma(Ref)
+%   ;  Method=id, M=id
+   ),
+   complexity(M,Writer,K).
+
+conditional_complexity(naive(C),Ref,Writer,K) :-
+   complexity(compress(C),Ref,KY),
+   complexity(compress(C),(Ref,Writer),KXY),
+   K is KXY - KY.
+
+ncd(naive(C),Writer1,Writer2,D) :-
+   complexity(compress(C),Writer1,K1),
+   complexity(compress(C),Writer2,K2),
+   complexity(compress(C),(Writer1,Writer2),K12),
+   D is (K12 - min(K1,K2))/max(K1,K2).
+
+ncd(CM-DM,Writer1,Writer2,D) :-
+   complexity(CM,Writer1,K1),
+   complexity(CM,Writer2,K2),
+   conditional_complexity(DM,Writer1,Writer2,K2g1),
+   conditional_complexity(DM,Writer2,Writer1,K1g2),
+   D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2).
+
+ncd(opt,Writer1,Writer2,D) :-
+   min_complexity(_,Writer1,K1),
+   min_complexity(_,Writer2,K2),
+   min_conditional_complexity(_,Writer1,Writer2,K2g1),
+   min_conditional_complexity(_,Writer2,Writer1,K1g2),
+   D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2).
+
+conditional_ncd(naive(Method),Ref,Writer1,Writer2,D) :-
+   conditional_complexity(Method,Ref,Writer1,K1),
+   conditional_complexity(Method,Ref,Writer2,K2),
+   conditional_complexity(Method,Ref,(Writer1,Writer2),K12),
+   D is (K12 - min(K1,K2))/max(K1,K2).
+
+conditional_ncd(DM,Ref,Writer1,Writer2,D) :-
+   conditional_complexity(DM,Ref,Writer1,K1),
+   conditional_complexity(DM,Ref,Writer2,K2),
+   conditional_complexity(DM,(Ref,Writer1),Writer2,K2g1),
+   conditional_complexity(DM,(Ref,Writer2),Writer1,K1g2),
+   D is max(min(K1,K1g2)/K1, min(K2,K2g1)/K2).
+
+:- volatile_memo min_conditional_complexity(-ground,+callable,+callable,-number).
+min_conditional_complexity(Best,Ref,Writer,KMin) :- 
+   aggregate(min(K,Meth),(conditional_complexity(Meth,Ref,Writer,K),Meth\=naive(_)), min(KMin,Best)).
+
+%% complexity_method(-M:compression_method) is nondet.
+complexity_method(diff(Diff,_)) :- command(diff0(Diff,library('swipe.pl')),_,_).
+complexity_method(compress(Comp)) :- command(compress(Comp),_,_).
+complexity_method(id).
+complexity_method(zlib).
+complexity_method(zlibi).
+complexity_method(zlib(_)).
+complexity_method(dlzma(_)).
+
+:- persistent_memo complexity1(+ground,+callable,-number).
+complexity1(id,Writer,K) :- with_transducer(cat,Writer,stream_length(K)).
+complexity1(compress(Meth),Writer,K) :- with_transducer(compress(Meth), Writer, stream_length(K)).
+complexity1(diff(Diff,RefWriter),Writer,K) :-
+   with_temp_files([F1],
+      (  with_output_to_file(F1,RefWriter), 
+         with_transducer(diff0(Diff,F1), Writer, stream_length(K)))).
+complexity1(zlib,Writer,K) :-
+   with_zlib_stream(call_zsync(Writer,K)).
+
+complexity1(zlibi,Writer,K) :-
+   with_zlib_stream((call_zsync(write_bytes([255]),_), call_zsync(Writer,K))).
+
+complexity1(zlib(Ref),Writer,K) :-
+   with_zlib_stream((call_zsync(Ref,_), call_zsync(Writer,K))).
+
+complexity1(dlzma(RefWriter),Writer,K) :-
+   with_temp_files([F1],
+      (  with_output_to_file(F1,RefWriter), 
+         with_transducer(dlzma(F1,['-b0']), Writer, read_line_to_string(current_input,KString)))),
+   number_string(K,KString).
+
+
+subs(_,_,T1,T2) :- var(T1), !, T1=T2.
+subs(Old,New,T1,T2) :- T1==Old, !, T2=New.
+subs(Old,New,T1,T2) :-
+   T1 =.. [F | Args1],
+   maplist(subs(Old,New),Args1,Args2),
+   T2 =.. [F | Args2].
+
+
+% this is hopeless...
+length_prob(Meth,L,Prob) :-
+   length(X,L),
+   aggregate( sum(2**(-8*B)), 
+              X^(maplist(between(0,255),X),
+                 with_transducer(compress(Meth), write_bytes(X), stream_length(B)),
+                 format(user_error, '\r~w : ~w', [X,B])
+              ),
+              Prob).
+
+%% write_bytes(+Bytes:list(between(0,255))) is det.
+%  Writes a sequence of bytes to current_output.
+write_bytes(Bytes) :- with_stream_encoding(current_output,octet,maplist(put_code,Bytes)).
+
+
+% --------------------------------  Using ZLIB ------------------------------
+
+
+with_zlib_stream(Phrase) :-
+   setup_call_cleanup( 
+      open_null_stream(Out),
+      setup_call_cleanup(
+         zopen(Out,ZOut,[level(9),close_parent(false)]),
+         with_output_to(ZOut,call_dcg(Phrase,Out-0,_)),
+         close(ZOut)),
+      close(Out)).
+
+zpos(Pos,Out-Pos,Out-Pos).
+zsync(Bits,Out-Pos0,Out-Pos1) :-
+   flush_output,
+   byte_count(Out,Pos1),
+   (  Pos1=Pos0 -> Bits=0 % no data since last sync
+   ;  Bits is 8*((Pos1-Pos0)-4)-(3+3) % subtract estimate of final type 0 block.
+   ).
+
+call_zsync(Goal,Delta) --> {call(Goal)}, zsync(Delta).
+                     
+
+% ------ Method comparison -----
+evaluate_delta(KRel,Method) :-
+   call_with_mode(browse, setof(K-Meth,conditional_complexity(Meth,_:_,_:_,K),KMeths)),
+   memberchk(_-delta(_),KMeths),
+   exclude(eval_exclude,KMeths,KMeths1),
+   KMeths1 = [KBest-_|_],
+   member(K-Method,KMeths1),
+   KRel is K/KBest.
+
+:- public method_rankings/2.
+method_rankings(Method,Scores) :-
+   setof( Method-KRels,
+          bagof(KRel, evaluate_delta(KRel,Method), KRels),
+          AllScores),
+   member(Method-Scores,AllScores).
+
+eval_exclude(_-Method) :- exclude_method(Method).
+exclude_method(naive(_)).
+exclude_method(delta(DMethod)) :- exclude_dmethod(DMethod).
+exclude_dmethod(xdiff).
+exclude_dmethod(dz(DM,_)) :- exclude_dmethod(DM).
+exclude_dmethod(zzcd(_,DM)) :- exclude_dmethod(DM).
+exclude_dmethod(zzd(_,DM)) :- exclude_dmethod(DM).
+
+delta_method(compress(C),compress(C)). 
+delta_method(delta(M),delta(M)).
+delta_method(id,id).
+
+
+% ----------------------- Compression framework --------------------------------------
+
+
+swipe:def(Pipe,Def) :- def(Pipe,Def).
+
+def( findcat(Dir), sh(0 >> $_, '~s ~s',[compression(findcat)+execute, file(Dir,[file_type(directory)])])).
+def( humdump(Dir), findcat(Dir) >> sh($hum >> $hum,'rid -G')).
+def( length, sh($_ >> $number, 'wc -c')). 
+def( prepend(File^T),   sh($T >> $T,'cat ~s -',[File+read])).
+def( prepend(_<Pipe),   Pipe * cat). % sh($T >> $T,'cat ~s -',[T<Pipe])).
+def( unprepend(File^T), sh($T >> $T, 'tail -c +$((1+~s))',[$(File^T :> length)])).
+def( unprepend(T<Pipe), sh($T >> $T, 'tail -c +$((1+~s))',[$(Pipe >> length)])).
+def( compress(Method),   sh( $X >> $z(Method,X), Cmd)) :- compressor(Method,Cmd,_).
+def( decompress(Method), sh( $z(Method,X) >> $X, Cmd)) :- compressor(Method,_,Cmd).
+
+def( diff0(Diff,Ref),            diff(Diff,Ref)).
+% def( diff0(dz(Diff,Comp),Ref),   diff(Diff,Ref) >> compress(Comp)).
+def( diff(Diff,Ref),             diff(Diff, encode, Ref+read)).
+% def( diff(zzd(Comp,Diff),Ref),   compress(Comp) >> buf(2,diff(Diff, encode, T<(Ref^T :> compress(Comp))))).
+def( diff(zzcd(Comp,Diff),Ref),  prepend(Ref^T) >> compress(Comp) >> buf(2,diff(Diff, encode, _<(Ref^T :> compress(Comp))))).
+
+def( patch(Method,Ref),          diff(Method, decode, Ref+read)).
+def( patch(zzd(Comp,Diff),Ref),  buf(2,diff(Diff, decode, _<(Ref^_ :> compress(Comp)))) >> decompress(Comp)).
+def( patch(zzcd(Comp,Diff),Ref), buf(2,diff(Diff, decode, _<(Ref^T :> compress(Comp)))) >> decompress(Comp) >> unprepend(Ref^T)).
+
+def( diff(Method,encode,RefSource), sh( $X >> $dz(Method,X), [compression(Script)+execute, encode, RefSource | Args])) :- differ(Method,Script,Args,_).
+def( diff(Method,decode,RefSource), sh( $dz(Method,X) >> $X, [compression(Script)+execute, decode, RefSource | Args])) :- differ(Method,Script,_,Args), Method\=zvcz.
+def( buf(N,Pipe), sh(Type,'~s ~d ~s',[compression(bufs)+execute, \N, \Cmd])) :- command(Pipe,Type,Cmd).
+
+def( diff1(Diff,Ref),             diff2(Diff, Ref)).
+def( diff1(zzd(Comp,Diff),Ref),   compress(Comp) >> diff2(Diff, Ref >> compress(Comp))).
+def( diff1(zzcd(Comp,Diff),Ref),  prepend(_<Ref) >> compress(Comp) >> diff2(Diff, Ref >> compress(Comp))).
+def( patch1(Diff,Ref),            patch2(Diff, Ref)).
+def( patch1(zzd(Comp,Diff),Ref),  patch2(Diff, Ref >> compress(Comp)) >> decompress(Comp)).
+def( patch1(zzcd(Comp,Diff),Ref), patch2(Diff, Ref >> compress(Comp)) >> decompress(Comp) >> unprepend(_<Ref)).
+
+def( diff2(Method,Ref),           sh( $X >> $dz(Method,X), [compression(encode)+execute, Script, X<Ref | Args])) :- differ(Method,Script,Args,_).
+def( patch2(Method,Ref),          sh( $dz(Method,X) >> $X, [compression(decode)+execute, Script, X<Ref | Args])) :- differ(Method,Script,_,Args), Method\=zvcz.
+
+def( dlzma(Ref,Opts), sh($X >> $dz(dlzma,X), [compression(dlzma)+execute | Args])) :-
+   append(Opts,[Ref],Args).
+
+% compressor(xz,xz,xzcat).
+compressor(lzma,lzma,lzcat).
+compressor(xzraw,'xz -q -F raw','xzcat -F raw').
+compressor(gz,gzip,zcat).
+compressor(bz,bzip2,bzcat).
+
+contains(Diff,Diff).
+contains(dz(D1,_),D2) :- contains(D1,D2).
+contains(zzd(_,D1),D2) :- contains(D1,D2).
+contains(zzcd(_,D1),D2) :- contains(D1,D2).
+
+
+% :- setting(max_vczip_chain_length, nonneg, 2, 'Maximum length of vczip processing chain').
+
+differ(Method,Exec,[],[]) :- differ(Method,Exec).
+differ(Method-Opts, Exec, EncArgs, []) :- differ(Method,Exec), options_for(Method,Opts,EncArgs).
+differ(vczip(vcdiff),zvcz,['-Vcdiff'],[]).
+differ(vczip(delta),zvcz,['-mdelta'],[]).
+differ(vczip(sieve),zvcz,['-msieve.delta'],[]).
+% differ(vczip(Delta,Chain,Encode),zvcz,[\Codes],[]) :- 
+%    setting(max_vczip_chain_length,N),
+%    between(1,N,L), 
+%    length(Chain,L),
+%    vczip_chain(Delta,Chain,Encode,Codes,[]). 
+
+% differ(bsdiff,zbs).
+% differ(xdelta,zxd).
+differ(vcdiff,zvcd).
+% differ(vczip,zvcz).
+% differ(xdiff,zdiff).
+
+options_for(Method,Opts,Args) :-
+   Opts=[_|_],
+   setof(opt(Opt,Gen,A1,A2),method_option(Method,Opt,Gen,A1,A2),Possible),
+   seqmap(maybe_option,Possible,Opts-Args,[]-[]).
+
+maybe_option(_) --> [].
+maybe_option(opt(Opt,Gen,A1,A2),[Opt|Opts]-A1,Opts-A2) :- call(Gen).
+
+method_option(xdelta,  secondary(A), member(A,[djw,fgk])) --> ['-S',A].
+method_option(vcdiff, target_matches, true) --> ['-target_matches'].
+
+% vczip_chain(Delta,Chain,Encode) --> 
+%    "-m", 
+%    vczip_delta(Delta), ",",
+%    seqmap_with_sep(",",vczip_transform,Chain), ",",
+%    vczip_encode(Encode).
+
+% vczip_delta(delta) --> "delta".
+% vczip_delta(sieve) --> "sieve.delta".
+
+% vczip_transform(bwt) --> "bwt".
+% vczip_transform(mtf) --> "mtf".
+% vczip_transform(rle) --> "rle".
+% vczip_transform(rle(N)) --> "rle.", {between(0,1,N)}, at(N).
+
+% vczip_encode(huffman) --> "huffman".
+% vczip_encode(huffgroup) --> "huffgroup".
+% vczip_encode(huffpart) --> "huffpart".
+
+
+compressor_overhead(gz,18). % see http://www.onicos.com/staff/iz/formats/gzip.html, https://tools.ietf.org/html/rfc1952
+compressor_overhead(bz,4). % http://en.wikipedia.org/wiki/Bzip2 (need to know number of blocks for better estimate)
+compressor_overhead(lzma,0). % http://svn.python.org/projects/external/xz-5.0.3/doc/lzma-file-format.txt
+
+% ------------------------- Pipe and file tools ----------------------------------
+
+splice_in(Pipe) :- with_pipe_input(S,Pipe,copy_stream_data(current_input,S)).
+splice_out(Pipe) :- 
+   with_pipe_output(S,Pipe,
+      (  with_stream_encoding(S,octet,
+            with_stream_encoding(current_output,octet,
+               copy_stream_data(S,current_output))))).
+
+:- op(1050,xfy,&).
+
+
+%% with_transducer(+P:pipe, +Writer:pred, +Reader:pred) is det.
+%  Runs a pipeline defined using the framework provided by library(swipe),
+%  while concurrently sending data written by Writer to the process
+%  on its standard input and making data from the standard output of the
+%  process available to Reader on current_input.
+:- meta_predicate with_transducer(+,0,0).
+with_transducer(Pipe,Writer,Reader) :-
+   with_pipe_io( To-From, Pipe,
+      (  call_cleanup(with_output_to(To,Writer), close(To))
+      &  call_cleanup(with_input_from(From,Reader), close(From))
+      )).
+
+
+A & B :- concurrent(2, [A,B], []).
+
+%% with_stream_encoding(+S:stream,+E:encoding,+G:pred) is det.
+%  Call goall G with encoding of stream S temporarily set to E. Encoding
+%  is restored afterwards.
+:- meta_predicate with_stream_encoding(+,+,0).
+with_stream_encoding(S,Enc,Goal) :-
+   stream_property(S,encoding(Old)),
+   setup_call_cleanup(set_stream(S,encoding(Enc)), Goal, set_stream(S,encoding(Old))).
+
+%% stream_length(-L:natural) is det.
+%  Reads all the data available on Prolog stream current_input and returns the
+%  number of bits consumed.
+stream_length(S,Bits) :- with_input_from(S,stream_length(Bits)).
+stream_length(Bits) :- 
+   with_stream_encoding(current_input,octet,accum_length(0,Bytes)),
+   Bits is 8*Bytes.
+
+accum_length(L,L) :- at_end_of_stream, !.
+accum_length(L1,L3) :-
+   read_pending_input(current_input, Codes, []),
+   length(Codes,L),
+   L2 is L1 + L,
+   accum_length(L2,L3).
+
+with_temp_files(Files,Goal) :-
+   must_be(list(var),Files),
+   setup_call_cleanup(
+      maplist(tmp_file_stream(binary),Files,Streams),
+      (maplist(close,Streams), call(Goal)),
+      maplist(delete_file,Files)).