\documentclass{beamer}

\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{multimedia}
\usepackage{tikz}
\usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{tools}
%\usepackage{amsfonts,amssymb}

\tikzset{every picture/.style=semithick}

%%% font options:
% atypewri, frankgth, gillsans, centuryg, futura, eurostil 
%\usepackage{fourier}    	% Maths in serif Utopia
\usepackage[sf]{frankgth}
%\usepackage[sf]{optima}

%%% Monospace font
%\usepackage[scaled=0.88]{ulgothic} % 0.88 % suits narrow faces
\renewcommand{\ttdefault}{plg}  % Adobe Letter Gothic - suits light medium width face
%\renewcommand{\ttdefault}{pcr}  % Courier - suits wide faces
% remember to match up size and weight of monospace font to main font

\newcommand{\mytt}[1]{{\texttt{\footnotesize\fontseries{bx}\selectfont #1}}}

\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}


%%% Black on white
\definecolor{base}{rgb}{0,0,0}
\definecolor{comp}{named}{green}
\definecolor{paper}{named}{white}

\logo{%
	\includegraphics[height=16pt]{qmul-black}\hspace*{45pt}%
	\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-black-white}}%
}

%%% Red on black
\comment{
\definecolor{base}{rgb}{1,0,0}
\definecolor{comp}{rgb}{0,0.8,0.2}
\definecolor{paper}{named}{black}

\logo{%
	\includegraphics[height=16pt]{qmul-red}\hspace*{45pt}%
	\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-red-black}}%
}
}

																								 
\useinnertheme{default}%circles
\useoutertheme{default}
\usefonttheme[onlymath]{serif}

\setbeamercolor{normal text}{bg=paper,fg=base!90!-paper}
\setbeamercolor{background}{bg=comp!50!paper,fg=comp}
%\setbeamercolor{structure}{fg=base!75!-paper}
\setbeamercolor{structure}{fg=red!50!base}
\setbeamercolor{palette primary}{bg=yellow!50!paper,fg=yellow}
\setbeamercolor{palette secondary}{bg=orange!50!paper,fg=orange}
\setbeamercolor{palette tertiary}{bg=blue!50!paper,fg=blue}
\setbeamercolor{palette quaternary}{bg=green!50!paper,fg=green}
\setbeamercolor{block body}{bg=base!20!paper}
\setbeamercolor{block title}{bg=base!60!paper,fg=paper}
\setbeamercolor{navigation symbols}{fg=base!90!paper}
\setbeamercolor{separation line}{bg=blue,fg=yellow}
\setbeamercolor{fine separation line}{bg=blue,fg=orange}

% Title page
%	\setbeamercolor{title}{bg=base!20!paper}
%	\setbeamercolor{subtitle}{bg=base!20!paper}
%	\setbeamercolor{title page}{bg=base!40!paper}

%	\setbeamercolor{headline}{bg=blue}
%	\setbeamercolor{footline}{bg=blue}
%	\setbeamercolor{frametitle}{bg=base!30!paper}
%	\setbeamercolor{framesubtitle}{bg=base!40!paper}

%	\setbeamercolor{section in toc}{bg=base!25!paper,fg=orange}
%	\setbeamercolor{section in toc shaded}{bg=base!25!paper,fg=orange!80!paper}
%	\setbeamercolor{subsection in toc}{bg=base!25!paper,fg=orange}
%	\setbeamercolor{subsection in toc shaded}{bg=yellow!25!paper,fg=orange!80!paper}
%  page number in head/foot
%  section in head/foot
%	section in head/foot shaded


\setbeamerfont{structure}{series=\bfseries}
\setbeamerfont{title}{series=\mdseries,size=\Large}
%\setbeamerfont{title}{series=\ltseries,size=\huge}
\setbeamerfont{date}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{institute}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{author}{size=\footnotesize,series=\bfseries}
\setbeamercolor{bibliography item}{parent={normal text}}
\setbeamercolor{bibliography entry author}{fg=base}
\setbeamercolor{bibliography entry location}{fg=base!70!paper}

%%% Templates

\setbeamertemplate{bibliography item}[text]
\setbeamertemplate{bibliography entry title}{ }
\setbeamertemplate{bibliography entry location}{ }
\setbeamertemplate{blocks}[rounded][shadow=false]
\setbeamertemplate{items}[circle]
%\setbeamertemplate{bibliography item}[triangle]
%	\setbeamertemplate{title page}[default][rounded=true,shadow=false]
%	\setbeamertemplate{frametitle}[default][rounded=true,shadow=false]
\setbeamertemplate{sidebar right}{}
\setbeamertemplate{footline}{
	\hspace*{0.2cm}
	\insertlogo
	\hfill
	\usebeamertemplate***{navigation symbols}%
	\hfill
	\makebox[6ex]{\hfill\insertframenumber/\inserttotalframenumber}%
	\hspace*{0.2cm}

	\vskip 4pt
}
			 
\setbeamertemplate{navigation symbols}
{%
  \hbox{%
    \hbox{\insertslidenavigationsymbol}
    \hbox{\insertframenavigationsymbol}
%    \hbox{\insertsubsectionnavigationsymbol}
    \hbox{\insertsectionnavigationsymbol}
    \hbox{\insertdocnavigationsymbol}
%    \hbox{\insertbackfindforwardnavigationsymbol}%
  }%
}


\AtBeginSection[]{
	\begin{iframe}[Outline]
		\tableofcontents[currentsection]
	\end{iframe}
}                                                                                                                    
%\linespread{1.1}

\setlength{\parskip}{0.5em}

\newenvironment{bframe}[1][untitled]{\begin{frame}[allowframebreaks]\frametitle{#1}}{\end{frame}}
\newenvironment{iframe}[1][untitled]{\begin{frame}\frametitle{#1}}{\end{frame}}
\newenvironment{isframe}[1][untitled]{\begin{frame}[fragile=singleslide,environment=isframe]\frametitle{#1}}{\end{frame}}

\renewenvironment{fig}[1]
	{%
		\begin{figure}
		\def\fglbl{f:#1}
		\let\ocap=\caption
		\renewcommand{\caption}[2][]{\ocap[##1]{\small ##2}}
		\centering\small
	}{%
		\label{\fglbl}
		\end{figure}
	}

\newcommand{\paragraph}[1]{\textbf{#1}\qquad}
\newcommand{\colfig}[2][1]{\includegraphics[width=#1\linewidth]{figs/#2}}%
\let\citep=\cite
%\newcommand{\dotmath}[2]{\psfrag{#1}[Bc][Bc]{\small $#2$}}

\title{Cognitive Music Modelling:\\An Information Dynamics Approach}
\author{Samer Abdallah, Henrik Ekeus, Peter Foster,\\Andrew Robertson and Mark Plumbley}
\institute{Centre for Digital Music\\Queen Mary, University of London}

\date{\today}

\def\X{\mathcal{X}}
\def\Y{\mathcal{Y}}
\def\Past{\mathrm{Past}}
\def\Future{\mathrm{Future}}
\def\Present{\mathrm{Present}}
\def\param{\theta}
\def\trans{a}
\def\init{\pi^{\trans}}
%\def\entrorate(#1){\mathcal{H}(#1)}
%\def\entrorate(#1){\dot{\mathcal{H}}(#1)}
\def\entrorate{h}
\def\emcmarg(#1){b_#1}
\def\mcmarg{\vec{b}}
\def\domS{\mathcal{S}}
\def\domA{\mathcal{A}}

\def\Lxz(#1,#2){\mathcal{L}(#1|#2)}
\def\LXz(#1){\overline{\mathcal{L}}(#1)}
\def\LxZ(#1){\underline{\mathcal{L}}(#1)}
\def\LXZ{\overline{\underline{\mathcal{L}}}}
\def\Ixz(#1,#2){\mathcal{I}(#1|#2)}
\def\IXz(#1){\overline{\mathcal{I}}(#1)}
\def\IxZ(#1){\underline{\mathcal{I}}(#1)}
\def\IXZ{\overline{\underline{\mathcal{I}}}}

\def\ev(#1=#2){#1\!\!=\!#2}
\def\sev(#1=#2){#1\!=#2}

\def\FE{\mathcal{F}}

\newcommand\past[1]{\overset{\rule{0pt}{0.2em}\smash{\leftarrow}}{#1}}
\newcommand\fut[1]{\overset{\rule{0pt}{0.1em}\smash{\rightarrow}}{#1}}

\def\cn(#1,#2) {\node[circle,draw,inner sep=0.2em] (#1#2) {${#1}_{#2}$};}
\def\dn(#1) {\node[circle,inner sep=0.2em] (#1) {$\cdots$};}
\def\rl(#1,#2) {\draw (#1) -- (#2);}

\definecolor{un0}{rgb}{0.5,0.0,0.0}
\definecolor{un1}{rgb}{0.6,0.15,0.15}
\definecolor{un2}{rgb}{0.7,0.3,0.3}
\definecolor{un3}{rgb}{0.8,0.45,0.45}
\definecolor{un4}{rgb}{0.9,0.6,0.6}
\definecolor{un5}{rgb}{1.0,0.75,0.75}

%\def\blob(#1){\node[circle,draw,fill=#1,inner sep=0.25em]{};}
\def\bl(#1){\draw[circle,fill=#1] (0,0) circle (0.4em);}
\def\noderow(#1,#2,#3,#4,#5,#6){%
	\tikz{\matrix[draw,rounded corners,inner sep=0.4em,column sep=2.1em,ampersand replacement=\&]{%
	\bl(#1)\&\bl(#2)\&\bl(#3)\&\bl(#4)\&\bl(#5)\&\bl(#6)\\};}}

\begin{document}
	\frame{\titlepage}
	\section[Outline]{}
	\frame{
		\frametitle{Outline}
		\tableofcontents
	}

	

\section{Expectation and surprise in music}
\label{s:Intro}

\begin{iframe}[`Unfoldingness']
	Music is experienced as a 
	\uncover<2->{phenomenon}
	\uncover<3->{that}
	\uncover<4->{`unfolds'} \uncover<5->{in}\\
	\only<6>{blancmange}%
	\only<7>{(just kidding)}%
	\uncover<8->{time,} 
	\uncover<9->{rather than being apprehended as a static object presented in its 
	entirety.} 

	\uncover<10->{[This is recognised in computational linguistics, where the phenomenon is known as \emph{incrementality}, \eg in incremental parsing.]}
	
	\uncover<11->{%
	Meyer \cite{Meyer67} argued that musical experience depends on
	how we change and revise our conceptions \emph{as events happen},
	on how expectation and prediction interact with occurrence, and that, to a large
	degree, the way to understand the effect of music is to focus on
	this `kinetics' of expectation and surprise.%
	}
\end{iframe}

\begin{iframe}[Expectation and surprise in music]

	Music creates
	\emph{expectations} of what is to come next, which may be fulfilled
	immediately, after some delay, or not at all.
	This was suggested by music theorists, \eg 
	L.~B. Meyer \cite{Meyer67} and Narmour \citep{Narmour77}, but was also
	noted much earlier by Hanslick \cite{Hanslick1854} in the
	1850s:
		\begin{quote}
			\small
			`The most important factor in the mental process which accompanies the
			act of listening to music, and which converts it to a source of pleasure, is
			\ldots
%			frequently overlooked. We here refer to 
			the intellectual satisfaction which the
			listener derives from continually following and anticipating the composer's
			intentions---now, to see his expectations fulfilled, and now, to find himself
			agreeably mistaken. It is a matter of course that this intellectual flux and
			reflux, this perpetual giving and receiving takes place unconsciously, and with 
			the rapidity of lightning-flashes.'
		\end{quote}
\end{iframe}

\begin{iframe}[Probabilistic reasoning]
	\uncover<1->{%
	Making predictions and assessing surprise is 
	essentially reasoning with degrees of belief and (arguably)
	the best way to do this is using Bayesian probability theory \cite{Cox1946,Jaynes27}.%

	[NB. this is \textbf{subjective} probability as advocated by \eg De Finetti and Jaynes.]
	}

%  Thus, we assume that musical schemata are encoded as probabilistic % \citep{Meyer56} models, and 
	\uncover<2->{%
   We suppose that familiarity with different styles of music takes the form
	of various probabilistic models, and that these models are adapted through listening.%
	}
%	various stylistic norms is encoded as
%	using models that encode the statistics of music in general, the particular styles
%	of music that seem best to fit the piece we happen to be listening to, and the emerging 
%	structures peculiar to the current piece.

	\uncover<3->{%
	There is experimental evidence that humans are able to internalise statistical
	knowledge about music \citep{SaffranJohnsonAslin1999,EerolaToiviainenKrumhansl2002}, and also
	that statistical models are effective for computational analysis of music, \eg \cite{ConklinWitten95,Pearce2005}.%
	}

	% analysis of music, \eg \cite{ConklinWitten95,PonsfordWigginsMellish1999,Pearce2005}.
%		\cite{Ferrand2002}. Dubnov and Assayag PSTs? 
\end{iframe}

\begin{iframe}[Music and information theory]
	\uncover<1->{
	With probabilistic models in hand we can apply quantitative information theory: we can compute entropies,
	relative entropies, mutual information, and all that.
	}

	\uncover<2->{
	Lots of interest in application of information theory to perception, music and aesthetics since the 50s,
	\eg Moles \cite{Moles66}, Meyer \cite{Meyer67}, Cohen \cite{Cohen1962}, Berlyne \cite{Berlyne71}.
	(See also Bense, Hiller)
	}

	\uncover<3->{
	The idea is that subjective qualities and 
	states like uncertainty, surprise, complexity, tension, and interestingness
	are determined by information-theoretic quantities.
	}

	\uncover<4->{
	Berlyne \cite{Berlyne71} called such quantities `collative variables', since they are 
	to do with patterns of occurrence rather than medium-specific details.
	\emph{Information aesthetics}.
	}
%	Listeners then experience greater or lesser levels of surprise
%	in response to departures from these norms. 
%	By careful manipulation
%	of the material, the composer can thus define, and induce within the
%	listener, a temporal programme of varying
%	levels of uncertainty, ambiguity and surprise. 
\end{iframe}

\begin{iframe}[Probabilistic model-based observer hypothesis]
	\begin{itemize}
		\item<1-> 
		As we listen, we maintain a probabilistic model that enables 
		us to make predictions.  As events unfold, we revise our probabilistic `belief state', 
		including predictions about the future.
		\item<2-> 
		Probability distributions and changes in distributions are characterised in terms 
		of information-theoretic measures such as entropy and relative entropy (KL divergence).
		\item<3->
		The dynamic evolution of these information measures captures significant structure,
		\eg events that are surprising, informative, explanatory \etc
	\end{itemize}
	
\end{iframe}

\begin{iframe}[Features of information dynamics]
	\uncover<1->{
	\textbf{Abstraction}: sensitive mainly to \emph{patterns} of occurrence, 
	rather than details of which specific things occur or the sensory medium. 
%	it operates at a level of abstraction removed from the details of the sensory experience and 
%	the medium through which it was received, suggesting that the same
%	approach could, in principle, be used to analyse and compare information 
%	flow in different temporal media regardless of whether they are auditory, visual or otherwise. 
	}

	\uncover<2->{
	\textbf{Generality}: applicable in principle to any probabilistic model, in particular,
	models with time-dependent latent variables such as HMMs.
	Many important musical concepts like key, harmony, and beat are essentially `hidden variables'.
	}

	\uncover<3->{
	\textbf{Richness}: when applied to models with latent variables, can result in many-layered 
	analysis, capturing information flow about harmony, tempo, \etc
	}

	\uncover<4->{
	\textbf{Subjectivity}: all probabilities are \emph{subjective} probabilities relative to the \emph{observer's} 
	model, which can depend on the observer's capabilities and prior experience.
	}
\end{iframe}

\section{Surprise, entropy and information in random sequences}
\label{s:InfoInRandomProcs}

\begin{iframe}[Information theory primer\nicedot Entropy]
		Let $X$ be a discrete-valued random variable (in the sense of \emph{subjective} probability).
		Entropy is a measure of \emph{uncertainty}. If the observer expects to see $x$ with probability $p(x)$, 
		then 
		\begin{align*}
			H(X) &= \sum_{x\in\X} - p(x) \log p(x) \\
			&= \expect{[-\log p(X)]}. 
		\end{align*}
		Considering $-\log p(x)$ as the `surprisingness' of $x$, the entropy is the `expected surprisingness':
		high for spread-out distributions and low for concentrated ones.
\end{iframe}
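% Editor's note: a minimal Python sketch (source comment only, not rendered) of the
% entropy formula above, assuming a distribution given as a dict of probabilities:
%
%   import math
%
%   def entropy(p):
%       """Shannon entropy in bits: the expected surprisingness -log2 p(x)."""
%       return sum(-px * math.log2(px) for px in p.values() if px > 0)
%
%   entropy({'a': 0.5, 'b': 0.5})      # 1.0 bit: spread-out distribution
%   entropy({'a': 0.99, 'b': 0.01})    # ~0.08 bits: concentrated distribution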

\begin{iframe}[Information theory primer\nicedot Relative entropy]
		Relative entropy or Kullback-Leibler (KL) divergence quantifies the difference between 
		probability distributions.
		If the observer receives data $\mathcal{D}$, the divergence between the (subjective) prior and 
		posterior distributions is the
		amount of information in $\mathcal{D}$ \emph{about} $X$ for this observer:
		\[
			I(\mathcal{D}\to X) = 
			D(p_{X|\mathcal{D}} || p_X) 
				= \sum_{x\in\X} p(x|\mathcal{D}) \log \frac{p(x|\mathcal{D})}{p(x)}. 
		\]
		If observing $\mathcal{D}$ causes a large change in belief about $X$, then $\mathcal{D}$
		contained a lot of information about $X$.

		Like Lindley's (1956) information (thanks Lars!).
\end{iframe}
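% Editor's note: a small Python sketch of the relative entropy above, with hypothetical
% prior and posterior distributions over the same alphabet, used purely for illustration:
%
%   import math
%
%   def kl(post, prior):
%       """D(post || prior) in bits: information gained about X from the data D."""
%       return sum(q * math.log2(q / prior[x]) for x, q in post.items() if q > 0)
%
%   prior     = {'c': 0.5, 'd': 0.25, 'e': 0.25}
%   posterior = {'c': 0.1, 'd': 0.1,  'e': 0.8}   # belief after observing D
%   kl(posterior, prior)                          # large change => D carried much information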

\begin{iframe}[Information theory primer\nicedot Mutual information]
		The mutual information (MI) between $X_1$ and $X_2$ is the expected amount of information about 
		$X_2$ in an observation of $X_1$. It can be written in several ways:
		\begin{align*}
			I(X_1;X_2) &= \sum_{x_1,x_2} p(x_1,x_2) \log \frac{p(x_1,x_2)}{p(x_1)p(x_2)} \\
					&= H(X_1) + H(X_2) - H(X_1,X_2) \\
					&= H(X_2) - H(X_2|X_1).
		\end{align*}
		(1) Expected information about $X_2$ in an observation of $X_1$;\\
		(2) Expected reduction in uncertainty about $X_2$ after observing $X_1$;\\
		(3) Symmetric: $I(X_1;X_2) = I(X_2;X_1)$.
\end{iframe}
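% Editor's note: a sketch of mutual information computed from a (hypothetical) joint
% distribution via the identity I(X1;X2) = H(X1) + H(X2) - H(X1,X2):
%
%   import math
%
%   def H(p):
%       return sum(-v * math.log2(v) for v in p.values() if v > 0)
%
%   joint = {('a', 'x'): 0.4, ('a', 'y'): 0.1,
%            ('b', 'x'): 0.1, ('b', 'y'): 0.4}
%   p1 = {'a': 0.5, 'b': 0.5}        # marginal of X1
%   p2 = {'x': 0.5, 'y': 0.5}        # marginal of X2
%   mi = H(p1) + H(p2) - H(joint)    # ~0.28 bits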

\begin{iframe}[Information theory primer\nicedot Conditional MI]
		The information in one variable about another, given observations of some third variable,
		is formulated analogously by adding conditioning variables to the entropies:
		\begin{align*}
			I(X_1;X_2|X_3) &= H(X_1|X_3) - H(X_1|X_2,X_3).
		\end{align*}
		Makes explicit the dependence of information assessment on background knowledge,
		represented by conditioning variables.
\end{iframe}


\begin{isframe}[Information theory primer\nicedot I-Diagrams]
		\newcommand\rad{2.2em}%
		\newcommand\circo{circle (3.4em)}%
		\newcommand\labrad{4.3em}
		\newcommand\bound{(-6em,-5em) rectangle (6em,6em)}
		\newcommand\clipin[1]{\clip (#1) \circo;}%
		\newcommand\clipout[1]{\clip \bound (#1) \circo;}%
		\newcommand\cliptwo[3]{%
			\begin{scope}
				\clipin{#1};
				\clipin{#2};
				\clipout{#3};
				\fill[black!30] \bound;
			\end{scope}
		}%
		\newcommand\clipone[3]{%
			\begin{scope}
				\clipin{#1};
				\clipout{#2};
				\clipout{#3};
				\fill[black!15] \bound;
			\end{scope}
		}%
		Information diagrams are a Venn diagram-like representation of entropies and mutual 
		informations for a set of random variables.
	\begin{center}
		\begin{tabular}{c@{\ }c}
			\scalebox{0.8}{%
			\begin{tikzpicture}[baseline=0pt]
				\coordinate (p1) at (90:\rad);
				\coordinate (p2) at (210:\rad);
				\coordinate (p3) at (-30:\rad);
				\clipone{p1}{p2}{p3};
				\clipone{p2}{p3}{p1};
				\clipone{p3}{p1}{p2};
				\cliptwo{p1}{p2}{p3};
				\cliptwo{p2}{p3}{p1};
				\cliptwo{p3}{p1}{p2};
            \begin{scope}
               \clip (p1) \circo;
               \clip (p2) \circo;
               \clip (p3) \circo;
               \fill[black!45] \bound;
            \end{scope}
				\draw (p1) \circo;
				\draw (p2) \circo;
				\draw (p3) \circo;
				\path 
					(barycentric cs:p3=1,p1=-0.2,p2=-0.1) +(0ex,0) node {$I_{3|12}$}
					(barycentric cs:p1=1,p2=-0.2,p3=-0.1) +(0ex,0) node {$I_{1|23}$}
					(barycentric cs:p2=1,p3=-0.2,p1=-0.1) +(0ex,0) node {$I_{2|13}$}
					(barycentric cs:p3=1,p2=1,p1=-0.55) +(0ex,0) node {$I_{23|1}$}
					(barycentric cs:p1=1,p3=1,p2=-0.55) +(0ex,0) node {$I_{13|2}$}
					(barycentric cs:p2=1,p1=1,p3=-0.55) +(0ex,0) node {$I_{12|3}$}
					(barycentric cs:p3=1,p2=1,p1=1) node {$I_{123}$}
					;
				\path
					(p1) +(140:\labrad) node {$X_1$}
					(p2) +(-140:\labrad) node {$X_2$}
					(p3) +(-40:\labrad) node {$X_3$};
			\end{tikzpicture}%
			}
			&
			\parbox{0.5\linewidth}{
				\small
				\begin{align*}
					I_{1|23} &= H(X_1|X_2,X_3) \\
					I_{13|2} &= I(X_1;X_3|X_2) \\
					I_{1|23} + I_{13|2} &= H(X_1|X_2) \\
					I_{12|3} + I_{123} &= I(X_1;X_2) 
				\end{align*}
			}
		\end{tabular}
	\end{center}
The areas of 
		the three circles represent $H(X_1)$, $H(X_2)$ and $H(X_3)$ respectively.
		The total shaded area is the joint entropy $H(X_1,X_2,X_3)$.
		Each undivided region is an \emph{atom} of the I-diagram.
\end{isframe}




\begin{isframe}[Information theory in sequences]
	\def\bx{1.6em}%
	\def\cn(#1,#2) {\node[circle,draw,fill=white,inner sep=0.2em] at(#1) {$#2$};}%
	\def\dn(#1){\node[circle,inner sep=0.2em] at(#1) {$\cdots$};}%
	\def\en(#1){coordinate(#1)}%
	\def\tb{++(3.8em,0)}%
	\def\lb(#1)#2{\path (#1)+(0,\bx) node[anchor=south] {#2};}
	\def\nr(#1,#2,#3){\draw[rounded corners,fill=#3] (#1) rectangle (#2);}%

		Consider an observer receiving elements of a random sequence
		$(\ldots, X_{-1}, X_0, X_1, X_2, \ldots)$, so that at any time $t$ there is 
		a `present' $X_t$, an observed past $\past{X}_t$, and an unobserved future
		$\fut{X}_t$. For example, at time $t=3$:

		\begin{figure}
				\begin{tikzpicture}%[baseline=-1em]
					\path (0,0) \en(X0) \tb \en(X1) \tb \en(X2) \tb \en(X3) \tb \en(X4) \tb \en(X5) \tb \en(X6);
					\path (X0)+(-\bx,-\bx) \en(p1) (X2)+(\bx,\bx) \en(p2)
					      (X3)+(-\bx,-\bx) \en(p3) (X3)+(\bx,\bx) \en(p4)
					      (X4)+(-\bx,-\bx) \en(p5) (X6)+(\bx,\bx) \en(p6);
					\nr(p1,p2,un3) \nr(p3,p4,un4) \nr(p5,p6,un5)
					\dn(X0) \cn(X1,X_1) \cn(X2,X_2) \cn(X3,X_3) \cn(X4,X_4) \cn(X5,X_5) \dn(X6)
					\lb(X1){Past: $\past{X}_3$}
					\lb(X5){Future $\fut{X}_3$}
					\lb(X3){Present}
				\end{tikzpicture}%}%
		\end{figure}
	Consider how the observer's belief state evolves when, having observed up to
	$X_2$, it learns the value of $X_3$.
\end{isframe}

\begin{iframe}[`Surprise' based quantities]
	To obtain the first set of measures, we ignore the future $\fut{X}_t$
	and consider the probability distribution for $X_t$ given the
	observed past $\past{X}_t=\past{x}_t$.
	
	\begin{enumerate}
		\item<1->
		\textbf{Surprisingness}: negative log-probability
		$\ell_t = -\log p(x_t|\past{x}_t)$.

		\item<2->
		The expected surprisingness given the context $\past{X}_t=\past{x}_t$ is the entropy of the predictive distribution,
		$H(X_t|\ev(\past{X}_t=\past{x}_t))$: the uncertainty about $X_t$ before the observation is made.
		
		\item<3->
		The expectation over all possible realisations of the process is the conditional entropy 
		$H(X_t|\past{X}_t)$ according to the observer's model. For a stationary process, this is the
		\emph{entropy rate} $h_\mu$.
	\end{enumerate}
\end{iframe}
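% Editor's note: a Python sketch of the surprise-based quantities above, assuming a
% first-order Markov predictive model so that the relevant past is just the previous
% symbol; the transition probabilities here are hypothetical:
%
%   import math
%
%   a = {('A', 'A'): 0.9, ('B', 'A'): 0.1,    # p(next | prev), 2-state example
%        ('A', 'B'): 0.5, ('B', 'B'): 0.5}
%
%   def surprisingness(x, prev):              # l_t = -log p(x_t | past)
%       return -math.log2(a[(x, prev)])
%
%   def predictive_entropy(prev):             # expected surprisingness given the past
%       return sum(-p * math.log2(p) for (i, j), p in a.items() if j == prev and p > 0)
%
%   surprisingness('B', 'A')    # ~3.32 bits: a rare continuation is surprising
%   predictive_entropy('A')     # ~0.47 bits: uncertainty about X_t before observing it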

\begin{iframe}[Predictive information]
	The second set of measures is based on the amount of information the observation $\ev(X_t=x_t)$
	carries \emph{about} the unobserved future $\fut{X}_t$, \emph{given} that we already 
	know the past $\ev(\past{X}_t=\past{x}_t)$:
	\begin{equation*}
		\mathcal{I}_t = I(\ev(X_t=x_t)\to\fut{X}_t|\ev(\past{X}_t=\past{x}_t)).
	\end{equation*}
	This is the KL divergence between beliefs about the future $\fut{X}_t$ prior and posterior
	to the observation $\ev(X_t=x_t)$.
	Hence, for continuous-valued variables, it is invariant to invertible
	transformations of the observation space. 
\end{iframe}

\begin{iframe}[Predictive information based quantities]
	\begin{enumerate}
	\item<1->
		\emph{Instantaneous predictive information} (IPI) is just $\mathcal{I}_t$.

%	Expectations over $X|\ev(Z=z)$, $Z|\ev(X=x)$, and $(X,Z)$ give 3 more information measures:
	\item<2->
		The expectation of $\mathcal{I}_t$ before the observation at time $t$ is 
		$I(X_t;\fut{X}_t | \ev(\past{X}_t=\past{x}_t))$: a mutual information conditioned on the
		observed past. It is the amount of new information about the future expected from the next observation.
		Useful for directing attention towards the next event even before it happens?

%	This is different from Itti and Baldi's proposal that Bayesian
%	\emph{surprise} attracts attention \cite{IttiBaldi2005}, as it is a mechanism which can 
%	operate \emph{before} the surprise occurs.


	\item<3->
	The expectation over all possible realisations is the conditional mutual information
	$I(X_t;\fut{X}_t|\past{X}_t)$. For a stationary process, this is the global
	\emph{predictive information rate} (PIR), the average rate at which new information arrives about
	the future. In terms of conditional entropies, it has two forms:
	$H(\fut{X}_t|\past{X}_t) - H(\fut{X}_t|X_t,\past{X}_t)$ or 
	$H(X_t|\past{X}_t) - H(X_t|\fut{X}_t,\past{X}_t)$. 
	\end{enumerate}

\end{iframe}

\begin{iframe}[Global measures for stationary processes]
	For a stationary random process model, the average levels of surprise and information
	are captured by the time-shift-invariant process information measures:
	\begin{align*}
		\text{entropy rate} &:  & h_\mu  &= H(X_t | \past{X}_t) \\
		\text{multi-information rate}  &: & \rho_\mu  &= I(\past{X}_t;X_t)  = H(X_t) - h_\mu \\
		\text{residual entropy  rate}  &: & r_\mu &= H(X_t | \past{X}_t, \fut{X}_t) \\
		\text{predictive information  rate} &:  & b_\mu  &= I(X_t;\fut{X}_t|\past{X}_t)  = h_\mu - r_\mu
	\end{align*}
	Residual entropy is also known as \emph{erasure entropy} \cite{VerduWeissman2006}.
\end{iframe}

\begin{isframe}[Process I-diagrams]
%		\newcommand\subfig[2]{\shortstack{#2\\[0.75em]#1}}
		\newcommand\subfig[2]{#2}
		\newcommand\rad{1.75em}%
		\newcommand\ovoid[1]{%
			++(-#1,\rad) 
			-- ++(2 * #1,0em) arc (90:-90:\rad)
 			-- ++(-2 * #1,0em) arc (270:90:\rad) 
		}%
		\newcommand\axis{2.75em}%
		\newcommand\olap{0.85em}%
		\newcommand\offs{3.6em}
		\newcommand\longblob{\ovoid{\axis}}
		\newcommand\shortblob{\ovoid{1.75em}}
		\begin{figure}
				\begin{tikzpicture}%[baseline=-1em]
					\newcommand\rc{\rad}
					\newcommand\throw{2.5em}
					\coordinate (p1) at (180:1.5em);
					\coordinate (p2) at (0:0.3em);
					\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
					\newcommand\present{(p2) circle (\rc)}
					\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
					\newcommand\fillclipped[2]{%
						\begin{scope}[even odd rule]
							\foreach \thing in {#2} {\clip \thing;}
							\fill[black!#1] \bound;
						\end{scope}%
					}%
					\fillclipped{30}{\present,\bound \thepast}
					\fillclipped{15}{\present,\bound \thepast}
					\fillclipped{45}{\present,\thepast}
					\draw \thepast;
					\draw \present;
					\node at (barycentric cs:p2=1,p1=-0.3) {$h_\mu$};
					\node at (barycentric cs:p2=1,p1=1) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
					\path (p2) +(90:3em) node {$X_0$};
					\path (p1) +(-3em,0em) node  {\shortstack{infinite\\past}};
					\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
				\end{tikzpicture}%
			\\[0.25em]
				\begin{tikzpicture}%[baseline=-1em]
					\newcommand\rc{2.2em}
					\newcommand\throw{2.5em}
					\coordinate (p1) at (210:1.5em);
					\coordinate (p2) at (90:0.8em);
					\coordinate (p3) at (-30:1.5em);
					\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
					\newcommand\present{(p2) circle (\rc)}
					\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
					\newcommand\future{(p3) ++(\throw,0) \ovoid{\throw}}
					\newcommand\fillclipped[2]{%
						\begin{scope}[even odd rule]
							\foreach \thing in {#2} {\clip \thing;}
							\fill[black!#1] \bound;
						\end{scope}%
					}%
%					\fillclipped{80}{\future,\thepast}
					\fillclipped{30}{\present,\future,\bound \thepast}
					\fillclipped{15}{\present,\bound \future,\bound \thepast}
					\draw \future;
					\fillclipped{45}{\present,\thepast}
					\draw \thepast;
					\draw \present;
					\node at (barycentric cs:p2=0.9,p1=-0.17,p3=-0.17) {$r_\mu$};
					\node at (barycentric cs:p1=-0.5,p2=1.0,p3=1) {$b_\mu$};
					\node at (barycentric cs:p3=0,p2=1,p1=1.2) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
					\path (p2) +(140:3.2em) node {$X_0$};
	%            \node at (barycentric cs:p3=0,p2=1,p1=1) {$\rho_\mu$};
					\path (p3) +(3em,0em) node  {\shortstack{infinite\\future}};
					\path (p1) +(-3em,0em) node  {\shortstack{infinite\\past}};
					\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
					\path (p3) +(4em,\rad) node [anchor=south] {$X_1,\ldots$};
				\end{tikzpicture}%
%				\\[0.25em]
%		The small dark
%		region  below $X_0$ is $\sigma_\mu$ and the excess entropy 
%		is $E = \rho_\mu + \sigma_\mu$.
		\end{figure}
		Marginal entropy of `present' $X_0$ is $H(X_0)=\rho_\mu+r_\mu+b_\mu$.\\
		Entropy rate is $h_\mu = r_\mu+b_\mu$.
\end{isframe}

\section{Markov chains}
\label{s:InfoInMC}


\begin{iframe}[Markov chains\nicedot Definitions]

%	Now we'll look at information dynamics in one of the simplest possible models, a Markov chain.
%	To illustrate the how the measures defined in \secrf{InfoInRandomProcs} can be computed
%	in practice, we will consider one of the simplest random processes, a 
%	first order Markov chain. 
%	In this case, the dynamic information measures can be computed in closed-form.
%

	Let $X$ be a Markov chain with state space 
	$\{1, \ldots, K\}$, \ie the $X_t$ take values from $1$ to $K$.
	\begin{center}
   \begin{tikzpicture}[->]
      \matrix[column sep=2em,ampersand replacement=\&]{
        \cn(X,1) \&  \cn(X,2) \& \cn(X,3) \&  \cn(X,4)  \& \dn(XT) \\};
      \rl(X1,X2) \rl(X2,X3) \rl(X3,X4) \rl(X4,XT)
    \end{tikzpicture}
	\end{center}
%	For the sake of brevity let us assume that $\domA$ is the set of integers from 1 to $K$. 
	Parameterised by transition matrix $\trans \in \reals^{K\times K}$,
%	encoding the distribution of any element of the sequence given previous one,
	\ie $p(\ev(X_{t+1}=i)|\ev(X_t=j))=\trans_{ij}$.
	Assume irreducibility, ergodicity \etc to ensure uniqueness of the 
	stationary distribution $\init$ such that
	$p(\ev(X_t=i))=\init_i$, independent of $t$. The entropy rate as a function of
	$\trans$ is
% $\entrorate:\reals^{K\times K} \to \reals$:
	\[
		\entrorate(\trans) = \sum_{j=1}^K \init_j \sum_{i=1}^K -\trans_{ij} \log \trans_{ij}.
	\]
\end{iframe}
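% Editor's note: a numerical sketch of the entropy-rate formula above, assuming numpy
% and the column convention used here, a[i, j] = p(X_{t+1} = i | X_t = j):
%
%   import numpy as np
%
%   def stationary(a):
%       """Stationary distribution pi satisfying a @ pi = pi (unit-eigenvalue eigenvector)."""
%       w, v = np.linalg.eig(a)
%       pi = np.real(v[:, np.argmin(np.abs(w - 1))])
%       return pi / pi.sum()
%
%   def entropy_rate(a):
%       pi = stationary(a)
%       col_entropies = -np.sum(a * np.log2(np.where(a > 0, a, 1)), axis=0)  # H(X_{t+1} | X_t = j)
%       return float(col_entropies @ pi)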

\begin{iframe}[Markov chains\nicedot  PIR]
	The predictive information rate for first-order chains comes out, in terms of the entropy rate
	function, as 
	\[
		b_\mu = h(a^2) - h(a),
	\]
	where $a^2$ is the \emph{two-step} transition matrix. 

	\uncover<2->{
	This can be generalised to higher-order chains:
	\[
		b_\mu = h(\hat{a}^{N+1}) - Nh(\hat{a}),
	\]
	where $N$ is the order of the chain and $\hat{a}$ is a sparse
	$K^N\times K^N$ transition matrix over the product state space of $N$
	consecutive observations (step size 1).
	}
\end{iframe}
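% Editor's note: a sketch of the first-order PIR formula b_mu = h(a^2) - h(a), reusing
% the entropy_rate() sketch from the previous frame's comment (assumes numpy and a
% column-stochastic transition matrix):
%
%   def pir(a):
%       """Predictive information rate of a first-order Markov chain, in bits per symbol."""
%       return entropy_rate(a @ a) - entropy_rate(a)
%
%   a = np.array([[0.9, 0.5],     # hypothetical 2-state chain, columns sum to 1
%                 [0.1, 0.5]])
%   pir(a)                        # ~0.08 bits: each observation says a little about the future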

\begin{iframe}[Entropy rate and PIR in Markov chains]

	\begin{fig}{artseq}
		\hangbox{\colfig[0.40]{matbase/fig8515}}%
		\quad
		\hangbox{%
			\begin{tabular}{cc}%
				\colfig[0.18]{matbase/fig1356} &
				\colfig[0.18]{matbase/fig45647} \\
				\colfig[0.18]{matbase/fig49938} &
				\colfig[0.18]{matbase/fig23355}%
			\end{tabular}%
		}%
%			\end{hanging}\\
	\end{fig}
	For a given $K$, the entropy rate varies between 0 (deterministic sequence)
	and $\log K$ (when $\trans_{ij}=1/K$ for all $i,j$).
	The space of transition matrices is explored by generating
	them at random and plotting entropy rate against PIR. (Note the inverted
	`U' relationship.) %Transmat (d) is almost uniform.
\end{iframe}

\begin{iframe}[Samples from processes with different PIR]
	\begin{figure}
		\colfig[0.75]{matbase/fig847}\\
		\colfig[0.75]{matbase/fig61989}\\
		\colfig[0.75]{matbase/fig43415}\\
		\colfig[0.75]{matbase/fig50385}
	\end{figure}
	Sequence (a) is a repetition
	of state 4 (see transition matrix (a) on the previous slide).
	System (b) has the highest PIR.
\end{iframe}

%				\begin{tabular}{rl}
%					(a) & \raisebox{-1em}{\colfig[0.58]{matbase/fig9048}}\\[1em]
%					(b) & \raisebox{-1em}{\colfig[0.58]{matbase/fig58845}}\\[1em]
%					(c) & \raisebox{-1em}{\colfig[0.58]{matbase/fig45019}}\\[1em]
%					(d) & \raisebox{-1em}{\colfig[0.58]{matbase/fig1511}}
%				\end{tabular}

\section{Application: The Melody Triangle}
\begin{iframe}[Complexity and interestingness: the Wundt Curve]
	\label{s:Wundt}
		Studies looking into the relationship between stochastic complexity
		(usually measured as entropy or entropy rate) and aesthetic value reveal 
		an inverted `U'-shaped curve \citep{Berlyne71}, also known as the Wundt curve \cite{Wundt1897}.
		Repeated exposure tends to move stimuli leftwards.

		\hangbox{%
			\only<1>{\colfig[0.5]{wundt}}%
			\only<2>{\colfig[0.5]{wundt2}}%
		}\hfill
		\hangbox{\parbox{0.43\linewidth}{\raggedright
		%Too deterministic $\rightarrow$ predictable, boring like a monotone;\\
		%Too random $\rightarrow$ are boring like white noise: unstructured,
		%featureless, uniform.
		Explanations for this usually appeal to a need for a `balance'
		between order and chaos, unity and diversity, and so on, in a generally
		imprecise way.}}


%		Hence, a sequence can be uninteresting in two opposite ways: by
%		being utterly predictable \emph{or} by being utterly
%		unpredictable.
%		Meyer \cite{Meyer2004} suggests something similar:
%		hints at the same thing while discussing 
%		the relation between the rate of information flow and aesthetic experience, 
%		suggesting that
%%		`unless there is some degree of order, \ldots
%%		there is nothing to be uncertain \emph{about} \ldots
%		`If the amount of information [by which he means entropy and surprisingness] 
%		is inordinately increased, the result is a kind of cognitive white noise.'

\end{iframe}

\begin{iframe}[PIR as a measure of cognitive activity]

		The predictive information rate incorporates a similar balance automatically:
		it is maximal for sequences which are neither deterministic nor 
		totally uncorrelated across time. 
		
		\vspace{1em}
		\begin{tabular}{rr}%
			\raisebox{0.5em}{too predictable:} &
			\only<1>{\noderow(black,un0,un0,un0,un1,un1)}%
			\only<2>{\noderow(black,black,un0,un0,un0,un1)}%
			\only<3>{\noderow(black,black,black,un0,un0,un0)}%
			\only<4>{\noderow(black,black,black,black,un0,un0)}%
		\\[1.2em]
			\raisebox{0.5em}{intermediate:} &
			\only<1>{\noderow(black,un1,un2,un3,un4,un5)}%
			\only<2>{\noderow(black,black,un1,un2,un3,un4)}%
			\only<3>{\noderow(black,black,black,un1,un2,un3)}%
			\only<4>{\noderow(black,black,black,black,un1,un2)}%
		\\[1.2em]
			\raisebox{0.5em}{too random:} &
			\only<1>{\noderow(black,un5,un5,un5,un5,un5)}%
			\only<2>{\noderow(black,black,un5,un5,un5,un5)}%
			\only<3>{\noderow(black,black,black,un5,un5,un5)}%
			\only<4>{\noderow(black,black,black,black,un5,un5)}%
		\end{tabular}
		\vspace{1em}

		(Black: \emph{observed}; red: \emph{unobserved}; paler: \emph{greater uncertainty}.)
		Our interpretation:
%		when each event appears to carry no new information about the unknown future,
%		it is `meaningless' and not worth attending to. 
		Things are `interesting' or at least `salient' when each new part supplies new information about parts to come.

%		Quantitative information dynamics will enable us to test this experimentally with human 
%		subjects.
\end{iframe}

\begin{iframe}[The Melody Triangle\nicedot Information space]
 \begin{figure}
	\colfig[0.75]{mtriscat}
	\end{figure}
	Population of transition matrices in 3D space of $h_\mu$, $\rho_\mu$ and $b_\mu$. 
%	Concentrations of points along redundancy axis correspond to roughly periodic patterns.
	Colour of each point
	represents PIR.
	%---highest values found at intermediate entropy and redundancy. 
	The shape is mostly (though not completely) hollow inside, forming roughly 
	a curved triangular sheet.
\end{iframe}

\begin{iframe}[The Melody Triangle\nicedot User interface]
 \begin{figure}
	\colfig[0.55]{TheTriangle.pdf}
	\end{figure}
	Allows the user to place tokens in the triangle,
	each triggering the sonification of a Markov chain with the corresponding information
	`coordinate'. 
\end{iframe}

\begin{iframe}[Subjective information]
	So far we've assumed that the sequence is actually sampled 
	from a stationary Markov chain with a transition matrix known
	to the observer.
	This means time averages of IPI and surprise should equal their
	expectations.

	\uncover<2->{
	What if the sequence is sampled from some other Markov chain, 
	or is produced by some unknown process?
	}
	
	\begin{itemize}
		\item<3->
		In general, it may be impossible to identify any `true' model. There
		are no `objective' probabilities; only subjective ones, as
		argued by de Finetti \cite{deFinetti}.
		

		\item<4->
		If the sequence \emph{is} sampled from some Markov chain, we can
		compute time averages of the observer's subjective surprise 
		and predictive information, and also track what happens if the observer gradually learns 
		the transition matrix from the data.
	\end{itemize}
\end{iframe}


\begin{iframe}[Effect of learning on information dynamics]
	\begin{figure}
%		\colfig{matbase/fig42687} % too small text
%		\colfig{matbase/fig60379} % 9*19 too tall
%		\colfig{matbase/fig52515} % 9*20 ok, perhaps text still too small
		\colfig[0.9]{matbase/fig30461} % 8*19  ok
%		\colfig{matbase/fig66022} % 8.5*19  ok
	\end{figure}
%	Upper row shows actual stochastic learning,
%	lower shows the idealised deterministic learning.
	\textbf{(a/b/e/f)}: multiple runs starting from same 
	initial condition but using different generative transition matrices.
	\textbf{(c/d/g/h)}: multiple runs starting from different
	initial conditions and converging on transition matrices 
		with (c/g) high and (d/h) low PIR.
\end{iframe}


\section{More process models}
\begin{iframe}[Exchangeable sequences and parametric models]
	De Finetti's theorem says that an exchangeable random process can be represented
	as a sequence of variables which are iid \emph{given} some hidden probability
	distribution, which we can think of as a parameterised model:
	\begin{tabular}{lp{0.45\linewidth}}
		\hangbox{\begin{tikzpicture}
			[>=stealth',var/.style={circle,draw,inner sep=1pt,text height=10pt,text depth=4pt}]
			\matrix[ampersand replacement=\&,matrix of math nodes,row sep=2em,column sep=1.8em,minimum size=17pt] {
				\& |(theta) [var]| \Theta \\
				|(x1) [var]| X_1 \& |(x2) [var]| X_2 \& |(x3) [var]| X_3 \&
				|(etc) [outer sep=2pt]| \dots \\
			};
			\foreach \n in {x1,x2,x3,etc} \draw[->] (theta)--(\n);
		\end{tikzpicture}}
		&
			\raggedright 
			\uncover<2->{The observer's belief state at time $t$ includes a probability distribution 
			over the parameters $p(\ev(\Theta=\theta)|\ev(\past{X}_t=\past{x}_t))$.}
	\end{tabular}\\[1em]	
	\uncover<3->{
	Each observation causes revision of belief state
	and hence supplies information 
	$
		I(\ev(X_t=x_t)\to\Theta|\ev(\past{X}_t=\past{x}_t)) 
	%		= D( p_{\Theta|\ev(X_t=x_t),\ev(\past{X}_t=\past{x}_t)} || p_{\Theta|\ev(\past{X}_t=\past{x}_t)} ).
	$ about $\Theta$.
	In previous work we called this the `model information rate'.
	}
	\uncover<4->{(Same as Haussler and Opper's \cite{HausslerOpper1995} IIG or 
	Itti and Baldi's \cite{IttiBaldi2005} Bayesian surprise.)}
\end{iframe}

		\def\circ{circle (9)}%
		\def\bs(#1,#2,#3){(barycentric cs:p1=#1,p2=#2,p3=#3)}%
\begin{iframe}[IIG equals IPI in (some) XRPs]
	\begin{tabular}{@{}lc}
		\parbox[c]{0.5\linewidth}{\raggedright
		Mild assumptions yield a relationship between IIG (instantaneous information gain) and IPI.
		(Everything here implicitly conditioned on $\past{X}_t$).}
	&
		\pgfsetxvec{\pgfpoint{1mm}{0mm}}%
		\pgfsetyvec{\pgfpoint{0mm}{1mm}}%
		\begin{tikzpicture}[baseline=0pt]
			\coordinate (p1) at (90:6);
			\coordinate (p2) at (210:6);
			\coordinate (p3) at (330:6);
			\only<4->{%
				\begin{scope}
					\foreach \p in {p1,p2,p3} \clip (\p) \circ;
					\fill[lightgray] (-10,-10) rectangle (10,10);
				\end{scope}
				\path	(0,0) node {$\mathcal{I}_t$};}
			\foreach \p in {p1,p2,p3} \draw (\p) \circ;
			\path (p2) +(210:13) node {$X_t$}
						(p3) +(330:13) node {$\fut{X}_t$}
					(p1) +(140:12) node {$\Theta$};
			\only<2->{\path	\bs(-0.25,0.5,0.5) node {$0$};}
			\only<3->{\path	\bs(0.5,0.5,-0.25) node {$0$};}
		\end{tikzpicture}
	\end{tabular}\\
	\begin{enumerate}
			\uncover<2->{\item	$X_t \perp \fut{X}_t | \Theta$: observations iid given $\Theta$ for XRPs;}
			\uncover<3->{\item $\Theta \perp X_t | \fut{X}_t$:
%		$I(X_t;\fut{X}_t|\Theta_t)=0$ due to the conditional independence of
%		observables given the parameters $\Theta_t$, and 
%		$I(\Theta_t;X_t|\fut{X}_t)=0$
		assumption that $X_t$ adds no new information about $\Theta$
		given infinitely long sequence $\fut{X}_t =X_{t+1:\infty}$.}
\end{enumerate}
\uncover<4->{Hence, $I(X_t;\Theta_t|\past{X}_t)=I(X_t;\fut{X}_t|\past{X}_t) = \mathcal{I}_t$.\\}
\uncover<5->{Can drop assumption 1 and still get $I(X_t;\Theta_t|\past{X}_t)$ as an additive component (lower bound) of $\mathcal{I}_t$.}
\end{iframe}

\def\fid#1{#1}
\def\specint#1{\frac{1}{2\pi}\int_{-\pi}^\pi #1{S(\omega)} \dd \omega}
\begin{iframe}[Discrete-time Gaussian processes]
	Information-theoretic quantities used earlier have analogues for continuous-valued
	random variables.  For stationary Gaussian processes, we can obtain results in
	terms of the power spectral density $S(\omega)$ (which for discrete time is periodic
	in $\omega$ with period $2\pi$). Standard methods give
	\begin{align*}
		H(X_t) &= \frac{1}{2}\left( \log 2\pi e + \log \specint{}\right), \\
		h_\mu &= \frac{1}{2} \left( \log 2\pi e  + \specint{\log} \right), \\
		\rho_\mu &= \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
	\end{align*}
	Entropy rate is also known as Kolmogorov-Sinai entropy. 
%	$H(X_t)$ is a function of marginal variance which is just the total power in the spectrum.
\end{iframe}

\begin{iframe}[PIR/Multi-information duality]
	Analysis yields the PIR:
	\[
		b_\mu = \frac{1}{2} \left( \log \specint{\frac{1}} - \specint{\log\frac{1}} \right).
	\]
	This yields a simple expression for finite-order autoregressive processes, but beware: it can diverge
	for moving-average processes!

	\uncover<2->{
	Compare with multi-information rate:
	\[
		\rho_\mu = \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
	\]
	This yields a simple expression for finite-order moving-average processes, but can diverge
	for marginally stable autoregressive processes.
	}

	\uncover<3->{
		Infinities are troublesome and point to a problem with the notion of infinitely
		precise observation of continuous-valued variables.
	}
\end{iframe}
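% Editor's note: a numerical sketch of the spectral formulas on the last two slides,
% assuming numpy and an AR(1) example x_t = 0.8 x_{t-1} + e_t with unit-variance noise,
% whose PSD is S(w) = 1 / |1 - 0.8 exp(-iw)|^2 (all quantities in nats):
%
%   import numpy as np
%
%   w = np.linspace(-np.pi, np.pi, 20001)
%   S = 1.0 / np.abs(1 - 0.8 * np.exp(-1j * w))**2
%
%   def spectral_mean(f):   # (1/2pi) * integral over [-pi, pi] of the array f sampled on w
%       return np.trapz(f, w) / (2 * np.pi)
%
%   h   = 0.5 * (np.log(2 * np.pi * np.e) + spectral_mean(np.log(S)))      # entropy rate
%   rho = 0.5 * (np.log(spectral_mean(S)) - spectral_mean(np.log(S)))      # multi-information rate
%   b   = 0.5 * (np.log(spectral_mean(1/S)) - spectral_mean(np.log(1/S)))  # PIR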

%		Information gained about model parameters (measured as the KL divergence
%		between prior and posterior distributions) is equivalent
%		to \textbf{Itti and Baldi's `Bayesian surprise'} \cite{IttiBaldi2005}.


	\section{Application: Analysis of minimalist music}
	\label{s:Experiments}

\begin{iframe}[Material and Methods]

%		Returning to our original goal of modelling the perception of temporal structure
%		in music, we computed dynamic information measures for 
		We took two pieces of minimalist 
		music by Philip Glass, \emph{Two Pages} (1969) and \emph{Gradus} (1968).
		Both are monophonic and isochronous, so they can be represented very simply as 
		a sequence of symbols (notes), one symbol per beat,
		yet they remain ecologically valid examples of `real' music. 

		We use an elaboration of the Markov chain model---not necessarily 
		a good model \latin{per se}, but that wasn't the point of the experiment.
		The Markov chain model was chosen because it is tractable from an information 
		dynamics point of view while not being completely trivial.
\end{iframe}

\begin{iframe}[Time-varying transition matrix model]
		We allow the transition matrix to vary slowly with time to track 
		changes in the sequence structure.
		Hence, the observer's belief state includes a probability
		distribution over transition matrices; we choose a product of
		Dirichlet distributions:
		\[
			\textstyle
			p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j}),
		\]
		where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is a
		$K \times K$ parameter matrix.
%		(Dirichlet, being conjugate to discrete/multinomial distribution,
%		makes processing of observations particularly simple.)
%		such that $\param_{:j}$ is the 
%		parameter tuple for the $K$-component Dirichlet distribution $p_\mathrm{Dir}$.
%		\begin{equation}
%			\textstyle
%			p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j})
%			   = \prod_{j=1}^K (\prod_{i=1}^K \trans_{ij}^{\param_{ij}-1}) / B(\param_{:j}),
%		\end{equation}
%		where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is an
%		$K \times K$ matrix of parameters.

		At each time step, the distribution first \emph{spreads} under the mapping
		\[
			\param_{ij} \mapsto \frac{\beta\param_{ij}}{(\beta + \param_{ij})}
		\]
		to model the possibility that the transition matrix
		has changed ($\beta=2500$ in our experiments). Then it \emph{contracts}
		as the new observation provides fresh evidence about the transition matrix.
%
%		Each observed symbol % provides fresh evidence about current transition matrix, 
%		enables observer to update its belief state.
\end{iframe}
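% Editor's note: a sketch of one belief-state update in the model above, assuming numpy;
% theta is the K x K Dirichlet parameter matrix (one Dirichlet per column of the
% transition matrix), and a standard conjugate count update is assumed for the
% `contraction' step:
%
%   import numpy as np
%
%   def spread(theta, beta=2500.0):
%       """Discount old evidence elementwise, theta -> beta*theta/(beta + theta)."""
%       return beta * theta / (beta + theta)
%
%   def observe(theta, prev, curr):
%       """Contract: the observed transition prev -> curr adds one Dirichlet count."""
%       theta = theta.copy()
%       theta[curr, prev] += 1.0
%       return theta
%
%   theta = np.ones((4, 4))                            # uniform initial belief, K = 4
%   theta = observe(spread(theta), prev=2, curr=3)     # one time step of the observer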


\begin{iframe}[Two Pages\nicedot Results]

%		\begin{fig}{twopages}
			\begin{tabular}{c@{\hspace{1.5ex}}l}%
%			\hspace*{-1.5em}
%				\hangbox{\colfig[0.5]{matbase/fig20304}} % 3 plots
%				\hangbox{\colfig[0.52]{matbase/fig39528}} % 4 plots with means
%				\hangbox{\colfig[0.52]{matbase/fig63538}} % two pages, 5 plots
%				\hangbox{\colfig[0.52]{matbase/fig53706}} % two pages, 5 plots
				\hangbox{\colfig[0.72]{matbase/fig33309}} % two pages, 5 plots
			&
				\hangbox{%
					\parbox{0.28\linewidth}{
						\raggedright
						\textbf{Thick lines:} part boundaries as indicated 
						by Glass; \textbf{grey lines (top four panels):} changes in the melodic `figures';
					%	of which the piece is constructed. 
						\textbf{grey lines (bottom panel):}
						the six most surprising moments chosen by an expert listener. 
					}
				}
			\end{tabular}
%		\end{fig}
\end{iframe}

\begin{iframe}[Two Pages\nicedot Rule based analysis]
	\begin{figure}
		\colfig[0.98]{matbase/fig13377}
%		\hangbox{\colfig[0.98]{matbase/fig13377}}
	\end{figure}
	Analysis of \emph{Two Pages} using (top) Cambouropoulos' 
	Local Boundary Detection Model (LBDM) and 
	(bottom) Lerdahl and Jackendoff's 
	grouping preference rule 3a (GPR3a), which is a function of pitch proximity.
	Both analyses indicate `boundary strength'.
\end{iframe}

\begin{iframe}[Two Pages\nicedot Discussion]
		The correspondence between the information
		measures and the structure of the piece is quite close.
		There is good agreement between the six `most surprising
		moments' chosen by the expert listener and the model information signal. 
		
		What appears to be an error in the detection of
		the major part boundary (between events 5000 and 6000) actually
		reflects a known anomaly in the score, where Glass places the boundary several events
		before there is any change in the pattern of notes. Alternative analyses of \emph{Two Pages}
		place the boundary in agreement with the peak in our surprisingness signal.
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Results]

%		\begin{fig}{gradus}
			\begin{tabular}{c@{\hspace{1.5ex}}l}
%				&
%				\hangbox{\colfig[0.4]{matbase/fig81812}}
%				\hangbox{\colfig[0.52]{matbase/fig23177}} % two pages, 5 plots
%				\hangbox{\colfig[0.495]{matbase/fig50709}} % Fudged segmentation
%				\hangbox{\colfig[0.495]{matbase/fig3124}} % Geraint's segmentation
				\hangbox{\colfig[0.715]{matbase/fig11808}} % Geraint's segmentation, corrected
			&
%				\hangbox{\colfig[0.5]{matbase/fig39914}} 
				\hangbox{%
					\parbox{0.28\linewidth}{
						\raggedright
						\textbf{Thick lines:} part boundaries as indicated 
						by the composer.
						\textbf{Grey lines:} segmentation by expert listener.

						Note: traces smoothed with Gaussian
						window about 16 events wide. 
					}
				}
			\end{tabular}
%		\end{fig}
\end{iframe}

\begin{iframe}[Gradus\nicedot Rule based analysis]
	\begin{figure}
		\colfig[0.98]{matbase/fig58691}
	\end{figure}
	Boundary strength analysis of \emph{Gradus} using (top) Cambouropoulos' 
	\cite{CambouropoulosPhD} Local Boundary Detection Model  and 
	(bottom) Lerdahl and Jackendoff's \cite{LerdahlJackendoff83}
	grouping preference rule 3a.
\end{iframe}
}
\begin{iframe}[Gradus\nicedot Metrical analysis]
	\begin{figure}
		\begin{tabular}{cc}
			\colfig[0.40]{matbase/fig56807} & \colfig[0.41]{matbase/fig27144} \\
			\colfig[0.40]{matbase/fig87574} & \colfig[0.41]{matbase/fig13651} \\
			\hspace*{1ex}\colfig[0.39]{matbase/fig19913} & \hspace*{1ex}\colfig[0.40]{matbase/fig66144}
		\end{tabular}
	\end{figure}
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Discussion]
		
		\emph{Gradus} is much less systematically structured than \emph{Two Pages}, and
		relies more on the conventions of tonal music, which are not represented in the model.

		For example initial transition matrix is uniform, which does not correctly represent 
		prior knowledge about tonal music.

		Information dynamic analysis does not give such a 
		clear picture of the structure; but some of the fine structure can be related
		to specific events in the music (see Pearce and Wiggins 2006).
%		nonetheless, there are some points of correspondence between the analysis and
%		segmentation given by Keith Potter.

\end{iframe}
}

	\section{Application: Beat tracking and rhythm}

	\begin{iframe}[Bayesian beat tracker]
		\uncover<1->{
			Works by maintaining a probabilistic belief state about the time of the next
			beat and the current tempo.
			
			\begin{figure}
				\colfig{beat_prior}
			\end{figure}
			}

		\uncover<2->{	
			Receives categorised drum events (kick or snare) from an audio analysis front-end.
			}

	\end{iframe}

	\begin{iframe}[Information gain in the beat tracker]
		\begin{tabular}{ll}
			\parbox[t]{0.43\linewidth}{\raggedright
			\uncover<1->{
				Each event triggers a change in belief state, so we can compute
				information gain about beat parameters.}\\[1em]

			\uncover<2->{
				The relationship between IIG and IPI
				means we can treat IIG as a proxy for IPI.}
				}
			&
			\hangbox{\colfig[0.55]{beat_info}}
		\end{tabular}
	\end{iframe}

	\begin{iframe}[Analysis of drum patterns]
		We analysed 17 recordings of drummers, playing either solo or with a band.
		All patterns were in 4/4.
		\begin{itemize}
			\item
			\uncover<1->{
				Information tends to arrive at beat times: a consequence of the structure of the model.
			}
			\item
			\uncover<2->{
				Lots of information seems to arrive after drum fills and breaks
				as the drummer reestablishes the beat.
			}
			\item
			\uncover<3->{
				No consistent pattern of information arrival in relation to metrical
				structure, so no obvious metrical structure in the micro-timing of events.
				However, it is still possible that metrical structure might emerge from a predictive
				analysis of the drum pattern.
				}
		\end{itemize}
	\end{iframe}

	\section{Summary and conclusions}
	\label{s:Conclusions}

	\begin{iframe}[Summary]

		\begin{itemize}
		\item Dynamic, observer-centric information theory.
		\item Applicable to any dynamic probabilistic model.
		\item PIR potentially a measure of complexity.
		\item Simple analysis for Markov chains and Gaussian processes.
		\item Applications in music analysis and composition.
		\item Search for neural correlates is ongoing (that's another talk\ldots).
		\end{itemize}
		Thanks!
	\end{iframe}

	\begin{bframe}[Bibliography]
		\bibliographystyle{alpha}
		{\small \bibliography{all,c4dm,compsci}}
	\end{bframe}
\end{document}