\documentclass{beamer}

\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{multimedia}
\usepackage{tikz}
\usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{tools}
%\usepackage{amsfonts,amssymb}

\tikzset{every picture/.style=semithick}

%%% font options:
% atypewri, frankgth, gillsans, centuryg, futura, eurostil
%\usepackage{fourier} % Maths in serif Utopia
\usepackage[sf]{frankgth}
%\usepackage[sf]{optima}

%%% Monospace font
%\usepackage[scaled=0.88]{ulgothic} % 0.88 % suits narrow faces
\renewcommand{\ttdefault}{plg} % Adobe Letter Gothic - suits light medium width face
%\renewcommand{\ttdefault}{pcr} % Courier - suits wide faces
% remember to match up size and weight of monospace font to main font

\newcommand{\mytt}[1]{{\texttt{\footnotesize\fontseries{bx}\selectfont #1}}}

\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}


%%% Black on white
\definecolor{base}{rgb}{0,0,0}
\definecolor{comp}{named}{green}
\definecolor{paper}{named}{white}

\logo{%
\includegraphics[height=16pt]{qmul-black}\hspace*{45pt}%
\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-black-white}}%
}

%%% Red on black
\comment{
\definecolor{base}{rgb}{1,0,0}
\definecolor{comp}{rgb}{0,0.8,0.2}
\definecolor{paper}{named}{black}

\logo{%
\includegraphics[height=16pt]{qmul-red}\hspace*{45pt}%
\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-red-black}}%
}
}


\useinnertheme{default}%circles
\useoutertheme{default}
\usefonttheme[onlymath]{serif}

\setbeamercolor{normal text}{bg=paper,fg=base!90!-paper}
\setbeamercolor{background}{bg=comp!50!paper,fg=comp}
%\setbeamercolor{structure}{fg=base!75!-paper}
\setbeamercolor{structure}{fg=red!50!base}
\setbeamercolor{palette primary}{bg=yellow!50!paper,fg=yellow}
\setbeamercolor{palette secondary}{bg=orange!50!paper,fg=orange}
\setbeamercolor{palette tertiary}{bg=blue!50!paper,fg=blue}
\setbeamercolor{palette quaternary}{bg=green!50!paper,fg=green}
\setbeamercolor{block body}{bg=base!20!paper}
\setbeamercolor{block title}{bg=base!60!paper,fg=paper}
\setbeamercolor{navigation symbols}{fg=base!90!paper}
\setbeamercolor{separation line}{bg=blue,fg=yellow}
\setbeamercolor{fine separation line}{bg=blue,fg=orange}

% Title page
% \setbeamercolor{title}{bg=base!20!paper}
% \setbeamercolor{subtitle}{bg=base!20!paper}
% \setbeamercolor{title page}{bg=base!40!paper}

% \setbeamercolor{headline}{bg=blue}
% \setbeamercolor{footline}{bg=blue}
% \setbeamercolor{frametitle}{bg=base!30!paper}
% \setbeamercolor{framesubtitle}{bg=base!40!paper}

% \setbeamercolor{section in toc}{bg=base!25!paper,fg=orange}
% \setbeamercolor{section in toc shaded}{bg=base!25!paper,fg=orange!80!paper}
% \setbeamercolor{subsection in toc}{bg=base!25!paper,fg=orange}
% \setbeamercolor{subsection in toc shaded}{bg=yellow!25!paper,fg=orange!80!paper}
% page number in head/foot
% section in head/foot
% section in head/foot shaded


\setbeamerfont{structure}{series=\bfseries}
\setbeamerfont{title}{series=\mdseries,size=\Large}
%\setbeamerfont{title}{series=\ltseries,size=\huge}
\setbeamerfont{date}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{institute}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{author}{size=\footnotesize,series=\bfseries}
\setbeamercolor{bibliography item}{parent={normal text}}
\setbeamercolor{bibliography entry author}{fg=base}
\setbeamercolor{bibliography entry location}{fg=base!70!paper}

%%% Templates

\setbeamertemplate{bibliography item}[text]
\setbeamertemplate{bibliography entry title}{ }
\setbeamertemplate{bibliography entry location}{ }
\setbeamertemplate{blocks}[rounded][shadow=false]
\setbeamertemplate{items}[circle]
%\setbeamertemplate{bibliography item}[triangle]
% \setbeamertemplate{title page}[default][rounded=true,shadow=false]
% \setbeamertemplate{frametitle}[default][rounded=true,shadow=false]
\setbeamertemplate{sidebar right}{}
\setbeamertemplate{footline}{
\hspace*{0.2cm}
\insertlogo
\hfill
\usebeamertemplate***{navigation symbols}%
\hfill
\makebox[6ex]{\hfill\insertframenumber/\inserttotalframenumber}%
\hspace*{0.2cm}

\vskip 4pt
}

\setbeamertemplate{navigation symbols}
{%
\hbox{%
\hbox{\insertslidenavigationsymbol}
\hbox{\insertframenavigationsymbol}
% \hbox{\insertsubsectionnavigationsymbol}
\hbox{\insertsectionnavigationsymbol}
\hbox{\insertdocnavigationsymbol}
% \hbox{\insertbackfindforwardnavigationsymbol}%
}%
}


\AtBeginSection[]{
\begin{iframe}[Outline]
\tableofcontents[currentsection]
\end{iframe}
}
%\linespread{1.1}

\setlength{\parskip}{0.5em}

\newenvironment{bframe}[1][untitled]{\begin{frame}[allowframebreaks]\frametitle{#1}}{\end{frame}}
\newenvironment{iframe}[1][untitled]{\begin{frame}\frametitle{#1}}{\end{frame}}
\newenvironment{isframe}[1][untitled]{\begin{frame}[fragile=singleslide,environment=isframe]\frametitle{#1}}{\end{frame}}

\renewenvironment{fig}[1]
{%
\begin{figure}
\def\fglbl{f:#1}
\let\ocap=\caption
\renewcommand{\caption}[2][]{\ocap[##1]{\small ##2}}
\centering\small
}{%
\label{\fglbl}
\end{figure}
}

\newcommand{\paragraph}[1]{\textbf{#1}\qquad}
\newcommand{\colfig}[2][1]{\includegraphics[width=#1\linewidth]{figs/#2}}%
\let\citep=\cite
%\newcommand{\dotmath}[2]{\psfrag{#1}[Bc][Bc]{\small $#2$}}

\title{Cognitive Music Modelling:\\An Information Dynamics Approach}
\author{Samer Abdallah, Henrik Ekeus, Peter Foster,\\Andrew Robertson and Mark Plumbley}
\institute{Centre for Digital Music\\Queen Mary, University of London}

\date{\today}

\def\X{\mathcal{X}}
\def\Y{\mathcal{Y}}
\def\Past{\mathrm{Past}}
\def\Future{\mathrm{Future}}
\def\Present{\mathrm{Present}}
\def\param{\theta}
\def\trans{a}
\def\init{\pi^{\trans}}
%\def\entrorate(#1){\mathcal{H}(#1)}
%\def\entrorate(#1){\dot{\mathcal{H}}(#1)}
\def\entrorate{h}
\def\emcmarg(#1){b_#1}
\def\mcmarg{\vec{b}}
\def\domS{\mathcal{S}}
\def\domA{\mathcal{A}}

\def\Lxz(#1,#2){\mathcal{L}(#1|#2)}
\def\LXz(#1){\overline{\mathcal{L}}(#1)}
\def\LxZ(#1){\underline{\mathcal{L}}(#1)}
\def\LXZ{\overline{\underline{\mathcal{L}}}}
\def\Ixz(#1,#2){\mathcal{I}(#1|#2)}
\def\IXz(#1){\overline{\mathcal{I}}(#1)}
\def\IxZ(#1){\underline{\mathcal{I}}(#1)}
\def\IXZ{\overline{\underline{\mathcal{I}}}}

\def\ev(#1=#2){#1\!\!=\!#2}
\def\sev(#1=#2){#1\!=#2}

\def\FE{\mathcal{F}}

\newcommand\past[1]{\overset{\rule{0pt}{0.2em}\smash{\leftarrow}}{#1}}
\newcommand\fut[1]{\overset{\rule{0pt}{0.1em}\smash{\rightarrow}}{#1}}

\def\cn(#1,#2) {\node[circle,draw,inner sep=0.2em] (#1#2) {${#1}_{#2}$};}
\def\dn(#1) {\node[circle,inner sep=0.2em] (#1) {$\cdots$};}
\def\rl(#1,#2) {\draw (#1) -- (#2);}

\definecolor{un0}{rgb}{0.5,0.0,0.0}
\definecolor{un1}{rgb}{0.6,0.15,0.15}
\definecolor{un2}{rgb}{0.7,0.3,0.3}
\definecolor{un3}{rgb}{0.8,0.45,0.45}
\definecolor{un4}{rgb}{0.9,0.6,0.6}
\definecolor{un5}{rgb}{1.0,0.75,0.75}

%\def\blob(#1){\node[circle,draw,fill=#1,inner sep=0.25em]{};}
\def\bl(#1){\draw[circle,fill=#1] (0,0) circle (0.4em);}
\def\noderow(#1,#2,#3,#4,#5,#6){%
\tikz{\matrix[draw,rounded corners,inner sep=0.4em,column sep=2.1em,ampersand replacement=\&]{%
\bl(#1)\&\bl(#2)\&\bl(#3)\&\bl(#4)\&\bl(#5)\&\bl(#6)\\};}}

\begin{document}
\frame{\titlepage}
\section[Outline]{}
\frame{
\frametitle{Outline}
\tableofcontents
}



\section{Expectation and surprise in music}
\label{s:Intro}

\begin{iframe}[`Unfoldingness']
Music is experienced as a
\uncover<2->{phenomenon}
\uncover<3->{that}
\uncover<4->{`unfolds'} \uncover<5->{in}\\
\only<6>{blancmange}%
\only<7>{(just kidding)}%
\uncover<8->{time,}
\uncover<9->{rather than being apprehended as a static object presented in its
entirety.}

\uncover<10->{[This is recognised in computational linguistics, where the phenomenon is known as \emph{incrementality}, \eg in incremental parsing.]}

\uncover<11->{%
Meyer \cite{Meyer67} argued that musical experience depends on
how we change and revise our conceptions \emph{as events happen},
on how expectation and prediction interact with occurrence, and that, to a large
degree, the way to understand the effect of music is to focus on
this `kinetics' of expectation and surprise.%
}
\end{iframe}

\begin{iframe}[Expectation and surprise in music]

Music creates
\emph{expectations} of what is to come next, which may be fulfilled
immediately, after some delay, or not at all.
Suggested by music theorists, \eg
L. B. Meyer \cite{Meyer67} and Narmour \citep{Narmour77}, but also
noted much earlier by Hanslick \cite{Hanslick1854} in the
1850s:
\begin{quote}
\small
`The most important factor in the mental process which accompanies the
act of listening to music, and which converts it to a source of pleasure, is
\ldots
% frequently overlooked. We here refer to
the intellectual satisfaction which the
listener derives from continually following and anticipating the composer's
intentions---now, to see his expectations fulfilled, and now, to find himself
agreeably mistaken. It is a matter of course that this intellectual flux and
reflux, this perpetual giving and receiving takes place unconsciously, and with
the rapidity of lightning-flashes.'
\end{quote}
\end{iframe}

\begin{iframe}[Probabilistic reasoning]
\uncover<1->{%
Making predictions and assessing surprise is
essentially reasoning with degrees of belief, and (arguably)
the best way to do this is using Bayesian probability theory \cite{Cox1946,Jaynes27}.%

[NB. this is \textbf{subjective} probability as advocated by \eg De Finetti and Jaynes.]
}

% Thus, we assume that musical schemata are encoded as probabilistic % \citep{Meyer56} models, and
\uncover<2->{%
We suppose that familiarity with different styles of music takes the form
of various probabilistic models, and that these models are adapted through listening.%
}
% various stylistic norms is encoded as
% using models that encode the statistics of music in general, the particular styles
% of music that seem best to fit the piece we happen to be listening to, and the emerging
% structures peculiar to the current piece.

\uncover<3->{%
Experimental evidence that humans are able to internalise statistical
knowledge about music \citep{SaffranJohnsonAslin1999,EerolaToiviainenKrumhansl2002}; and also
that statistical models are effective for computational analysis of music, \eg \cite{ConklinWitten95,Pearce2005}.%
}

% analysis of music, \eg \cite{ConklinWitten95,PonsfordWigginsMellish1999,Pearce2005}.
% \cite{Ferrand2002}. Dubnov and Assayag PSTs?
\end{iframe}

\begin{iframe}[Music and information theory]
\uncover<1->{
With probabilistic models in hand we can apply quantitative information theory: we can compute entropies,
relative entropies, mutual information, and all that.
}

\uncover<2->{
Lots of interest in the application of information theory to perception, music and aesthetics since the 50s,
\eg Moles \cite{Moles66}, Meyer \cite{Meyer67}, Cohen \cite{Cohen1962}, Berlyne \cite{Berlyne71}.
(See also Bense, Hiller)
}

\uncover<3->{
The idea is that subjective qualities and
states like uncertainty, surprise, complexity, tension, and interestingness
are determined by information-theoretic quantities.
}

\uncover<4->{
Berlyne \cite{Berlyne71} called such quantities `collative variables', since they are
to do with patterns of occurrence rather than medium-specific details.
\emph{Information aesthetics}.
}
% Listeners then experience greater or lesser levels of surprise
% in response to departures from these norms.
% By careful manipulation
% of the material, the composer can thus define, and induce within the
% listener, a temporal programme of varying
% levels of uncertainty, ambiguity and surprise.
\end{iframe}

\begin{iframe}[Probabilistic model-based observer hypothesis]
\begin{itemize}
\item<1->
As we listen, we maintain a probabilistic model that enables
us to make predictions. As events unfold, we revise our probabilistic `belief state',
including predictions about the future.
\item<2->
Probability distributions and changes in distributions are characterised in terms
of information-theoretic measures such as entropy and relative entropy (KL divergence).
\item<3->
The dynamic evolution of these information measures captures significant structure,
\eg events that are surprising, informative, explanatory \etc
\end{itemize}

\end{iframe}

\begin{iframe}[Features of information dynamics]
\uncover<1->{
\textbf{Abstraction}: sensitive mainly to \emph{patterns} of occurrence,
rather than details of which specific things occur or the sensory medium.
% it operates at a level of abstraction removed from the details of the sensory experience and
% the medium through which it was received, suggesting that the same
% approach could, in principle, be used to analyse and compare information
% flow in different temporal media regardless of whether they are auditory, visual or otherwise.
}

\uncover<2->{
\textbf{Generality}: applicable in principle to any probabilistic model, in particular,
models with time-dependent latent variables such as HMMs.
Many important musical concepts like key, harmony, and beat are essentially `hidden variables'.
}

\uncover<3->{
\textbf{Richness}: when applied to models with latent variables, can result in a many-layered
analysis, capturing information flow about harmony, tempo, \etc
}

\uncover<4->{
\textbf{Subjectivity}: all probabilities are \emph{subjective} probabilities relative to the \emph{observer's}
model, which can depend on the observer's capabilities and prior experience.
}
\end{iframe}

\section{Surprise, entropy and information in random sequences}
\label{s:InfoInRandomProcs}

\begin{iframe}[Information theory primer\nicedot Entropy]
Let $X$ be a discrete-valued random variable (in the sense of \emph{subjective} probability).
Entropy is a measure of \emph{uncertainty}. If the observer expects to see $x$ with probability $p(x)$,
then
\begin{align*}
H(X) &= \sum_{x\in\X} - p(x) \log p(x) \\
&= \expect{[-\log p(X)]}.
\end{align*}
Considering $-\log p(x)$ as the `surprisingness' of $x$, the entropy is the `expected surprisingness':
high for spread-out distributions and low for concentrated ones.
\end{iframe}

\begin{iframe}[Information theory primer\nicedot Relative entropy]
Relative entropy or Kullback-Leibler (KL) divergence quantifies the difference between
probability distributions.
If the observer receives data $\mathcal{D}$, the divergence between the (subjective) prior and
posterior distributions is the
amount of information in $\mathcal{D}$ \emph{about} $X$ for this observer:
\[
I(\mathcal{D}\to X) =
D(p_{X|\mathcal{D}} || p_X)
= \sum_{x\in\X} p(x|\mathcal{D}) \log \frac{p(x|\mathcal{D})}{p(x)}.
\]
If observing $\mathcal{D}$ causes a large change in belief about $X$, then $\mathcal{D}$
contained a lot of information about $X$.

Like Lindley's (1956) information (thanks Lars!).
\end{iframe}

\begin{iframe}[Information theory primer\nicedot Mutual information]
Mutual information (MI) between $X_1$ and $X_2$ is the expected amount of information about
$X_2$ in an observation of $X_1$. Can be written in several ways:
\begin{align*}
I(X_1;X_2) &= \sum_{x_1,x_2} p(x_1,x_2) \log \frac{p(x_1,x_2)}{p(x_1)p(x_2)} \\
&= H(X_1) + H(X_2) - H(X_1,X_2) \\
&= H(X_2) - H(X_2|X_1).
\end{align*}
(1) Expected information about $X_2$ in an observation of $X_1$;\\
(2) Expected reduction in uncertainty about $X_2$ after observing $X_1$;\\
(3) Symmetric: $I(X_1;X_2) = I(X_2;X_1)$.
\end{iframe}

\begin{iframe}[Information theory primer\nicedot Conditional MI]
Information in one variable about another given observations of some third variable.
Formulated analogously by adding conditioning variables to the entropies:
\begin{align*}
I(X_1;X_2|X_3) &= H(X_1|X_3) - H(X_1|X_2,X_3).
\end{align*}
Makes explicit the dependence of information assessment on background knowledge,
represented by conditioning variables.
\end{iframe}


\begin{isframe}[Information theory primer\nicedot I-Diagrams]
\newcommand\rad{2.2em}%
\newcommand\circo{circle (3.4em)}%
\newcommand\labrad{4.3em}
\newcommand\bound{(-6em,-5em) rectangle (6em,6em)}
\newcommand\clipin[1]{\clip (#1) \circo;}%
\newcommand\clipout[1]{\clip \bound (#1) \circo;}%
\newcommand\cliptwo[3]{%
\begin{scope}
\clipin{#1};
\clipin{#2};
\clipout{#3};
\fill[black!30] \bound;
\end{scope}
}%
\newcommand\clipone[3]{%
\begin{scope}
\clipin{#1};
\clipout{#2};
\clipout{#3};
\fill[black!15] \bound;
\end{scope}
}%
Information diagrams are a Venn diagram-like representation of entropies and mutual
informations for a set of random variables.
\begin{center}
\begin{tabular}{c@{\ }c}
\scalebox{0.8}{%
\begin{tikzpicture}[baseline=0pt]
\coordinate (p1) at (90:\rad);
\coordinate (p2) at (210:\rad);
\coordinate (p3) at (-30:\rad);
\clipone{p1}{p2}{p3};
\clipone{p2}{p3}{p1};
\clipone{p3}{p1}{p2};
\cliptwo{p1}{p2}{p3};
\cliptwo{p2}{p3}{p1};
\cliptwo{p3}{p1}{p2};
\begin{scope}
\clip (p1) \circo;
\clip (p2) \circo;
\clip (p3) \circo;
\fill[black!45] \bound;
\end{scope}
\draw (p1) \circo;
\draw (p2) \circo;
\draw (p3) \circo;
\path
(barycentric cs:p3=1,p1=-0.2,p2=-0.1) +(0ex,0) node {$I_{3|12}$}
(barycentric cs:p1=1,p2=-0.2,p3=-0.1) +(0ex,0) node {$I_{1|23}$}
(barycentric cs:p2=1,p3=-0.2,p1=-0.1) +(0ex,0) node {$I_{2|13}$}
(barycentric cs:p3=1,p2=1,p1=-0.55) +(0ex,0) node {$I_{23|1}$}
(barycentric cs:p1=1,p3=1,p2=-0.55) +(0ex,0) node {$I_{13|2}$}
(barycentric cs:p2=1,p1=1,p3=-0.55) +(0ex,0) node {$I_{12|3}$}
(barycentric cs:p3=1,p2=1,p1=1) node {$I_{123}$}
;
\path
(p1) +(140:\labrad) node {$X_1$}
(p2) +(-140:\labrad) node {$X_2$}
(p3) +(-40:\labrad) node {$X_3$};
\end{tikzpicture}%
}
&
\parbox{0.5\linewidth}{
\small
\begin{align*}
I_{1|23} &= H(X_1|X_2,X_3) \\
I_{13|2} &= I(X_1;X_3|X_2) \\
I_{1|23} + I_{13|2} &= H(X_1|X_2) \\
I_{12|3} + I_{123} &= I(X_1;X_2)
\end{align*}
}
\end{tabular}
\end{center}
The areas of
the three circles represent $H(X_1)$, $H(X_2)$ and $H(X_3)$ respectively.
The total shaded area is the joint entropy $H(X_1,X_2,X_3)$.
Each undivided region is an \emph{atom} of the I-diagram.
\end{isframe}




\begin{isframe}[Information theory in sequences]
\def\bx{1.6em}%
\def\cn(#1,#2) {\node[circle,draw,fill=white,inner sep=0.2em] at(#1) {$#2$};}%
\def\dn(#1){\node[circle,inner sep=0.2em] at(#1) {$\cdots$};}%
\def\en(#1){coordinate(#1)}%
\def\tb{++(3.8em,0)}%
\def\lb(#1)#2{\path (#1)+(0,\bx) node[anchor=south] {#2};}
\def\nr(#1,#2,#3){\draw[rounded corners,fill=#3] (#1) rectangle (#2);}%

Consider an observer receiving elements of a random sequence
$(\ldots, X_{-1}, X_0, X_1, X_2, \ldots)$, so that at any time $t$ there is
a `present' $X_t$, an observed past $\past{X}_t$, and an unobserved future
$\fut{X}_t$.
E.g., at time $t=3$:

\begin{figure}
\begin{tikzpicture}%[baseline=-1em]
\path (0,0) \en(X0) \tb \en(X1) \tb \en(X2) \tb \en(X3) \tb \en(X4) \tb \en(X5) \tb \en(X6);
\path (X0)+(-\bx,-\bx) \en(p1) (X2)+(\bx,\bx) \en(p2)
(X3)+(-\bx,-\bx) \en(p3) (X3)+(\bx,\bx) \en(p4)
(X4)+(-\bx,-\bx) \en(p5) (X6)+(\bx,\bx) \en(p6);
\nr(p1,p2,un3) \nr(p3,p4,un4) \nr(p5,p6,un5)
\dn(X0) \cn(X1,X_1) \cn(X2,X_2) \cn(X3,X_3) \cn(X4,X_4) \cn(X5,X_5) \dn(X6)
\lb(X1){Past: $\past{X}_3$}
\lb(X5){Future: $\fut{X}_3$}
\lb(X3){Present}
\end{tikzpicture}%}%
\end{figure}
Consider how the observer's belief state evolves when, having observed up to
$X_2$, it learns the value of $X_3$.
\end{isframe}

\begin{iframe}[`Surprise' based quantities]
To obtain the first set of measures, we ignore the future $\fut{X}_t$
and consider the probability distribution for $X_t$ given the
observed past $\past{X}_t=\past{x}_t$.

\begin{enumerate}
\item<1->
\textbf{Surprisingness}: negative log-probability
$\ell_t = -\log p(x_t|\past{x}_t)$.

\item<2->
Expected surprisingness given the context $\past{X}_t=\past{x}_t$ is the entropy of the predictive distribution,
$H(X_t|\ev(\past{X}_t=\past{x}_t))$: uncertainty about $X_t$ before the observation is made.

\item<3->
Expectation over all possible realisations of the process is the conditional entropy
$H(X_t|\past{X}_t)$ according to the observer's model. For a stationary process, this is the
\emph{entropy rate} $h_\mu$.
\end{enumerate}
\end{iframe}

\begin{iframe}[Predictive information]
The second set of measures is based on the amount of information the observation $\ev(X_t=x_t)$
carries \emph{about} the unobserved future $\fut{X}_t$, \emph{given} that we already
know the past $\ev(\past{X}_t=\past{x}_t)$:
\begin{equation*}
\mathcal{I}_t = I(\ev(X_t=x_t)\to\fut{X}_t|\ev(\past{X}_t=\past{x}_t)).
\end{equation*}
This is the KL divergence between beliefs about the future $\fut{X}_t$ prior and posterior
to the observation $\ev(X_t=x_t)$.
Hence, for continuous-valued variables, it is invariant to invertible
transformations of the observation spaces.
\end{iframe}

\begin{iframe}[Predictive information based quantities]
\begin{enumerate}
\item<1->
\emph{Instantaneous predictive information} (IPI) is just $\mathcal{I}_t$.

% Expectations over $X|\ev(Z=z)$, $Z|\ev(X=x)$, and $(X,Z)$ give 3 more information measures:
\item<2->
Expectation of $\mathcal{I}_t$ before the observation at time $t$ is
$I(X_t;\fut{X}_t | \ev(\past{X}_t=\past{x}_t))$: mutual information conditioned on the
observed past. This is the amount of new information about the future expected from the next observation.
Useful for directing attention towards the next event even before it happens?

% This is different from Itti and Baldi's proposal that Bayesian
% \emph{surprise} attracts attention \cite{IttiBaldi2005}, as it is a mechanism which can
% operate \emph{before} the surprise occurs.

\item<3->
Expectation over all possible realisations is the conditional mutual information
$I(X_t;\fut{X}_t|\past{X}_t)$. For a stationary process, this is the global
\emph{predictive information rate} (PIR), the average rate at which new information arrives about
the future. In terms of conditional entropies, it has two forms:
$H(\fut{X}_t|\past{X}_t) - H(\fut{X}_t|X_t,\past{X}_t)$ or
$H(X_t|\past{X}_t) - H(X_t|\fut{X}_t,\past{X}_t)$.
\end{enumerate}

\end{iframe}

\begin{iframe}[Global measures for stationary processes]
For a stationary random process model, the average levels of surprise and information
are captured by the time-shift invariant process information measures:
\begin{align*}
\text{entropy rate} &: & h_\mu &= H(X_t | \past{X}_t) \\
\text{multi-information rate} &: & \rho_\mu &= I(\past{X}_t;X_t) = H(X_t) - h_\mu \\
\text{residual entropy rate} &: & r_\mu &= H(X_t | \past{X}_t, \fut{X}_t) \\
\text{predictive information rate} &: & b_\mu &= I(X_t;\fut{X}_t|\past{X}_t) = h_\mu - r_\mu
\end{align*}
Residual entropy is also known as \emph{erasure entropy} \cite{VerduWeissman2006}.
\end{iframe}

\begin{isframe}[Process I-diagrams]
% \newcommand\subfig[2]{\shortstack{#2\\[0.75em]#1}}
\newcommand\subfig[2]{#2}
\newcommand\rad{1.75em}%
\newcommand\ovoid[1]{%
++(-#1,\rad)
-- ++(2 * #1,0em) arc (90:-90:\rad)
-- ++(-2 * #1,0em) arc (270:90:\rad)
}%
\newcommand\axis{2.75em}%
\newcommand\olap{0.85em}%
\newcommand\offs{3.6em}
\newcommand\longblob{\ovoid{\axis}}
\newcommand\shortblob{\ovoid{1.75em}}
\begin{figure}
\begin{tikzpicture}%[baseline=-1em]
\newcommand\rc{\rad}
\newcommand\throw{2.5em}
\coordinate (p1) at (180:1.5em);
\coordinate (p2) at (0:0.3em);
\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
\newcommand\present{(p2) circle (\rc)}
\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
\newcommand\fillclipped[2]{%
\begin{scope}[even odd rule]
\foreach \thing in {#2} {\clip \thing;}
\fill[black!#1] \bound;
\end{scope}%
}%
\fillclipped{30}{\present,\bound \thepast}
\fillclipped{15}{\present,\bound \thepast}
\fillclipped{45}{\present,\thepast}
\draw \thepast;
\draw \present;
\node at (barycentric cs:p2=1,p1=-0.3) {$h_\mu$};
\node at (barycentric cs:p2=1,p1=1) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
\path (p2) +(90:3em) node {$X_0$};
\path (p1) +(-3em,0em) node {\shortstack{infinite\\past}};
\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
\end{tikzpicture}%
\\[0.25em]
\begin{tikzpicture}%[baseline=-1em]
\newcommand\rc{2.2em}
\newcommand\throw{2.5em}
\coordinate (p1) at (210:1.5em);
\coordinate (p2) at (90:0.8em);
\coordinate (p3) at (-30:1.5em);
\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
\newcommand\present{(p2) circle (\rc)}
\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
\newcommand\future{(p3) ++(\throw,0) \ovoid{\throw}}
\newcommand\fillclipped[2]{%
\begin{scope}[even odd rule]
\foreach \thing in {#2} {\clip \thing;}
\fill[black!#1] \bound;
\end{scope}%
}%
% \fillclipped{80}{\future,\thepast}
\fillclipped{30}{\present,\future,\bound \thepast}
\fillclipped{15}{\present,\bound \future,\bound \thepast}
\draw \future;
\fillclipped{45}{\present,\thepast}
\draw \thepast;
\draw \present;
\node at (barycentric cs:p2=0.9,p1=-0.17,p3=-0.17) {$r_\mu$};
\node at (barycentric cs:p1=-0.5,p2=1.0,p3=1) {$b_\mu$};
\node at (barycentric cs:p3=0,p2=1,p1=1.2) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
\path (p2) +(140:3.2em) node {$X_0$};
% \node at (barycentric cs:p3=0,p2=1,p1=1) {$\rho_\mu$};
\path (p3) +(3em,0em) node {\shortstack{infinite\\future}};
\path (p1) +(-3em,0em) node {\shortstack{infinite\\past}};
\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
\path (p3) +(4em,\rad) node [anchor=south] {$X_1,\ldots$};
\end{tikzpicture}%
% \\[0.25em]
% The small dark
% region below $X_0$ is $\sigma_\mu$ and the excess entropy
% is $E = \rho_\mu + \sigma_\mu$.
\end{figure}
Marginal entropy of the `present' $X_0$ is $H(X_0)=\rho_\mu+r_\mu+b_\mu$.\\
Entropy rate is $h_\mu = r_\mu+b_\mu$.
\end{isframe}

\section{Markov chains}
\label{s:InfoInMC}


\begin{iframe}[Markov chains\nicedot Definitions]

% Now we'll look at information dynamics in one of the simplest possible models, a Markov chain.
% To illustrate how the measures defined in \secrf{InfoInRandomProcs} can be computed
% in practice, we will consider one of the simplest random processes, a
% first order Markov chain.
% In this case, the dynamic information measures can be computed in closed-form.
%

Let $X$ be a Markov chain with state space
$\{1, \ldots, K\}$, \ie the $X_t$ take values from $1$ to $K$.
\begin{center}
\begin{tikzpicture}[->]
\matrix[column sep=2em,ampersand replacement=\&]{
\cn(X,1) \& \cn(X,2) \& \cn(X,3) \& \cn(X,4) \& \dn(XT) \\};
\rl(X1,X2) \rl(X2,X3) \rl(X3,X4) \rl(X4,XT)
\end{tikzpicture}
\end{center}
% For the sake of brevity let us assume that $\domA$ is the set of integers from 1 to $K$.
Parameterised by a transition matrix $\trans \in \reals^{K\times K}$,
% encoding the distribution of any element of the sequence given previous one,
\ie $p(\ev(X_{t+1}=i)|\ev(X_t=j))=\trans_{ij}$.
Assume irreducibility, ergodicity \etc to ensure uniqueness of the
stationary distribution $\init$ such that
$p(\ev(X_t=i))=\init_i$ independent of $t$. The entropy rate as a function of
$\trans$ is
% $\entrorate:\reals^{K\times K} \to \reals$:
\[
\entrorate(\trans) = \sum_{j=1}^K \init_j \sum_{i=1}^K -\trans_{ij} \log \trans_{ij}.
\]
\end{iframe}

\begin{iframe}[Markov chains\nicedot PIR]
The predictive information rate for first-order chains comes out in terms of the entropy rate
function as
\[
b_\mu = h(a^2) - h(a),
\]
where $a^2$ is the \emph{two-step} transition matrix.

\uncover<2->{
Can be generalised to higher-order transition matrices:
\[
b_\mu = h(\hat{a}^{N+1}) - Nh(\hat{a}),
\]
where $N$ is the order of the chain and $\hat{a}$ is a sparse
$K^N\times K^N$ transition matrix over the product state space of $N$
consecutive observations (step size 1).
}
\end{iframe}

\begin{iframe}[Entropy rate and PIR in Markov chains]

\begin{fig}{artseq}
\hangbox{\colfig[0.40]{matbase/fig8515}}%
\quad
\hangbox{%
\begin{tabular}{cc}%
\colfig[0.18]{matbase/fig1356} &
\colfig[0.18]{matbase/fig45647} \\
\colfig[0.18]{matbase/fig49938} &
\colfig[0.18]{matbase/fig23355}%
\end{tabular}%
}%
% \end{hanging}\\
\end{fig}
For given $K$, the entropy rate varies between 0 (deterministic sequence)
and $\log K$ when $\trans_{ij}=1/K$ for all $i,j$.
The space of transition matrices was explored by generating
them at random and plotting entropy rate vs PIR. (Note the inverted
`U' relationship.) %Transmat (d) is almost uniform.
\end{iframe}

\begin{iframe}[Samples from processes with different PIR]
\begin{figure}
\colfig[0.75]{matbase/fig847}\\
\colfig[0.75]{matbase/fig61989}\\
\colfig[0.75]{matbase/fig43415}\\
\colfig[0.75]{matbase/fig50385}
\end{figure}
Sequence (a) is a repetition
of state 4 (see transmat (a) on previous slide).
System (b) has the highest PIR.
\end{iframe}

% \begin{tabular}{rl}
% (a) & \raisebox{-1em}{\colfig[0.58]{matbase/fig9048}}\\[1em]
% (b) & \raisebox{-1em}{\colfig[0.58]{matbase/fig58845}}\\[1em]
% (c) & \raisebox{-1em}{\colfig[0.58]{matbase/fig45019}}\\[1em]
% (d) & \raisebox{-1em}{\colfig[0.58]{matbase/fig1511}}
% \end{tabular}

\section{Application: The Melody Triangle}
\begin{iframe}[Complexity and interestingness: the Wundt Curve]
\label{s:Wundt}
Studies looking into the relationship between stochastic complexity
(usually measured as entropy or entropy rate) and aesthetic value reveal
an inverted `U'-shaped curve \citep{Berlyne71}. (Also, the Wundt curve \cite{Wundt1897}.)
Repeated exposure tends to move stimuli leftwards.

\hangbox{%
\only<1>{\colfig[0.5]{wundt}}%
\only<2>{\colfig[0.5]{wundt2}}%
}\hfill
\hangbox{\parbox{0.43\linewidth}{\raggedright
%Too deterministic $\rightarrow$ predictable, boring like a monotone;\\
%Too random $\rightarrow$ are boring like white noise: unstructured,
%featureless, uniform.
samer@74: Explanations for this usually appeal to a need for a `balance' samer@74: between order and chaos, unity and diversity, and so on, in a generally samer@74: imprecise way.}} samer@74: samer@74: samer@74: % Hence, a sequence can be uninteresting in two opposite ways: by samer@74: % being utterly predictable \emph{or} by being utterly samer@74: % unpredictable. samer@74: % Meyer \cite{Meyer2004} suggests something similar: samer@74: % hints at the same thing while discussing samer@74: % the relation between the rate of information flow and aesthetic experience, samer@74: % suggesting that samer@74: %% `unless there is some degree of order, \ldots samer@74: %% there is nothing to be uncertain \emph{about} \ldots samer@74: % `If the amount of information [by which he means entropy and surprisingness] samer@74: % is inordinately increased, the result is a kind of cognitive white noise.' samer@74: samer@74: \end{iframe} samer@74: samer@74: \begin{iframe}[PIR as a measure of cognitive activity] samer@74: samer@74: The predictive information rate incorporates a similar balance automatically: samer@74: is maximal for sequences which are neither deterministic nor samer@74: totally uncorrelated across time. samer@74: samer@74: \vspace{1em} samer@74: \begin{tabular}{rr}% samer@74: \raisebox{0.5em}{too predictable:} & samer@74: \only<1>{\noderow(black,un0,un0,un0,un1,un1)}% samer@74: \only<2>{\noderow(black,black,un0,un0,un0,un1)}% samer@74: \only<3>{\noderow(black,black,black,un0,un0,un0)}% samer@74: \only<4>{\noderow(black,black,black,black,un0,un0)}% samer@74: \\[1.2em] samer@74: \raisebox{0.5em}{intermediate:} & samer@74: \only<1>{\noderow(black,un1,un2,un3,un4,un5)}% samer@74: \only<2>{\noderow(black,black,un1,un2,un3,un4)}% samer@74: \only<3>{\noderow(black,black,black,un1,un2,un3)}% samer@74: \only<4>{\noderow(black,black,black,black,un1,un2)}% samer@74: \\[1.2em] samer@74: \raisebox{0.5em}{too random:} & samer@74: \only<1>{\noderow(black,un5,un5,un5,un5,un5)}% samer@74: \only<2>{\noderow(black,black,un5,un5,un5,un5)}% samer@74: \only<3>{\noderow(black,black,black,un5,un5,un5)}% samer@74: \only<4>{\noderow(black,black,black,black,un5,un5)}% samer@74: \end{tabular} samer@74: \vspace{1em} samer@74: samer@74: (Black: \emph{observed}; red: \emph{unobserved}; paler: \emph{greater uncertainty}.) samer@74: Our interpretation: samer@74: % when each event appears to carry no new information about the unknown future, samer@74: % it is `meaningless' and not worth attending to. samer@74: Things are `interesting' or at least `salient' when each new part supplies new information about parts to come. samer@74: samer@74: % Quantitative information dynamics will enable us to test this experimentally with human samer@74: % subjects. samer@74: \end{iframe} samer@74: samer@74: \begin{iframe}[The Melody Triangle\nicedot Information space] samer@74: \begin{figure} samer@74: \colfig[0.75]{mtriscat} samer@74: \end{figure} samer@74: Population of transition matrices in 3D space of $h_\mu$, $\rho_\mu$ and $b_\mu$. samer@74: % Concentrations of points along redundancy axis correspond to roughly periodic patterns. samer@74: Colour of each point samer@74: represents PIR. samer@74: %---highest values found at intermediate entropy and redundancy. samer@74: Shape is mostly (not completely) hollow inside: forming roughly samer@74: a curved triangular sheet. 
\end{iframe}

\begin{iframe}[The Melody Triangle\nicedot User interface]
\begin{figure}
\colfig[0.55]{TheTriangle.pdf}
\end{figure}
Allows the user to place tokens in the triangle
to trigger sonification of a Markov chain with the corresponding information
`coordinates'.
\end{iframe}

\begin{iframe}[Subjective information]
So far we've assumed that the sequence is actually sampled
from a stationary Markov chain with a transition matrix known
to the observer.
This means time averages of IPI and surprise should equal their
expectations.

\uncover<2->{
What if the sequence is sampled from some other Markov chain,
or is produced by some unknown process?
}

\begin{itemize}
\item<3->
In general, it may be impossible to identify any `true' model. There
are no `objective' probabilities; only subjective ones, as
argued by de Finetti \cite{deFinetti}.


\item<4->
If the sequence \emph{is} sampled from some Markov chain, we can
compute (time) averages of the observer's average subjective surprise
and PI, and also track what happens if the observer gradually learns
the transition matrix from the data.
\end{itemize}
\end{iframe}


\begin{iframe}[Effect of learning on information dynamics]
\begin{figure}
% \colfig{matbase/fig42687} % too small text
% \colfig{matbase/fig60379} % 9*19 too tall
% \colfig{matbase/fig52515} % 9*20 ok, perhaps text still too small
\colfig[0.9]{matbase/fig30461} % 8*19 ok
% \colfig{matbase/fig66022} % 8.5*19 ok
\end{figure}
% Upper row shows actual stochastic learning,
% lower shows the idealised deterministic learning.
\textbf{(a/b/e/f)}: multiple runs starting from the same
initial condition but using different generative transition matrices.
\textbf{(c/d/g/h)}: multiple runs starting from different
initial conditions and converging on transition matrices
with (c/g) high and (d/h) low PIR.
\end{iframe}


\section{More process models}
\begin{iframe}[Exchangeable sequences and parametric models]
De Finetti's theorem says that an exchangeable random process can be represented
as a sequence of variables which are iid \emph{given} some hidden probability
distribution, which we can think of as a parameterised model:
\begin{tabular}{lp{0.45\linewidth}}
\hangbox{\begin{tikzpicture}
[>=stealth',var/.style={circle,draw,inner sep=1pt,text height=10pt,text depth=4pt}]
\matrix[ampersand replacement=\&,matrix of math nodes,row sep=2em,column sep=1.8em,minimum size=17pt] {
\& |(theta) [var]| \Theta \\
|(x1) [var]| X_1 \& |(x2) [var]| X_2 \& |(x3) [var]| X_3 \&
|(etc) [outer sep=2pt]| \dots \\
};
\foreach \n in {x1,x2,x3,etc} \draw[->] (theta)--(\n);
\end{tikzpicture}}
&
\raggedright
\uncover<2->{The observer's belief state at time $t$ includes a probability distribution
over the parameters, $p(\ev(\Theta=\theta)|\ev(\past{X}_t=\past{x}_t))$.}
\end{tabular}\\[1em]
\uncover<3->{
Each observation causes a revision of the belief state
and hence supplies information
$
I(\ev(X_t=x_t)\to\Theta|\ev(\past{X}_t=\past{x}_t))
% = D( p_{\Theta|\ev(X_t=x_t),\ev(\past{X}_t=\past{x}_t)} || p_{\Theta|\ev(\past{X}_t=\past{x}_t)} ).
$ about $\Theta$.
In previous work we called this the `model information rate'.
}
\uncover<4->{(Same as Haussler and Opper's \cite{HausslerOpper1995} IIG or
Itti and Baldi's \cite{IttiBaldi2005} Bayesian surprise.)}
\end{iframe}

\def\circ{circle (9)}%
\def\bs(#1,#2,#3){(barycentric cs:p1=#1,p2=#2,p3=#3)}%
\begin{iframe}[IIG equals IPI in (some) XRPs]
\begin{tabular}{@{}lc}
\parbox[c]{0.5\linewidth}{\raggedright
Mild assumptions yield a relationship between IIG (instantaneous information gain) and IPI.
(Everything here is implicitly conditioned on $\past{X}_t$.)}
&
\pgfsetxvec{\pgfpoint{1mm}{0mm}}%
\pgfsetyvec{\pgfpoint{0mm}{1mm}}%
\begin{tikzpicture}[baseline=0pt]
\coordinate (p1) at (90:6);
\coordinate (p2) at (210:6);
\coordinate (p3) at (330:6);
\only<4->{%
\begin{scope}
\foreach \p in {p1,p2,p3} \clip (\p) \circ;
\fill[lightgray] (-10,-10) rectangle (10,10);
\end{scope}
\path (0,0) node {$\mathcal{I}_t$};}
\foreach \p in {p1,p2,p3} \draw (\p) \circ;
\path (p2) +(210:13) node {$X_t$}
(p3) +(330:13) node {$\fut{X}_t$}
(p1) +(140:12) node {$\Theta$};
\only<2->{\path \bs(-0.25,0.5,0.5) node {$0$};}
\only<3->{\path \bs(0.5,0.5,-0.25) node {$0$};}
\end{tikzpicture}
\end{tabular}\\
\begin{enumerate}
\uncover<2->{\item $X_t \perp \fut{X}_t | \Theta$: observations iid given $\Theta$ for XRPs;}
\uncover<3->{\item $\Theta \perp X_t | \fut{X}_t$:
% $I(X_t;\fut{X}_t|\Theta_t)=0$ due to the conditional independence of
% observables given the parameters $\Theta_t$, and
% $I(\Theta_t;X_t|\fut{X}_t)=0$
assumption that $X_t$ adds no new information about $\Theta$
given the infinitely long sequence $\fut{X}_t =X_{t+1:\infty}$.}
\end{enumerate}
\uncover<4->{Hence, $I(X_t;\Theta_t|\past{X}_t)=I(X_t;\fut{X}_t|\past{X}_t) = \mathcal{I}_t$.\\}
\uncover<5->{Can drop assumption 1 and still get $I(X_t;\Theta_t|\past{X}_t)$ as an additive component (lower bound) of $\mathcal{I}_t$.}
\end{iframe}

\def\fid#1{#1}
\def\specint#1{\frac{1}{2\pi}\int_{-\pi}^\pi #1{S(\omega)} \dd \omega}
\begin{iframe}[Discrete-time Gaussian processes]
Information-theoretic quantities used earlier have analogues for continuous-valued
random variables. For stationary Gaussian processes, we can obtain results in
terms of the power spectral density $S(\omega)$ (which for discrete time is periodic
in $\omega$ with period $2\pi$). Standard methods give
\begin{align*}
H(X_t) &= \frac{1}{2}\left( \log 2\pi e + \log \specint{}\right), \\
h_\mu &= \frac{1}{2} \left( \log 2\pi e + \specint{\log} \right), \\
\rho_\mu &= \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
\end{align*}
The entropy rate is also known as the Kolmogorov-Sinai entropy.
% $H(X_t)$ is a function of marginal variance which is just the total power in the spectrum.
\end{iframe}

\begin{iframe}[PIR/Multi-information duality]
Analysis yields the PIR:
\[
b_\mu = \frac{1}{2} \left( \log \specint{\frac{1}} - \specint{\log\frac{1}} \right).
\]
Yields a simple expression for finite-order autoregressive processes, but beware: can diverge
for moving-average processes!

\uncover<2->{
Compare with the multi-information rate:
\[
\rho_\mu = \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
\]
Yields a simple expression for finite-order moving-average processes, but can diverge
for marginally stable autoregressive processes.
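
For instance, assuming a stationary AR(1) process $X_t = a X_{t-1} + \epsilon_t$ with
$|a| < 1$ and innovation variance $\sigma^2$, so that $S(\omega) = \sigma^2/|1 - a e^{-i\omega}|^2$,
the two formulas evaluate to
\[
b_\mu = \tfrac{1}{2} \log (1 + a^2), \qquad
\rho_\mu = -\tfrac{1}{2} \log (1 - a^2),
\]
both finite for $|a| < 1$, with $\rho_\mu \to \infty$ as $|a| \to 1$ (the marginally stable case).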
}

\uncover<3->{
Infinities are troublesome and point to a problem with the notion of infinitely
precise observation of continuous-valued variables.
}
\end{iframe}

% Information gained about model parameters (measured as the KL divergence
% between prior and posterior distributions) is equivalent
% to \textbf{Itti and Baldi's `Bayesian surprise'} \cite{IttiBaldi2005}.


\section{Application: Analysis of minimalist music}
\label{s:Experiments}

\begin{iframe}[Material and Methods]

% Returning to our original goal of modelling the perception of temporal structure
% in music, we computed dynamic information measures for
We took two pieces of minimalist
music by Philip Glass, \emph{Two Pages} (1969) and \emph{Gradus} (1968).
Both are monophonic and isochronous, so representable very simply as
a sequence of symbols (notes), one symbol per beat,
yet they remain ecologically valid examples of `real' music.

We use an elaboration of the Markov chain model---not necessarily
a good model \latin{per se}, but that wasn't the point of the experiment.
The Markov chain model was chosen as it is tractable from an information
dynamics point of view while not being completely trivial.
\end{iframe}

\begin{iframe}[Time-varying transition matrix model]
We allow the transition matrix to vary slowly with time to track
changes in the sequence structure.
Hence, the observer's belief state includes a probability
distribution over transition matrices; we choose a product of
Dirichlet distributions:
\[
\textstyle
p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j}),
\]
where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is a
$K \times K$ parameter matrix.
% (Dirichlet, being conjugate to discrete/multinomial distribution,
% makes processing of observations particularly simple.)
% such that $\param_{:j}$ is the
% parameter tuple for the $K$-component Dirichlet distribution $p_\mathrm{Dir}$.
% \begin{equation}
% \textstyle
% p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j})
% = \prod_{j=1}^K (\prod_{i=1}^K \trans_{ij}^{\param_{ij}-1}) / B(\param_{:j}),
% \end{equation}
% where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is an
% $K \times K$ matrix of parameters.

At each time step, the distribution first \emph{spreads} under the mapping
\[
\param_{ij} \mapsto \frac{\beta\param_{ij}}{(\beta + \param_{ij})}
\]
to model the possibility that the transition matrix
has changed ($\beta=2500$ in our experiments). Then it \emph{contracts}
due to the new observation providing fresh evidence about the transition matrix.
%
% Each observed symbol % provides fresh evidence about current transition matrix,
% enables observer to update its belief state.
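
The contraction is a sketch of the usual Dirichlet--multinomial conjugate update (assuming
standard conjugacy): observing a transition from state $j$ to state $i$ increments a single
parameter,
\[
\param_{ij} \mapsto \param_{ij} + 1,
\]
sharpening the belief about the \nth{j} column of $\trans$.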
\end{iframe}


\begin{iframe}[Two Pages\nicedot Results]

% \begin{fig}{twopages}
\begin{tabular}{c@{\hspace{1.5ex}}l}%
% \hspace*{-1.5em}
% \hangbox{\colfig[0.5]{matbase/fig20304}} % 3 plots
% \hangbox{\colfig[0.52]{matbase/fig39528}} % 4 plots with means
% \hangbox{\colfig[0.52]{matbase/fig63538}} % two pages, 5 plots
% \hangbox{\colfig[0.52]{matbase/fig53706}} % two pages, 5 plots
\hangbox{\colfig[0.72]{matbase/fig33309}} % two pages, 5 plots
&
\hangbox{%
\parbox{0.28\linewidth}{
\raggedright
\textbf{Thick lines:} part boundaries as indicated
by Glass; \textbf{grey lines (top four panels):} changes in the melodic `figures';
% of which the piece is constructed.
\textbf{grey lines (bottom panel):}
six most surprising moments chosen by an expert listener.
}
}
\end{tabular}
% \end{fig}
\end{iframe}

\begin{iframe}[Two Pages\nicedot Rule based analysis]
\begin{figure}
\colfig[0.98]{matbase/fig13377}
% \hangbox{\colfig[0.98]{matbase/fig13377}}
\end{figure}
Analysis of \emph{Two Pages} using (top) Cambouropoulos'
Local Boundary Detection Model (LBDM) and
(bottom) Lerdahl and Jackendoff's
grouping preference rule 3a (GPR3a), which is a function of pitch proximity.
Both analyses indicate `boundary strength'.
\end{iframe}

\begin{iframe}[Two Pages\nicedot Discussion]
The correspondence between the information
measures and the structure of the piece is quite close.
Good agreement between the six `most surprising
moments' chosen by the expert listener and the model information signal.

What appears to be an error in the detection of
the major part boundary (between events 5000 and 6000) actually
reflects a known anomaly in the score, where Glass places the boundary several events
before there is any change in the pattern of notes. Alternative analyses of \emph{Two Pages}
place the boundary in agreement with the peak in our surprisingness signal.
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Results]

% \begin{fig}{gradus}
\begin{tabular}{c@{\hspace{1.5ex}}l}
% &
% \hangbox{\colfig[0.4]{matbase/fig81812}}
% \hangbox{\colfig[0.52]{matbase/fig23177}} % two pages, 5 plots
% \hangbox{\colfig[0.495]{matbase/fig50709}} % Fudged segmentation
% \hangbox{\colfig[0.495]{matbase/fig3124}} % Geraint's segmentation
\hangbox{\colfig[0.715]{matbase/fig11808}} % Geraint's segmentation, corrected
&
% \hangbox{\colfig[0.5]{matbase/fig39914}}
\hangbox{%
\parbox{0.28\linewidth}{
\raggedright
\textbf{Thick lines:} part boundaries as indicated
by the composer.
\textbf{Grey lines:} segmentation by expert listener.

Note: traces smoothed with a Gaussian
window about 16 events wide.
}
}
\end{tabular}
% \end{fig}
\end{iframe}

\begin{iframe}[Gradus\nicedot Rule based analysis]
\begin{figure}
\colfig[0.98]{matbase/fig58691}
\end{figure}
Boundary strength analysis of \emph{Gradus} using (top) Cambouropoulos'
\cite{CambouropoulosPhD} Local Boundary Detection Model and
(bottom) Lerdahl and Jackendoff's \cite{LerdahlJackendoff83}
grouping preference rule 3a.
\end{iframe}
}
\begin{iframe}[Gradus\nicedot Metrical analysis]
\begin{figure}
\begin{tabular}{cc}
\colfig[0.40]{matbase/fig56807} & \colfig[0.41]{matbase/fig27144} \\
\colfig[0.40]{matbase/fig87574} & \colfig[0.41]{matbase/fig13651} \\
\hspace*{1ex}\colfig[0.39]{matbase/fig19913} & \hspace*{1ex}\colfig[0.40]{matbase/fig66144}
\end{tabular}
\end{figure}
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Discussion]

\emph{Gradus} is much less systematically structured than \emph{Two Pages}, and
relies more on the conventions of tonal music, which are not represented in the model.

For example, the initial transition matrix is uniform, which does not correctly represent
prior knowledge about tonal music.

Information dynamic analysis does not give such a
clear picture of the structure; but some of the fine structure can be related
to specific events in the music (see Pearce and Wiggins 2006).
% nonetheless, there are some points of correspondence between the analysis and
% segmentation given by Keith Potter.

\end{iframe}
}

\section{Application: Beat tracking and rhythm}

\begin{iframe}[Bayesian beat tracker]
\uncover<1->{
Works by maintaining a probabilistic belief state about the time of the next
beat and the current tempo.

\begin{figure}
\colfig{beat_prior}
\end{figure}
}

\uncover<2->{
Receives categorised drum events (kick or snare) from an audio analysis front-end.
}

\end{iframe}

\begin{iframe}[Information gain in the beat tracker]
\begin{tabular}{ll}
\parbox[t]{0.43\linewidth}{\raggedright
\uncover<1->{
Each event triggers a change in belief state, so we can compute
the information gain about the beat parameters.}\\[1em]

\uncover<2->{
The relationship between IIG and IPI
means we can treat IIG as a proxy for IPI.}
}
&
\hangbox{\colfig[0.55]{beat_info}}
\end{tabular}
\end{iframe}

\begin{iframe}[Analysis of drum patterns]
We analysed 17 recordings of drummers, playing either solo or with a band.
All patterns were in 4/4.
\begin{itemize}
\item
\uncover<1->{
Information tends to arrive at beat times: a consequence of the structure of the model.
}
\item
\uncover<2->{
Lots of information seems to arrive after drum fills and breaks
as the drummer reestablishes the beat.
}
\item
\uncover<3->{
No consistent pattern of information arrival in relation to metrical
structure, so no obvious metrical structure in the micro-timing of events.
However, it is still possible that metrical structure might emerge from predictive
analysis of the drum pattern.
}
\end{itemize}
\end{iframe}

\section{Summary and conclusions}
\label{s:Conclusions}

\begin{iframe}[Summary]

\begin{itemize}
\item Dynamic, observer-centric information theory.
\item Applicable to any dynamic probabilistic model.
\item PIR potentially a measure of complexity.
\item Simple analysis for Markov chains and Gaussian processes.
\item Applications in music analysis and composition.
\item Search for neural correlates is ongoing (that's another talk\ldots).
\end{itemize}
Thanks!
\end{iframe}

\begin{bframe}[Bibliography]
\bibliographystyle{alpha}
{\small \bibliography{all,c4dm,compsci}}
\end{bframe}
\end{document}