\documentclass{beamer}

\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{multimedia}
\usepackage{tikz}
\usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{tools}
%\usepackage{amsfonts,amssymb}

\tikzset{every picture/.style=semithick}

%%% font options:
% atypewri, frankgth, gillsans, centuryg, futura, eurostil
%\usepackage{fourier} % Maths in serif Utopia
\usepackage[sf]{frankgth}
%\usepackage[sf]{optima}

%%% Monospace font
%\usepackage[scaled=0.88]{ulgothic} % 0.88 % suits narrow faces
\renewcommand{\ttdefault}{plg} % Adobe Letter Gothic - suits light medium width face
%\renewcommand{\ttdefault}{pcr} % Courier - suits wide faces
% remember to match up size and weight of monospace font to main font

\newcommand{\mytt}[1]{{\texttt{\footnotesize\fontseries{bx}\selectfont #1}}}

\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}


%%% Black on white
\definecolor{base}{rgb}{0,0,0}
\definecolor{comp}{named}{green}
\definecolor{paper}{named}{white}

\logo{%
\includegraphics[height=16pt]{qmul-black}\hspace*{45pt}%
\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-black-white}}%
}

%%% Red on black
\comment{
\definecolor{base}{rgb}{1,0,0}
\definecolor{comp}{rgb}{0,0.8,0.2}
\definecolor{paper}{named}{black}

\logo{%
\includegraphics[height=16pt]{qmul-red}\hspace*{45pt}%
\raisebox{1pt}{\includegraphics[height=12pt]{c4dm-red-black}}%
}
}

\useinnertheme{default}%circles
\useoutertheme{default}
\usefonttheme[onlymath]{serif}

\setbeamercolor{normal text}{bg=paper,fg=base!90!-paper}
\setbeamercolor{background}{bg=comp!50!paper,fg=comp}
%\setbeamercolor{structure}{fg=base!75!-paper}
\setbeamercolor{structure}{fg=red!50!base}
\setbeamercolor{palette primary}{bg=yellow!50!paper,fg=yellow}
\setbeamercolor{palette secondary}{bg=orange!50!paper,fg=orange}
\setbeamercolor{palette tertiary}{bg=blue!50!paper,fg=blue}
\setbeamercolor{palette quaternary}{bg=green!50!paper,fg=green}
\setbeamercolor{block body}{bg=base!20!paper}
\setbeamercolor{block title}{bg=base!60!paper,fg=paper}
\setbeamercolor{navigation symbols}{fg=base!90!paper}
\setbeamercolor{separation line}{bg=blue,fg=yellow}
\setbeamercolor{fine separation line}{bg=blue,fg=orange}

% Title page
% \setbeamercolor{title}{bg=base!20!paper}
% \setbeamercolor{subtitle}{bg=base!20!paper}
% \setbeamercolor{title page}{bg=base!40!paper}

% \setbeamercolor{headline}{bg=blue}
% \setbeamercolor{footline}{bg=blue}
% \setbeamercolor{frametitle}{bg=base!30!paper}
% \setbeamercolor{framesubtitle}{bg=base!40!paper}

% \setbeamercolor{section in toc}{bg=base!25!paper,fg=orange}
% \setbeamercolor{section in toc shaded}{bg=base!25!paper,fg=orange!80!paper}
% \setbeamercolor{subsection in toc}{bg=base!25!paper,fg=orange}
% \setbeamercolor{subsection in toc shaded}{bg=yellow!25!paper,fg=orange!80!paper}
% page number in head/foot
% section in head/foot
% section in head/foot shaded


\setbeamerfont{structure}{series=\bfseries}
\setbeamerfont{title}{series=\mdseries,size=\Large}
%\setbeamerfont{title}{series=\ltseries,size=\huge}
\setbeamerfont{date}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{institute}{size=\footnotesize}%,series=\mdcseries}
\setbeamerfont{author}{size=\footnotesize,series=\bfseries}
\setbeamercolor{bibliography item}{parent={normal text}}
\setbeamercolor{bibliography entry author}{fg=base}
\setbeamercolor{bibliography entry location}{fg=base!70!paper}

%%% Templates

\setbeamertemplate{bibliography item}[text]
\setbeamertemplate{bibliography entry title}{ }
\setbeamertemplate{bibliography entry location}{ }
\setbeamertemplate{blocks}[rounded][shadow=false]
\setbeamertemplate{items}[circle]
%\setbeamertemplate{bibliography item}[triangle]
% \setbeamertemplate{title page}[default][rounded=true,shadow=false]
% \setbeamertemplate{frametitle}[default][rounded=true,shadow=false]
\setbeamertemplate{sidebar right}{}
\setbeamertemplate{footline}{
\hspace*{0.2cm}
\insertlogo
\hfill
\usebeamertemplate***{navigation symbols}%
\hfill
\makebox[6ex]{\hfill\insertframenumber/\inserttotalframenumber}%
\hspace*{0.2cm}

\vskip 4pt
}

\setbeamertemplate{navigation symbols}
{%
\hbox{%
\hbox{\insertslidenavigationsymbol}
\hbox{\insertframenavigationsymbol}
% \hbox{\insertsubsectionnavigationsymbol}
\hbox{\insertsectionnavigationsymbol}
\hbox{\insertdocnavigationsymbol}
% \hbox{\insertbackfindforwardnavigationsymbol}%
}%
}


\AtBeginSection[]{
\begin{iframe}[Outline]
\tableofcontents[currentsection]
\end{iframe}
}
%\linespread{1.1}

\setlength{\parskip}{0.5em}

\newenvironment{bframe}[1][untitled]{\begin{frame}[allowframebreaks]\frametitle{#1}}{\end{frame}}
\newenvironment{iframe}[1][untitled]{\begin{frame}\frametitle{#1}}{\end{frame}}
\newenvironment{isframe}[1][untitled]{\begin{frame}[fragile=singleslide,environment=isframe]\frametitle{#1}}{\end{frame}}

\renewenvironment{fig}[1]
{%
\begin{figure}
\def\fglbl{f:#1}
\let\ocap=\caption
\renewcommand{\caption}[2][]{\ocap[##1]{\small ##2}}
\centering\small
}{%
\label{\fglbl}
\end{figure}
}

\newcommand{\paragraph}[1]{\textbf{#1}\qquad}
\newcommand{\colfig}[2][1]{\includegraphics[width=#1\linewidth]{figs/#2}}%
\let\citep=\cite
%\newcommand{\dotmath}[2]{\psfrag{#1}[Bc][Bc]{\small $#2$}}

\title{Cognitive Music Modelling:\\An Information Dynamics Approach}
\author{Samer Abdallah, Henrik Ekeus, Peter Foster,\\Andrew Robertson and Mark Plumbley}
\institute{Centre for Digital Music\\Queen Mary, University of London}

\date{\today}
\def\X{\mathcal{X}}
\def\Y{\mathcal{Y}}
\def\Past{\mathrm{Past}}
\def\Future{\mathrm{Future}}
\def\Present{\mathrm{Present}}
\def\param{\theta}
\def\trans{a}
\def\init{\pi^{\trans}}
%\def\entrorate(#1){\mathcal{H}(#1)}
%\def\entrorate(#1){\dot{\mathcal{H}}(#1)}
\def\entrorate{h}
\def\emcmarg(#1){b_#1}
\def\mcmarg{\vec{b}}
\def\domS{\mathcal{S}}
\def\domA{\mathcal{A}}

\def\Lxz(#1,#2){\mathcal{L}(#1|#2)}
\def\LXz(#1){\overline{\mathcal{L}}(#1)}
\def\LxZ(#1){\underline{\mathcal{L}}(#1)}
\def\LXZ{\overline{\underline{\mathcal{L}}}}
\def\Ixz(#1,#2){\mathcal{I}(#1|#2)}
\def\IXz(#1){\overline{\mathcal{I}}(#1)}
\def\IxZ(#1){\underline{\mathcal{I}}(#1)}
\def\IXZ{\overline{\underline{\mathcal{I}}}}

\def\ev(#1=#2){#1\!\!=\!#2}
\def\sev(#1=#2){#1\!=#2}

\def\FE{\mathcal{F}}

\newcommand\past[1]{\overset{\rule{0pt}{0.2em}\smash{\leftarrow}}{#1}}
\newcommand\fut[1]{\overset{\rule{0pt}{0.1em}\smash{\rightarrow}}{#1}}

\def\cn(#1,#2) {\node[circle,draw,inner sep=0.2em] (#1#2) {${#1}_{#2}$};}
\def\dn(#1) {\node[circle,inner sep=0.2em] (#1) {$\cdots$};}
\def\rl(#1,#2) {\draw (#1) -- (#2);}

\definecolor{un0}{rgb}{0.5,0.0,0.0}
\definecolor{un1}{rgb}{0.6,0.15,0.15}
\definecolor{un2}{rgb}{0.7,0.3,0.3}
\definecolor{un3}{rgb}{0.8,0.45,0.45}
\definecolor{un4}{rgb}{0.9,0.6,0.6}
\definecolor{un5}{rgb}{1.0,0.75,0.75}

%\def\blob(#1){\node[circle,draw,fill=#1,inner sep=0.25em]{};}
\def\bl(#1){\draw[circle,fill=#1] (0,0) circle (0.4em);}
\def\noderow(#1,#2,#3,#4,#5,#6){%
\tikz{\matrix[draw,rounded corners,inner sep=0.4em,column sep=2.1em,ampersand replacement=\&]{%
\bl(#1)\&\bl(#2)\&\bl(#3)\&\bl(#4)\&\bl(#5)\&\bl(#6)\\};}}
\begin{document}
\frame{\titlepage}
\section[Outline]{}
\frame{
\frametitle{Outline}
\tableofcontents
}



\section{Expectation and surprise in music}
\label{s:Intro}

\begin{iframe}[`Unfoldingness']
Music is experienced as a
\uncover<2->{phenomenon}
\uncover<3->{that}
\uncover<4->{`unfolds'} \uncover<5->{in}\\
\only<6>{blancmange}%
\only<7>{(just kidding)}%
\uncover<8->{time,}
\uncover<9->{rather than being apprehended as a static object presented in its
entirety.}

\uncover<10->{[This is recognised in computational linguistics, where the phenomenon is known as \emph{incrementality}, \eg in incremental parsing.]}

\uncover<11->{%
Meyer \cite{Meyer67} argued that musical experience depends on
how we change and revise our conceptions \emph{as events happen},
on how expectation and prediction interact with occurrence, and that, to a large
degree, the way to understand the effect of music is to focus on
this `kinetics' of expectation and surprise.%
}
\end{iframe}
\begin{iframe}[Expectation and surprise in music]

Music creates
\emph{expectations} of what is to come next, which may be fulfilled
immediately, after some delay, or not at all.
Suggested by music theorists, \eg
L. B. Meyer \cite{Meyer67} and Narmour \citep{Narmour77}, but also
noted much earlier by Hanslick \cite{Hanslick1854} in the
1850s:
\begin{quote}
\small
`The most important factor in the mental process which accompanies the
act of listening to music, and which converts it to a source of pleasure, is
\ldots
% frequently overlooked. We here refer to
the intellectual satisfaction which the
listener derives from continually following and anticipating the composer's
intentions---now, to see his expectations fulfilled, and now, to find himself
agreeably mistaken. It is a matter of course that this intellectual flux and
reflux, this perpetual giving and receiving takes place unconsciously, and with
the rapidity of lightning-flashes.'
\end{quote}
\end{iframe}

\begin{iframe}[Probabilistic reasoning]
\uncover<1->{%
Making predictions and assessing surprise is
essentially reasoning with degrees of belief and (arguably)
the best way to do this is using Bayesian probability theory \cite{Cox1946,Jaynes27}.%

[NB. this is \textbf{subjective} probability as advocated by \eg De Finetti and Jaynes.]
}

% Thus, we assume that musical schemata are encoded as probabilistic % \citep{Meyer56} models, and
\uncover<2->{%
We suppose that familiarity with different styles of music takes the form
of various probabilistic models, and that these models are adapted through listening.%
}
% various stylistic norms is encoded as
% using models that encode the statistics of music in general, the particular styles
% of music that seem best to fit the piece we happen to be listening to, and the emerging
% structures peculiar to the current piece.

\uncover<3->{%
There is experimental evidence that humans are able to internalise statistical
knowledge about music \citep{SaffranJohnsonAslin1999,EerolaToiviainenKrumhansl2002}, and also
that statistical models are effective for computational analysis of music, \eg \cite{ConklinWitten95,Pearce2005}.%
}

% analysis of music, \eg \cite{ConklinWitten95,PonsfordWigginsMellish1999,Pearce2005}.
% \cite{Ferrand2002}. Dubnov and Assayag PSTs?
\end{iframe}
\begin{iframe}[Music and information theory]
\uncover<1->{
With probabilistic models in hand we can apply quantitative information theory: we can compute entropies,
relative entropies, mutual information, and all that.
}

\uncover<2->{
There has been a lot of interest in the application of information theory to perception, music and aesthetics since the 1950s,
\eg Moles \cite{Moles66}, Meyer \cite{Meyer67}, Cohen \cite{Cohen1962}, Berlyne \cite{Berlyne71}.
(See also Bense, Hiller.)
}

\uncover<3->{
The idea is that subjective qualities and
states like uncertainty, surprise, complexity, tension, and interestingness
are determined by information-theoretic quantities.
}

\uncover<4->{
Berlyne \cite{Berlyne71} called such quantities `collative variables', since they are
to do with patterns of occurrence rather than medium-specific details.
\emph{Information aesthetics}.
}
% Listeners then experience greater or lesser levels of surprise
% in response to departures from these norms.
% By careful manipulation
% of the material, the composer can thus define, and induce within the
% listener, a temporal programme of varying
% levels of uncertainty, ambiguity and surprise.
\end{iframe}

\begin{iframe}[Probabilistic model-based observer hypothesis]
\begin{itemize}
\item<1->
As we listen, we maintain a probabilistic model that enables
us to make predictions. As events unfold, we revise our probabilistic `belief state',
including predictions about the future.
\item<2->
Probability distributions and changes in distributions are characterised in terms
of information-theoretic measures such as entropy and relative entropy (KL divergence).
\item<3->
The dynamic evolution of these information measures captures significant structure,
\eg events that are surprising, informative, explanatory \etc
\end{itemize}

\end{iframe}
\begin{iframe}[Features of information dynamics]
\uncover<1->{
\textbf{Abstraction}: sensitive mainly to \emph{patterns} of occurrence,
rather than details of which specific things occur or the sensory medium.
% it operates at a level of abstraction removed from the details of the sensory experience and
% the medium through which it was received, suggesting that the same
% approach could, in principle, be used to analyse and compare information
% flow in different temporal media regardless of whether they are auditory, visual or otherwise.
}

\uncover<2->{
\textbf{Generality}: applicable in principle to any probabilistic model, in particular,
models with time-dependent latent variables such as HMMs.
Many important musical concepts like key, harmony, and beat are essentially `hidden variables'.
}

\uncover<3->{
\textbf{Richness}: when applied to models with latent variables, it can result in a many-layered
analysis, capturing information flow about harmony, tempo, \etc
}

\uncover<4->{
\textbf{Subjectivity}: all probabilities are \emph{subjective} probabilities relative to the \emph{observer's}
model, which can depend on the observer's capabilities and prior experience.
}
\end{iframe}

\section{Surprise, entropy and information in random sequences}
\label{s:InfoInRandomProcs}
\begin{iframe}[Information theory primer\nicedot Entropy]
Let $X$ be a discrete-valued random variable (random in the sense of \emph{subjective} probability).
Entropy is a measure of \emph{uncertainty}. If the observer expects to see $x$ with probability $p(x)$,
then
\begin{align*}
H(X) &= \sum_{x\in\X} - p(x) \log p(x) \\
&= \expect{[-\log p(X)]}.
\end{align*}
If we regard $-\log p(x)$ as the `surprisingness' of $x$, then the entropy is the `expected surprisingness':
high for spread-out distributions and low for concentrated ones.
\end{iframe}
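
% A minimal numerical sketch of the quantities on this slide (not part of the
% original deck; the 4-symbol distribution below is invented for illustration):
%
%   import numpy as np
%
%   p = np.array([0.5, 0.25, 0.125, 0.125])   # p(x) over a 4-symbol alphabet
%   surprisal = -np.log2(p)                   # -log p(x) in bits: [1, 2, 3, 3]
%   H = np.sum(p * surprisal)                 # H(X) = E[-log p(X)] = 1.75 bits
%
% A spread-out (uniform) distribution gives the maximum log2(4) = 2 bits, while a
% concentrated one such as [1, 0, 0, 0] gives 0 bits.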

\begin{iframe}[Information theory primer\nicedot Relative entropy]
Relative entropy or Kullback-Leibler (KL) divergence quantifies the difference between
probability distributions.
If the observer receives data $\mathcal{D}$, the divergence between the (subjective) prior and
posterior distributions is the
amount of information in $\mathcal{D}$ \emph{about} $X$ for this observer:
\[
I(\mathcal{D}\to X) =
D(p_{X|\mathcal{D}} || p_X)
= \sum_{x\in\X} p(x|\mathcal{D}) \log \frac{p(x|\mathcal{D})}{p(x)}.
\]
If observing $\mathcal{D}$ causes a large change in belief about $X$, then $\mathcal{D}$
contained a lot of information about $X$.

Like Lindley's (1956) information (thanks Lars!).
\end{iframe}
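
% A small sketch of I(D -> X) as a KL divergence between posterior and prior
% (illustration only; the prior and posterior below are invented, not taken from
% any model in this talk):
%
%   import numpy as np
%
%   def kl(q, p):                                     # D(q || p) in bits
%       return np.sum(q * np.log2(q / p))
%
%   prior     = np.array([0.25, 0.25, 0.25, 0.25])    # beliefs about X before seeing D
%   posterior = np.array([0.70, 0.10, 0.10, 0.10])    # beliefs about X after seeing D
%   info_in_D = kl(posterior, prior)                  # ~0.64 bits of information about X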

\begin{iframe}[Information theory primer\nicedot Mutual information]
Mutual information (MI) between $X_1$ and $X_2$ is the expected amount of information about
$X_2$ in an observation of $X_1$. It can be written in several ways:
\begin{align*}
I(X_1;X_2) &= \sum_{x_1,x_2} p(x_1,x_2) \log \frac{p(x_1,x_2)}{p(x_1)p(x_2)} \\
&= H(X_1) + H(X_2) - H(X_1,X_2) \\
&= H(X_2) - H(X_2|X_1).
\end{align*}
(1) Expected information about $X_2$ in an observation of $X_1$;\\
(2) Expected reduction in uncertainty about $X_2$ after observing $X_1$;\\
(3) Symmetric: $I(X_1;X_2) = I(X_2;X_1)$.
\end{iframe}
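
% Sketch checking that the expressions above agree on a toy joint distribution
% (the joint p(x1, x2) below is invented for illustration):
%
%   import numpy as np
%
%   def H(p):                                 # entropy in bits of a (possibly joint) pmf
%       p = p[p > 0]
%       return -np.sum(p * np.log2(p))
%
%   pj = np.array([[0.4, 0.1],                # p(x1, x2); rows index x1, columns x2
%                  [0.1, 0.4]])
%   p1, p2 = pj.sum(axis=1), pj.sum(axis=0)
%
%   mi_def = np.sum(pj * np.log2(pj / np.outer(p1, p2)))   # direct definition
%   mi_ent = H(p1) + H(p2) - H(pj.ravel())                 # H(X1)+H(X2)-H(X1,X2)
%   # both give I(X1;X2) ~ 0.278 bits; swapping the roles of X1 and X2 changes nothing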

\begin{iframe}[Information theory primer\nicedot Conditional MI]
Conditional MI is the information in one variable about another given observations of some third variable.
It is formulated analogously, by adding conditioning variables to the entropies:
\begin{align*}
I(X_1;X_2|X_3) &= H(X_1|X_3) - H(X_1|X_2,X_3).
\end{align*}
This makes explicit the dependence of information assessment on background knowledge,
represented by the conditioning variables.
\end{iframe}


\begin{isframe}[Information theory primer\nicedot I-Diagrams]
\newcommand\rad{2.2em}%
\newcommand\circo{circle (3.4em)}%
\newcommand\labrad{4.3em}
\newcommand\bound{(-6em,-5em) rectangle (6em,6em)}
\newcommand\clipin[1]{\clip (#1) \circo;}%
\newcommand\clipout[1]{\clip \bound (#1) \circo;}%
\newcommand\cliptwo[3]{%
\begin{scope}
\clipin{#1};
\clipin{#2};
\clipout{#3};
\fill[black!30] \bound;
\end{scope}
}%
\newcommand\clipone[3]{%
\begin{scope}
\clipin{#1};
\clipout{#2};
\clipout{#3};
\fill[black!15] \bound;
\end{scope}
}%
Information diagrams are a Venn-diagram-like representation of the entropies and mutual
informations for a set of random variables.
\begin{center}
\begin{tabular}{c@{\ }c}
\scalebox{0.8}{%
\begin{tikzpicture}[baseline=0pt]
\coordinate (p1) at (90:\rad);
\coordinate (p2) at (210:\rad);
\coordinate (p3) at (-30:\rad);
\clipone{p1}{p2}{p3};
\clipone{p2}{p3}{p1};
\clipone{p3}{p1}{p2};
\cliptwo{p1}{p2}{p3};
\cliptwo{p2}{p3}{p1};
\cliptwo{p3}{p1}{p2};
\begin{scope}
\clip (p1) \circo;
\clip (p2) \circo;
\clip (p3) \circo;
\fill[black!45] \bound;
\end{scope}
\draw (p1) \circo;
\draw (p2) \circo;
\draw (p3) \circo;
\path
(barycentric cs:p3=1,p1=-0.2,p2=-0.1) +(0ex,0) node {$I_{3|12}$}
(barycentric cs:p1=1,p2=-0.2,p3=-0.1) +(0ex,0) node {$I_{1|23}$}
(barycentric cs:p2=1,p3=-0.2,p1=-0.1) +(0ex,0) node {$I_{2|13}$}
(barycentric cs:p3=1,p2=1,p1=-0.55) +(0ex,0) node {$I_{23|1}$}
(barycentric cs:p1=1,p3=1,p2=-0.55) +(0ex,0) node {$I_{13|2}$}
(barycentric cs:p2=1,p1=1,p3=-0.55) +(0ex,0) node {$I_{12|3}$}
(barycentric cs:p3=1,p2=1,p1=1) node {$I_{123}$}
;
\path
(p1) +(140:\labrad) node {$X_1$}
(p2) +(-140:\labrad) node {$X_2$}
(p3) +(-40:\labrad) node {$X_3$};
\end{tikzpicture}%
}
&
\parbox{0.5\linewidth}{
\small
\begin{align*}
I_{1|23} &= H(X_1|X_2,X_3) \\
I_{13|2} &= I(X_1;X_3|X_2) \\
I_{1|23} + I_{13|2} &= H(X_1|X_2) \\
I_{12|3} + I_{123} &= I(X_1;X_2)
\end{align*}
}
\end{tabular}
\end{center}
The areas of
the three circles represent $H(X_1)$, $H(X_2)$ and $H(X_3)$ respectively.
The total shaded area is the joint entropy $H(X_1,X_2,X_3)$.
Each undivided region is an \emph{atom} of the I-diagram.
\end{isframe}



\begin{isframe}[Information theory in sequences]
\def\bx{1.6em}%
\def\cn(#1,#2) {\node[circle,draw,fill=white,inner sep=0.2em] at(#1) {$#2$};}%
\def\dn(#1){\node[circle,inner sep=0.2em] at(#1) {$\cdots$};}%
\def\en(#1){coordinate(#1)}%
\def\tb{++(3.8em,0)}%
\def\lb(#1)#2{\path (#1)+(0,\bx) node[anchor=south] {#2};}
\def\nr(#1,#2,#3){\draw[rounded corners,fill=#3] (#1) rectangle (#2);}%

Consider an observer receiving elements of a random sequence
$(\ldots, X_{-1}, X_0, X_1, X_2, \ldots)$, so that at any time $t$ there is
a `present' $X_t$, an observed past $\past{X}_t$, and an unobserved future
$\fut{X}_t$. For example, at time $t=3$:

\begin{figure}
\begin{tikzpicture}%[baseline=-1em]
\path (0,0) \en(X0) \tb \en(X1) \tb \en(X2) \tb \en(X3) \tb \en(X4) \tb \en(X5) \tb \en(X6);
\path (X0)+(-\bx,-\bx) \en(p1) (X2)+(\bx,\bx) \en(p2)
(X3)+(-\bx,-\bx) \en(p3) (X3)+(\bx,\bx) \en(p4)
(X4)+(-\bx,-\bx) \en(p5) (X6)+(\bx,\bx) \en(p6);
\nr(p1,p2,un3) \nr(p3,p4,un4) \nr(p5,p6,un5)
\dn(X0) \cn(X1,X_1) \cn(X2,X_2) \cn(X3,X_3) \cn(X4,X_4) \cn(X5,X_5) \dn(X6)
\lb(X1){Past: $\past{X}_3$}
\lb(X5){Future: $\fut{X}_3$}
\lb(X3){Present}
\end{tikzpicture}%}%
\end{figure}
Consider how the observer's belief state evolves when, having observed up to
$X_2$, it learns the value of $X_3$.
\end{isframe}

\begin{iframe}[`Surprise' based quantities]
To obtain the first set of measures, we ignore the future $\fut{X}_t$
and consider the probability distribution for $X_t$ given the
observed past $\past{X}_t=\past{x}_t$.

\begin{enumerate}
\item<1->
\textbf{Surprisingness}: negative log-probability
$\ell_t = -\log p(x_t|\past{x}_t)$.

\item<2->
Expected surprisingness given the context $\past{X}_t=\past{x}_t$ is the entropy of the predictive distribution,
$H(X_t|\ev(\past{X}_t=\past{x}_t))$: the uncertainty about $X_t$ before the observation is made.

\item<3->
Its expectation over all possible realisations of the process is the conditional entropy
$H(X_t|\past{X}_t)$ according to the observer's model. For a stationary process, this is the
\emph{entropy rate} $h_\mu$.
\end{enumerate}
\end{iframe}

\begin{iframe}[Predictive information]
The second set of measures is based on the amount of information the observation $\ev(X_t=x_t)$
carries \emph{about} the unobserved future $\fut{X}_t$, \emph{given} that we already
know the past $\ev(\past{X}_t=\past{x}_t)$:
\begin{equation*}
\mathcal{I}_t = I(\ev(X_t=x_t)\to\fut{X}_t|\ev(\past{X}_t=\past{x}_t)).
\end{equation*}
This is the KL divergence between beliefs about the future $\fut{X}_t$ prior and posterior
to the observation $\ev(X_t=x_t)$.
Hence, for continuous-valued variables, it is invariant to invertible
transformations of the observation spaces.
\end{iframe}

\begin{iframe}[Predictive information based quantities]
\begin{enumerate}
\item<1->
\emph{Instantaneous predictive information} (IPI) is just $\mathcal{I}_t$.

% Expectations over $X|\ev(Z=z)$, $Z|\ev(X=x)$, and $(X,Z)$ give 3 more information measures:
\item<2->
The expectation of $\mathcal{I}_t$ before the observation at time $t$ is
$I(X_t;\fut{X}_t | \ev(\past{X}_t=\past{x}_t))$: mutual information conditioned on the
observed past. This is the amount of new information about the future expected from the next observation.
Useful for directing attention towards the next event even before it happens?

% This is different from Itti and Baldi's proposal that Bayesian
% \emph{surprise} attracts attention \cite{IttiBaldi2005}, as it is a mechanism which can
% operate \emph{before} the surprise occurs.


\item<3->
The expectation over all possible realisations is the conditional mutual information
$I(X_t;\fut{X}_t|\past{X}_t)$. For a stationary process, this is the global
\emph{predictive information rate} (PIR), the average rate at which new information arrives about
the future. In terms of conditional entropies, it has two forms:
$H(\fut{X}_t|\past{X}_t) - H(\fut{X}_t|X_t,\past{X}_t)$ or
$H(X_t|\past{X}_t) - H(X_t|\fut{X}_t,\past{X}_t)$.
\end{enumerate}

\end{iframe}

\begin{iframe}[Global measures for stationary processes]
For a stationary random process model, the average levels of surprise and information
are captured by the time-shift invariant process information measures:
\begin{align*}
\text{entropy rate} &: & h_\mu &= H(X_t | \past{X}_t) \\
\text{multi-information rate} &: & \rho_\mu &= I(\past{X}_t;X_t) = H(X_t) - h_\mu \\
\text{residual entropy rate} &: & r_\mu &= H(X_t | \past{X}_t, \fut{X}_t) \\
\text{predictive information rate} &: & b_\mu &= I(X_t;\fut{X}_t|\past{X}_t) = h_\mu - r_\mu
\end{align*}
Residual entropy is also known as \emph{erasure entropy} \cite{VerduWeissman2006}.
\end{iframe}

\begin{isframe}[Process I-diagrams]
% \newcommand\subfig[2]{\shortstack{#2\\[0.75em]#1}}
\newcommand\subfig[2]{#2}
\newcommand\rad{1.75em}%
\newcommand\ovoid[1]{%
++(-#1,\rad)
-- ++(2 * #1,0em) arc (90:-90:\rad)
-- ++(-2 * #1,0em) arc (270:90:\rad)
}%
\newcommand\axis{2.75em}%
\newcommand\olap{0.85em}%
\newcommand\offs{3.6em}
\newcommand\longblob{\ovoid{\axis}}
\newcommand\shortblob{\ovoid{1.75em}}
\begin{figure}
\begin{tikzpicture}%[baseline=-1em]
\newcommand\rc{\rad}
\newcommand\throw{2.5em}
\coordinate (p1) at (180:1.5em);
\coordinate (p2) at (0:0.3em);
\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
\newcommand\present{(p2) circle (\rc)}
\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
\newcommand\fillclipped[2]{%
\begin{scope}[even odd rule]
\foreach \thing in {#2} {\clip \thing;}
\fill[black!#1] \bound;
\end{scope}%
}%
\fillclipped{30}{\present,\bound \thepast}
\fillclipped{15}{\present,\bound \thepast}
\fillclipped{45}{\present,\thepast}
\draw \thepast;
\draw \present;
\node at (barycentric cs:p2=1,p1=-0.3) {$h_\mu$};
\node at (barycentric cs:p2=1,p1=1) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
\path (p2) +(90:3em) node {$X_0$};
\path (p1) +(-3em,0em) node {\shortstack{infinite\\past}};
\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
\end{tikzpicture}%
\\[0.25em]
\begin{tikzpicture}%[baseline=-1em]
\newcommand\rc{2.2em}
\newcommand\throw{2.5em}
\coordinate (p1) at (210:1.5em);
\coordinate (p2) at (90:0.8em);
\coordinate (p3) at (-30:1.5em);
\newcommand\bound{(-7em,-2.6em) rectangle (7em,3.0em)}
\newcommand\present{(p2) circle (\rc)}
\newcommand\thepast{(p1) ++(-\throw,0) \ovoid{\throw}}
\newcommand\future{(p3) ++(\throw,0) \ovoid{\throw}}
\newcommand\fillclipped[2]{%
\begin{scope}[even odd rule]
\foreach \thing in {#2} {\clip \thing;}
\fill[black!#1] \bound;
\end{scope}%
}%
% \fillclipped{80}{\future,\thepast}
\fillclipped{30}{\present,\future,\bound \thepast}
\fillclipped{15}{\present,\bound \future,\bound \thepast}
\draw \future;
\fillclipped{45}{\present,\thepast}
\draw \thepast;
\draw \present;
\node at (barycentric cs:p2=0.9,p1=-0.17,p3=-0.17) {$r_\mu$};
\node at (barycentric cs:p1=-0.5,p2=1.0,p3=1) {$b_\mu$};
\node at (barycentric cs:p3=0,p2=1,p1=1.2) [shape=rectangle,fill=black!45,inner sep=1pt]{$\rho_\mu$};
\path (p2) +(140:3.2em) node {$X_0$};
% \node at (barycentric cs:p3=0,p2=1,p1=1) {$\rho_\mu$};
\path (p3) +(3em,0em) node {\shortstack{infinite\\future}};
\path (p1) +(-3em,0em) node {\shortstack{infinite\\past}};
\path (p1) +(-4em,\rad) node [anchor=south] {$\ldots,X_{-1}$};
\path (p3) +(4em,\rad) node [anchor=south] {$X_1,\ldots$};
\end{tikzpicture}%
% \\[0.25em]
% The small dark
% region below $X_0$ is $\sigma_\mu$ and the excess entropy
% is $E = \rho_\mu + \sigma_\mu$.
\end{figure}
Marginal entropy of `present' $X_0$ is $H(X_0)=\rho_\mu+r_\mu+b_\mu$.\\
Entropy rate is $h_\mu = r_\mu+b_\mu$.
\end{isframe}

\section{Markov chains}
\label{s:InfoInMC}


\begin{iframe}[Markov chains\nicedot Definitions]

% Now we'll look at information dynamics in one of the simplest possible models, a Markov chain.
% To illustrate the how the measures defined in \secrf{InfoInRandomProcs} can be computed
% in practice, we will consider one of the simplest random processes, a
% first order Markov chain.
% In this case, the dynamic information measures can be computed in closed-form.
%

Let $X$ be a Markov chain with state space
$\{1, \ldots, K\}$, \ie the $X_t$ take values from $1$ to $K$.
\begin{center}
\begin{tikzpicture}[->]
\matrix[column sep=2em,ampersand replacement=\&]{
\cn(X,1) \& \cn(X,2) \& \cn(X,3) \& \cn(X,4) \& \dn(XT) \\};
\rl(X1,X2) \rl(X2,X3) \rl(X3,X4) \rl(X4,XT)
\end{tikzpicture}
\end{center}
% For the sake of brevity let us assume that $\domA$ is the set of integers from 1 to $K$.
It is parameterised by a transition matrix $\trans \in \reals^{K\times K}$,
% encoding the distribution of any element of the sequence given previous one,
\ie $p(\ev(X_{t+1}=i)|\ev(X_t=j))=\trans_{ij}$.
Assume irreducibility, ergodicity \etc to ensure uniqueness of the
stationary distribution $\pi$ such that
$p(\ev(X_t=i))=\init_i$, independent of $t$. The entropy rate as a function of
$a$ is
% $\entrorate:\reals^{K\times K} \to \reals$:
\[
\entrorate(\trans) = \sum_{j=1}^K \init_j \sum_{i=1}^K -\trans_{ij} \log \trans_{ij}.
\]
\end{iframe}
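
% Numerical sketch of the entropy-rate formula above (the 3-state transition matrix
% is invented for illustration). Columns are `from' states, so that
% a[i, j] = p(X_{t+1} = i | X_t = j) and each column sums to 1:
%
%   import numpy as np
%
%   a = np.array([[0.8, 0.1, 0.2],
%                 [0.1, 0.8, 0.2],
%                 [0.1, 0.1, 0.6]])
%
%   evals, evecs = np.linalg.eig(a)              # stationary distribution: a @ pi = pi
%   pi = np.real(evecs[:, np.argmax(np.real(evals))])
%   pi = pi / pi.sum()
%
%   def h(M):                                    # entropy rate of a column-stochastic M
%       return np.sum(pi * np.sum(-M * np.log2(M + 1e-300), axis=0))
%
%   entropy_rate = h(a)                          # bits per symbol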

\begin{iframe}[Markov chains\nicedot PIR]
The predictive information rate for first-order chains comes out in terms of the entropy rate
function as
\[
b_\mu = h(a^2) - h(a),
\]
where $a^2$ is the \emph{two-step} transition matrix.

\uncover<2->{
This can be generalised to higher-order transition matrices:
\[
b_\mu = h(\hat{a}^{N+1}) - Nh(\hat{a}),
\]
where $N$ is the order of the chain and $\hat{a}$ is a sparse
$K^N\times K^N$ transition matrix over the product state space of $N$
consecutive observations (step size 1).
}
\end{iframe}
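
% Continuing the sketch from the previous frame (same invented 3-state matrix a and
% the same function h; note that a and a @ a share the stationary distribution pi):
%
%   b_mu = h(a @ a) - h(a)                       # PIR in bits per symbol
%
% Both terms vanish for a deterministic permutation matrix such as
% np.eye(3)[:, [1, 2, 0]], and they are equal for the uniform matrix
% np.full((3, 3), 1/3), so b_mu is zero at both extremes.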

\begin{iframe}[Entropy rate and PIR in Markov chains]

\begin{fig}{artseq}
\hangbox{\colfig[0.40]{matbase/fig8515}}%
\quad
\hangbox{%
\begin{tabular}{cc}%
\colfig[0.18]{matbase/fig1356} &
\colfig[0.18]{matbase/fig45647} \\
\colfig[0.18]{matbase/fig49938} &
\colfig[0.18]{matbase/fig23355}%
\end{tabular}%
}%
% \end{hanging}\\
\end{fig}
For a given $K$, the entropy rate varies between 0 (deterministic sequence)
and $\log K$ when $\trans_{ij}=1/K$ for all $i,j$.
The space of transition matrices was explored by generating
them at random and plotting entropy rate vs PIR. (Note the inverted
`U' relationship.) %Transmat (d) is almost uniform.
\end{iframe}

\begin{iframe}[Samples from processes with different PIR]
\begin{figure}
\colfig[0.75]{matbase/fig847}\\
\colfig[0.75]{matbase/fig61989}\\
\colfig[0.75]{matbase/fig43415}\\
\colfig[0.75]{matbase/fig50385}
\end{figure}
Sequence (a) is a repetition
of state 4 (see transition matrix (a) on the previous slide).
System (b) has the highest PIR.
\end{iframe}

% \begin{tabular}{rl}
% (a) & \raisebox{-1em}{\colfig[0.58]{matbase/fig9048}}\\[1em]
% (b) & \raisebox{-1em}{\colfig[0.58]{matbase/fig58845}}\\[1em]
% (c) & \raisebox{-1em}{\colfig[0.58]{matbase/fig45019}}\\[1em]
% (d) & \raisebox{-1em}{\colfig[0.58]{matbase/fig1511}}
% \end{tabular}
\section{Application: The Melody Triangle}
\begin{iframe}[Complexity and interestingness: the Wundt Curve]
\label{s:Wundt}
Studies looking into the relationship between stochastic complexity
(usually measured as entropy or entropy rate) and aesthetic value reveal
an inverted `U' shaped curve \citep{Berlyne71} (also known as the Wundt curve \cite{Wundt1897}).
Repeated exposure tends to move stimuli leftwards.

\hangbox{%
\only<1>{\colfig[0.5]{wundt}}%
\only<2>{\colfig[0.5]{wundt2}}%
}\hfill
\hangbox{\parbox{0.43\linewidth}{\raggedright
%Too deterministic $\rightarrow$ predictable, boring like a monotone;\\
%Too random $\rightarrow$ are boring like white noise: unstructured,
%featureless, uniform.
Explanations for this usually appeal to a need for a `balance'
between order and chaos, unity and diversity, and so on, in a generally
imprecise way.}}


% Hence, a sequence can be uninteresting in two opposite ways: by
% being utterly predictable \emph{or} by being utterly
% unpredictable.
% Meyer \cite{Meyer2004} suggests something similar:
% hints at the same thing while discussing
% the relation between the rate of information flow and aesthetic experience,
% suggesting that
%% `unless there is some degree of order, \ldots
%% there is nothing to be uncertain \emph{about} \ldots
% `If the amount of information [by which he means entropy and surprisingness]
% is inordinately increased, the result is a kind of cognitive white noise.'

\end{iframe}

\begin{iframe}[PIR as a measure of cognitive activity]

The predictive information rate incorporates a similar balance automatically:
it is maximal for sequences which are neither deterministic nor
totally uncorrelated across time.

\vspace{1em}
\begin{tabular}{rr}%
\raisebox{0.5em}{too predictable:} &
\only<1>{\noderow(black,un0,un0,un0,un1,un1)}%
\only<2>{\noderow(black,black,un0,un0,un0,un1)}%
\only<3>{\noderow(black,black,black,un0,un0,un0)}%
\only<4>{\noderow(black,black,black,black,un0,un0)}%
\\[1.2em]
\raisebox{0.5em}{intermediate:} &
\only<1>{\noderow(black,un1,un2,un3,un4,un5)}%
\only<2>{\noderow(black,black,un1,un2,un3,un4)}%
\only<3>{\noderow(black,black,black,un1,un2,un3)}%
\only<4>{\noderow(black,black,black,black,un1,un2)}%
\\[1.2em]
\raisebox{0.5em}{too random:} &
\only<1>{\noderow(black,un5,un5,un5,un5,un5)}%
\only<2>{\noderow(black,black,un5,un5,un5,un5)}%
\only<3>{\noderow(black,black,black,un5,un5,un5)}%
\only<4>{\noderow(black,black,black,black,un5,un5)}%
\end{tabular}
\vspace{1em}

(Black: \emph{observed}; red: \emph{unobserved}; paler: \emph{greater uncertainty}.)
Our interpretation:
% when each event appears to carry no new information about the unknown future,
% it is `meaningless' and not worth attending to.
things are `interesting', or at least `salient', when each new part supplies new information about parts to come.

% Quantitative information dynamics will enable us to test this experimentally with human
% subjects.
\end{iframe}
\begin{iframe}[The Melody Triangle\nicedot Information space]
\begin{figure}
\colfig[0.75]{mtriscat}
\end{figure}
Population of transition matrices in the 3D space of $h_\mu$, $\rho_\mu$ and $b_\mu$.
% Concentrations of points along redundancy axis correspond to roughly periodic patterns.
The colour of each point
represents PIR.
%---highest values found at intermediate entropy and redundancy.
The shape is mostly (not completely) hollow inside, forming roughly
a curved triangular sheet.
\end{iframe}

\begin{iframe}[The Melody Triangle\nicedot User interface]
\begin{figure}
\colfig[0.55]{TheTriangle.pdf}
\end{figure}
The interface allows the user to place tokens in the triangle
to trigger sonification of a Markov chain with the corresponding information
`coordinate'.
\end{iframe}

\begin{iframe}[Subjective information]
So far we've assumed that the sequence is actually sampled
from a stationary Markov chain with a transition matrix known
to the observer.
This means time averages of IPI and surprise should equal their
expectations.

\uncover<2->{
What if the sequence is sampled from some other Markov chain,
or is produced by some unknown process?
}

\begin{itemize}
\item<3->
In general, it may be impossible to identify any `true' model. There
are no `objective' probabilities; only subjective ones, as
argued by de Finetti \cite{deFinetti}.


\item<4->
If the sequence \emph{is} sampled from some Markov chain, we can
compute time averages of the observer's subjective surprise
and PI, and also track what happens if the observer gradually learns
the transition matrix from the data.
\end{itemize}
\end{iframe}


\begin{iframe}[Effect of learning on information dynamics]
\begin{figure}
% \colfig{matbase/fig42687} % too small text
% \colfig{matbase/fig60379} % 9*19 too tall
% \colfig{matbase/fig52515} % 9*20 ok, perhaps text still too small
\colfig[0.9]{matbase/fig30461} % 8*19 ok
% \colfig{matbase/fig66022} % 8.5*19 ok
\end{figure}
% Upper row shows actual stochastic learning,
% lower shows the idealised deterministic learning.
\textbf{(a/b/e/f)}: multiple runs starting from the same
initial condition but using different generative transition matrices.
\textbf{(c/d/g/h)}: multiple runs starting from different
initial conditions and converging on transition matrices
with (c/g) high and (d/h) low PIR.
\end{iframe}


\section{More process models}
\begin{iframe}[Exchangeable sequences and parametric models]
De Finetti's theorem says that an exchangeable random process can be represented
as a sequence of variables which are iid \emph{given} some hidden probability
distribution, which we can think of as a parameterised model:
\begin{tabular}{lp{0.45\linewidth}}
\hangbox{\begin{tikzpicture}
[>=stealth',var/.style={circle,draw,inner sep=1pt,text height=10pt,text depth=4pt}]
\matrix[ampersand replacement=\&,matrix of math nodes,row sep=2em,column sep=1.8em,minimum size=17pt] {
\& |(theta) [var]| \Theta \\
|(x1) [var]| X_1 \& |(x2) [var]| X_2 \& |(x3) [var]| X_3 \&
|(etc) [outer sep=2pt]| \dots \\
};
\foreach \n in {x1,x2,x3,etc} \draw[->] (theta)--(\n);
\end{tikzpicture}}
&
\raggedright
\uncover<2->{The observer's belief state at time $t$ includes a probability distribution
over the parameters, $p(\ev(\Theta=\theta)|\ev(\past{X}_t=\past{x}_t))$.}
\end{tabular}\\[1em]
\uncover<3->{
Each observation causes a revision of the belief state
and hence supplies information
$
I(\ev(X_t=x_t)\to\Theta|\ev(\past{X}_t=\past{x}_t))
% = D( p_{\Theta|\ev(X_t=x_t),\ev(\past{X}_t=\past{x}_t)} || p_{\Theta|\ev(\past{X}_t=\past{x}_t)} ).
$ about $\Theta$.
In previous work we called this the `model information rate'.
}
\uncover<4->{(Same as Haussler and Opper's \cite{HausslerOpper1995} IIG or
Itti and Baldi's \cite{IttiBaldi2005} Bayesian surprise.)}
\end{iframe}
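
% Minimal sketch of this `model information' / Bayesian-surprise quantity for the
% simplest exchangeable case, a Beta-Bernoulli observer (this particular model and
% the numbers are my own illustration, not from the talk):
%
%   import numpy as np
%   from scipy.stats import beta
%   from scipy.integrate import quad
%
%   a0, b0 = 1.0, 1.0                        # prior Beta(a0, b0) over Theta
%   x = 1                                    # next observed symbol (0 or 1)
%   a1, b1 = a0 + x, b0 + (1 - x)            # conjugate posterior update
%
%   def kl_beta(a1, b1, a0, b0):             # D(posterior || prior) by quadrature, in bits
%       f = lambda th: beta.pdf(th, a1, b1) * (beta.logpdf(th, a1, b1)
%                                              - beta.logpdf(th, a0, b0))
%       return quad(f, 0, 1)[0] / np.log(2)
%
%   info_about_theta = kl_beta(a1, b1, a0, b0)   # ~0.28 bits from this first observation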

\def\circ{circle (9)}%
\def\bs(#1,#2,#3){(barycentric cs:p1=#1,p2=#2,p3=#3)}%
\begin{iframe}[IIG equals IPI in (some) XRPs]
\begin{tabular}{@{}lc}
\parbox[c]{0.5\linewidth}{\raggedright
Mild assumptions yield a relationship between IIG (instantaneous information gain) and IPI.
(Everything here is implicitly conditioned on $\past{X}_t$.)}
&
\pgfsetxvec{\pgfpoint{1mm}{0mm}}%
\pgfsetyvec{\pgfpoint{0mm}{1mm}}%
\begin{tikzpicture}[baseline=0pt]
\coordinate (p1) at (90:6);
\coordinate (p2) at (210:6);
\coordinate (p3) at (330:6);
\only<4->{%
\begin{scope}
\foreach \p in {p1,p2,p3} \clip (\p) \circ;
\fill[lightgray] (-10,-10) rectangle (10,10);
\end{scope}
\path (0,0) node {$\mathcal{I}_t$};}
\foreach \p in {p1,p2,p3} \draw (\p) \circ;
\path (p2) +(210:13) node {$X_t$}
(p3) +(330:13) node {$\fut{X}_t$}
(p1) +(140:12) node {$\Theta$};
\only<2->{\path \bs(-0.25,0.5,0.5) node {$0$};}
\only<3->{\path \bs(0.5,0.5,-0.25) node {$0$};}
\end{tikzpicture}
\end{tabular}\\
\begin{enumerate}
\uncover<2->{\item $X_t \perp \fut{X}_t | \Theta$: observations iid given $\Theta$ for XRPs;}
\uncover<3->{\item $\Theta \perp X_t | \fut{X}_t$:
% $I(X_t;\fut{X}_t|\Theta_t)=0$ due to the conditional independence of
% observables given the parameters $\Theta_t$, and
% $I(\Theta_t;X_t|\fut{X}_t)=0$
the assumption that $X_t$ adds no new information about $\Theta$
given the infinitely long sequence $\fut{X}_t =X_{t+1:\infty}$.}
\end{enumerate}
\uncover<4->{Hence, $I(X_t;\Theta_t|\past{X}_t)=I(X_t;\fut{X}_t|\past{X}_t) = \mathcal{I}_t$.\\}
\uncover<5->{We can drop assumption 1 and still get $I(X_t;\Theta_t|\past{X}_t)$ as an additive component (lower bound) of $\mathcal{I}_t$.}
\end{iframe}

\def\fid#1{#1}
\def\specint#1{\frac{1}{2\pi}\int_{-\pi}^\pi #1{S(\omega)} \dd \omega}
\begin{iframe}[Discrete-time Gaussian processes]
Information-theoretic quantities used earlier have analogues for continuous-valued
random variables. For stationary Gaussian processes, we can obtain results in
terms of the power spectral density $S(\omega)$ (which for discrete time is periodic
in $\omega$ with period $2\pi$). Standard methods give
\begin{align*}
H(X_t) &= \frac{1}{2}\left( \log 2\pi e + \log \specint{}\right), \\
h_\mu &= \frac{1}{2} \left( \log 2\pi e + \specint{\log} \right), \\
\rho_\mu &= \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
\end{align*}
The entropy rate is also known as the Kolmogorov-Sinai entropy.
% $H(X_t)$ is a function of marginal variance which is just the total power in the spectrum.
\end{iframe}

\begin{iframe}[PIR/Multi-information duality]
Analysis yields the PIR:
\[
b_\mu = \frac{1}{2} \left( \log \specint{\frac{1}} - \specint{\log\frac{1}} \right).
\]
This yields a simple expression for finite-order autoregressive processes, but beware: it can diverge
for moving-average processes!

\uncover<2->{
Compare with the multi-information rate:
\[
\rho_\mu = \frac{1}{2} \left( \log \specint{\fid} - \specint{\log}\right).
\]
This yields a simple expression for finite-order moving-average processes, but can diverge
for marginally stable autoregressive processes.
}

\uncover<3->{
Infinities are troublesome and point to a problem with the notion of infinitely
precise observation of continuous-valued variables.
}
\end{iframe}
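
% Numerical sketch of the spectral formulas on this and the previous frame for a
% Gaussian AR(1) process x_t = phi*x_{t-1} + e_t (my choice of example; phi and the
% noise variance are arbitrary). Plugging the AR(1) spectrum into these formulas
% gives rho_mu = -0.5*log(1 - phi^2) and b_mu = 0.5*log(1 + phi^2), which the
% averages below reproduce:
%
%   import numpy as np
%
%   phi, s2 = 0.8, 1.0
%   w = np.linspace(-np.pi, np.pi, 200000, endpoint=False)
%   S = s2 / np.abs(1 - phi * np.exp(-1j * w))**2    # power spectral density
%   avg = lambda f: f.mean()                         # (1/2pi) * integral over one period
%
%   h_mu   = 0.5 * (np.log(2 * np.pi * np.e) + avg(np.log(S)))
%   rho_mu = 0.5 * (np.log(avg(S)) - avg(np.log(S)))
%   b_mu   = 0.5 * (np.log(avg(1 / S)) - avg(np.log(1 / S)))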

% Information gained about model parameters (measured as the KL divergence
% between prior and posterior distributions) is equivalent
% to \textbf{Itti and Baldi's `Bayesian surprise'} \cite{IttiBaldi2005}.


\section{Application: Analysis of minimalist music}
\label{s:Experiments}

\begin{iframe}[Material and Methods]

% Returning to our original goal of modelling the perception of temporal structure
% in music, we computed dynamic information measures for
We took two pieces of minimalist
music by Philip Glass, \emph{Two Pages} (1969) and \emph{Gradus} (1968).
Both are monophonic and isochronous, so they are representable very simply as
a sequence of symbols (notes), one symbol per beat,
yet they remain ecologically valid examples of `real' music.

We use an elaboration of the Markov chain model---not necessarily
a good model \latin{per se}, but that wasn't the point of the experiment.
The Markov chain model was chosen as it is tractable from an information
dynamics point of view while not being completely trivial.
\end{iframe}

\begin{iframe}[Time-varying transition matrix model]
We allow the transition matrix to vary slowly with time, to track
changes in the sequence structure.
Hence, the observer's belief state includes a probability
distribution over transition matrices; we choose a product of
Dirichlet distributions:
\[
\textstyle
p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j}),
\]
where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is a
$K \times K$ parameter matrix.
% (Dirichlet, being conjugate to discrete/multinomial distribution,
% makes processing of observations particularly simple.)
% such that $\param_{:j}$ is the
% parameter tuple for the $K$-component Dirichlet distribution $p_\mathrm{Dir}$.
% \begin{equation}
% \textstyle
% p(\trans|\param) = \prod_{j=1}^K p_\mathrm{Dir}(\trans_{:j}|\param_{:j})
% = \prod_{j=1}^K (\prod_{i=1}^K \trans_{ij}^{\param_{ij}-1}) / B(\param_{:j}),
% \end{equation}
% where $\trans_{:j}$ is the \nth{j} column of $\trans$ and $\param$ is an
% $K \times K$ matrix of parameters.

At each time step, the distribution first \emph{spreads} under the mapping
\[
\param_{ij} \mapsto \frac{\beta\param_{ij}}{(\beta + \param_{ij})}
\]
to model the possibility that the transition matrix
has changed ($\beta=2500$ in our experiments). Then it \emph{contracts}
as each new observation provides fresh evidence about the transition matrix.
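
Concretely, assuming the standard conjugate Dirichlet update, observing the
transition $j \to i$ contracts the belief state by incrementing a single parameter,
\[
\param_{ij} \mapsto \param_{ij} + 1,
\]
leaving all other entries of $\param$ unchanged.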
%
% Each observed symbol % provides fresh evidence about current transition matrix,
% enables observer to update its belief state.
\end{iframe}


\begin{iframe}[Two Pages\nicedot Results]

% \begin{fig}{twopages}
\begin{tabular}{c@{\hspace{1.5ex}}l}%
% \hspace*{-1.5em}
% \hangbox{\colfig[0.5]{matbase/fig20304}} % 3 plots
% \hangbox{\colfig[0.52]{matbase/fig39528}} % 4 plots with means
% \hangbox{\colfig[0.52]{matbase/fig63538}} % two pages, 5 plots
% \hangbox{\colfig[0.52]{matbase/fig53706}} % two pages, 5 plots
\hangbox{\colfig[0.72]{matbase/fig33309}} % two pages, 5 plots
&
\hangbox{%
\parbox{0.28\linewidth}{
\raggedright
\textbf{Thick lines:} part boundaries as indicated
by Glass; \textbf{grey lines (top four panels):} changes in the melodic `figures';
% of which the piece is constructed.
\textbf{grey lines (bottom panel):}
the six most surprising moments chosen by an expert listener.
}
}
\end{tabular}
% \end{fig}
\end{iframe}

\begin{iframe}[Two Pages\nicedot Rule based analysis]
\begin{figure}
\colfig[0.98]{matbase/fig13377}
% \hangbox{\colfig[0.98]{matbase/fig13377}}
\end{figure}
Analysis of \emph{Two Pages} using (top) Cambouropoulos'
Local Boundary Detection Model (LBDM) and
(bottom) Lerdahl and Jackendoff's
grouping preference rule 3a (GPR3a), which is a function of pitch proximity.
Both analyses indicate `boundary strength'.
\end{iframe}

\begin{iframe}[Two Pages\nicedot Discussion]
The correspondence between the information
measures and the structure of the piece is quite close.
There is good agreement between the six `most surprising
moments' chosen by the expert listener and the model's information signal.

What appears to be an error in the detection of
the major part boundary (between events 5000 and 6000) actually
reflects a known anomaly in the score, where Glass places the boundary several events
before there is any change in the pattern of notes. Alternative analyses of \emph{Two Pages}
place the boundary in agreement with the peak in our surprisingness signal.
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Results]

% \begin{fig}{gradus}
\begin{tabular}{c@{\hspace{1.5ex}}l}
% &
% \hangbox{\colfig[0.4]{matbase/fig81812}}
% \hangbox{\colfig[0.52]{matbase/fig23177}} % two pages, 5 plots
% \hangbox{\colfig[0.495]{matbase/fig50709}} % Fudged segmentation
% \hangbox{\colfig[0.495]{matbase/fig3124}} % Geraint's segmentation
\hangbox{\colfig[0.715]{matbase/fig11808}} % Geraint's segmentation, corrected
&
% \hangbox{\colfig[0.5]{matbase/fig39914}}
\hangbox{%
\parbox{0.28\linewidth}{
\raggedright
\textbf{Thick lines:} part boundaries as indicated
by the composer.
\textbf{Grey lines:} segmentation by an expert listener.

Note: traces smoothed with a Gaussian
window about 16 events wide.
}
}
\end{tabular}
% \end{fig}
\end{iframe}

\begin{iframe}[Gradus\nicedot Rule based analysis]
\begin{figure}
\colfig[0.98]{matbase/fig58691}
\end{figure}
Boundary strength analysis of \emph{Gradus} using (top) Cambouropoulos'
\cite{CambouropoulosPhD} Local Boundary Detection Model and
(bottom) Lerdahl and Jackendoff's \cite{LerdahlJackendoff83}
grouping preference rule 3a.
\end{iframe}
}
\begin{iframe}[Gradus\nicedot Metrical analysis]
\begin{figure}
\begin{tabular}{cc}
\colfig[0.40]{matbase/fig56807} & \colfig[0.41]{matbase/fig27144} \\
\colfig[0.40]{matbase/fig87574} & \colfig[0.41]{matbase/fig13651} \\
\hspace*{1ex}\colfig[0.39]{matbase/fig19913} & \hspace*{1ex}\colfig[0.40]{matbase/fig66144}
\end{tabular}
\end{figure}
\end{iframe}

\comment{
\begin{iframe}[Gradus\nicedot Discussion]

\emph{Gradus} is much less systematically structured than \emph{Two Pages}, and
relies more on the conventions of tonal music, which are not represented in the model.

For example, the initial transition matrix is uniform, which does not correctly represent
prior knowledge about tonal music.

Information dynamic analysis does not give such a
clear picture of the structure, but some of the fine structure can be related
to specific events in the music (see Pearce and Wiggins 2006).
% nonetheless, there are some points of correspondence between the analysis and
% segmentation given by Keith Potter.

\end{iframe}
}

\section{Application: Beat tracking and rhythm}

\begin{iframe}[Bayesian beat tracker]
\uncover<1->{
Works by maintaining a probabilistic belief state about the time of the next
beat and the current tempo.

\begin{figure}
\colfig{beat_prior}
\end{figure}
}

\uncover<2->{
Receives categorised drum events (kick or snare) from an audio analysis front-end.
}

\end{iframe}

\begin{iframe}[Information gain in the beat tracker]
\begin{tabular}{ll}
\parbox[t]{0.43\linewidth}{\raggedright
\uncover<1->{
Each event triggers a change in belief state, so we can compute the
information gain about the beat parameters (see below).}\\[1em]

\uncover<2->{
The relationship between IIG and IPI
means we can treat the former as a proxy for the latter.}
}
&
\hangbox{\colfig[0.55]{beat_info}}
\end{tabular}
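
Here, writing $\Lambda$ for the beat parameters (next-beat time and tempo) and
$e_{1:t}$ for the events received so far, the information gain associated with
event $e_t$ can be written as the divergence between the updated and previous
belief states,
\[
D\big( p(\Lambda \mid e_{1:t}) \,\big\|\, p(\Lambda \mid e_{1:t-1}) \big).
\]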
\end{iframe}

\begin{iframe}[Analysis of drum patterns]
We analysed 17 recordings of drummers, playing either solo or with a band.
All patterns were in 4/4.
\begin{itemize}
\item
\uncover<1->{
Information tends to arrive at beat times: a consequence of the structure of the model.
}
\item
\uncover<2->{
A lot of information seems to arrive after drum fills and breaks,
as the drummer re-establishes the beat.
}
\item
\uncover<3->{
No consistent pattern of information arrival in relation to metrical
structure, so no obvious metrical structure in the micro-timing of events.
However, it is still possible that metrical structure might emerge from a predictive
analysis of the drum pattern.
}
\end{itemize}
\end{iframe}

\section{Summary and conclusions}
\label{s:Conclusions}

\begin{iframe}[Summary]

\begin{itemize}
\item Dynamic, observer-centric information theory.
\item Applicable to any dynamic probabilistic model.
\item PIR potentially a measure of complexity.
\item Simple analysis for Markov chains and Gaussian processes.
\item Applications in music analysis and composition.
\item Search for neural correlates is ongoing (that's another talk\ldots).
\end{itemize}
Thanks!
\end{iframe}

\begin{bframe}[Bibliography]
\bibliographystyle{alpha}
{\small \bibliography{all,c4dm,compsci}}
\end{bframe}
\end{document}