annotate utils/n-grams.lisp @ 330:2fbff655ba47 tip

Removed cpitch-adj and cents SQL columns
author Jeremy Gow <jeremy.gow@gmail.com>
date Mon, 21 Jan 2013 11:08:11 +0000
parents 5e362d998f29
children
rev   line source
d@41 1 (in-package "AMUSE-UTILS")
d@41 2
d@50 3 (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil))
d@134 4 "Takes a state sequence and an order, or the bounding indices
d@134 5 for a range of orders, and returns an array of n-gram-frequency
d@134 6 hash tables. The order of n-gram is used as the index for the
d@134 7 n-grams in the returned array. N-grams are represented as equal
d@134 8 hash tables with lists of states as the keys and counts as
d@134 9 values. If test-for-duplicates is provided, it is assumed that a)
d@134 10 consecutive instances of a state to not count as an interesting
d@134 11 transition for noting and b) that (funcall test-for-duplicates
d@134 12 state1 state2) returns true if the states are the same by a
d@134 13 relevant measure. FIXME: Why are these optionals, not keys?"
d@41 14 (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
d@52 15 (pointer 0) (step 1) (prev-state)
d@41 16 (n-grams (make-array (or finish-n start-n)
d@41 17 :initial-element nil))
d@41 18 (current-sequence) (last-time))
d@41 19 (dolist (state state-sequence n-grams)
d@50 20 (unless (and test-for-duplicates
d@50 21 prev-state
d@50 22 (funcall test-for-duplicates state prev-state))
d@50 23 (setf (aref memory pointer) state
d@50 24 prev-state state
d@50 25 current-sequence nil
d@52 26 last-time nil)
d@50 27 (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
d@50 28 (last-time (setf current-sequence (reverse current-sequence)))
d@50 29 (when (= pointer i)
d@50 30 (setf last-time t))
d@50 31 (push (aref memory i) current-sequence))
d@50 32 (setf pointer (mod (1+ pointer) (length memory)))
d@50 33 (loop for i from start-n to (or finish-n start-n)
d@52 34 do (when (>= step i)
d@50 35 (unless (aref n-grams (1- i))
d@50 36 (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
d@50 37 (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 38 (aref n-grams (1- i)))
d@50 39 (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 40 (aref n-grams (1- i))))
d@50 41 (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 42 (aref n-grams (1- i)))
d@52 43 1)))))
d@52 44 (incf step))))
d@41 45
d@41 46 (defun get-n-gram (state-sequence n)
d@134 47 "Uses get-n-grams to return a single order n n-gram from
d@134 48 state-sequence"
d@41 49 (aref (get-n-grams state-sequence n) (1- n)))
d@41 50
d@41 51 (defun n-gram-stats (n-gram &key (alphabet-size nil))
d@134 52 "Summarises the vital statistics of an n-gram, returning values
d@134 53 for the total number of entries, the number of different n-grams
d@134 54 occurring and a list of the hash table contents as a list of
d@134 55 lists. If alphabet-size is provided, an additional value is
d@134 56 returned for the proportion of n-grams occurring as compared with
d@134 57 the number possible with alphabet-size symbols."
d@41 58 (let ((count 0)
d@41 59 (frequencies)
d@41 60 (n))
d@41 61 (maphash #'(lambda (key val)
d@41 62 (push (cons key val) frequencies)
d@41 63 (incf count val)
d@41 64 (unless n
d@41 65 (setf n (length key))))
d@41 66 n-gram)
d@41 67 (if alphabet-size
d@41 68 (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
d@41 69 (expt alphabet-size n)))
d@41 70 (values count (hash-table-count n-gram) frequencies))))