view utils/n-grams.lisp @ 330:2fbff655ba47 tip

Removed cpitch-adj and cents SQL columns
author Jeremy Gow <jeremy.gow@gmail.com>
date Mon, 21 Jan 2013 11:08:11 +0000
parents 5e362d998f29
children
line wrap: on
line source
(in-package "AMUSE-UTILS")

(defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil))
  "Takes a state sequence and an order, or the bounding indices
for a range of orders, and returns an array of n-gram-frequency
hash tables. The order of n-gram is used as the index for the
n-grams in the returned array. N-grams are represented as equal
hash tables with lists of states as the keys and counts as
values. If test-for-duplicates is provided, it is assumed that a)
consecutive instances of a state to not count as an interesting
transition for noting and b) that (funcall test-for-duplicates
state1 state2) returns true if the states are the same by a
relevant measure. FIXME: Why are these optionals, not keys?"
  (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
	(pointer 0) (step 1) (prev-state)
	(n-grams (make-array (or finish-n start-n)
			     :initial-element nil))
	(current-sequence) (last-time))
    (dolist (state state-sequence n-grams)
      (unless (and test-for-duplicates
		   prev-state
		   (funcall test-for-duplicates state prev-state))
	(setf (aref memory pointer) state
	      prev-state state
	      current-sequence nil
	      last-time nil)
	(do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
	    (last-time (setf current-sequence (reverse current-sequence)))
	  (when (= pointer i)
	    (setf last-time t))
	  (push (aref memory i) current-sequence))
	(setf pointer (mod (1+ pointer) (length memory)))
	(loop for i from start-n to (or finish-n start-n)
	   do (when (>= step i)
		(unless (aref n-grams (1- i))
		  (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
		(if (gethash (subseq current-sequence (- (or finish-n start-n) i))
			     (aref n-grams (1- i)))
		    (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
				   (aref n-grams (1- i))))
		    (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
				   (aref n-grams (1- i)))
			  1)))))
      (incf step))))

(defun get-n-gram (state-sequence n)
  "Uses get-n-grams to return a single order n n-gram from
state-sequence"
  (aref (get-n-grams state-sequence n) (1- n)))

(defun n-gram-stats (n-gram &key (alphabet-size nil))
  "Summarises the vital statistics of an n-gram, returning values
for the total number of entries, the number of different n-grams
occurring and a list of the hash table contents as a list of
lists. If alphabet-size is provided, an additional value is
returned for the proportion of n-grams occurring as compared with
the number possible with alphabet-size symbols."
  (let ((count 0)
	(frequencies)
	(n))
    (maphash #'(lambda (key val)
		 (push (cons key val) frequencies)
		 (incf count val)
		 (unless n
		   (setf n (length key))))
	     n-gram)
    (if alphabet-size
	(values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
							       (expt alphabet-size n)))
	(values count (hash-table-count n-gram) frequencies))))