Mercurial > hg > amuse
diff utils/n-grams.lisp @ 41:90abdf9adb60
monodising and some n-gram utilities
darcs-hash:20070614140028-f76cc-9bdeba6db4097e425b1fee4f58a3327eeb486685.gz
author | David Lewis <d.lewis@gold.ac.uk> |
---|---|
date | Thu, 14 Jun 2007 15:00:28 +0100 |
parents | |
children | 49aae39b96dc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/n-grams.lisp Thu Jun 14 15:00:28 2007 +0100 @@ -0,0 +1,48 @@ +(in-package "AMUSE-UTILS") + +(defun get-n-grams (state-sequence start-n &optional (finish-n nil)) + (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK)) + (pointer 0) (step 0) + (n-grams (make-array (or finish-n start-n) + :initial-element nil)) + (current-sequence) (last-time)) + (dolist (state state-sequence n-grams) + (setf (aref memory pointer) state + current-sequence nil + last-time nil + step (1+ step)) + (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory)))) + (last-time (setf current-sequence (reverse current-sequence))) + (when (= pointer i) + (setf last-time t)) + (push (aref memory i) current-sequence)) + (setf pointer (mod (1+ pointer) (length memory))) + (loop for i from start-n to (or finish-n start-n) + do (when (> step i) + (unless (aref n-grams (1- i)) + (setf (aref n-grams (1- i)) (make-hash-table :test #'equal))) + (if (gethash (subseq current-sequence (- (or finish-n start-n) i)) + (aref n-grams (1- i))) + (incf (gethash (subseq current-sequence (- (or finish-n start-n) i)) + (aref n-grams (1- i)))) + (setf (gethash (subseq current-sequence (- (or finish-n start-n) i)) + (aref n-grams (1- i))) + 1))))))) + +(defun get-n-gram (state-sequence n) + (aref (get-n-grams state-sequence n) (1- n))) + +(defun n-gram-stats (n-gram &key (alphabet-size nil)) + (let ((count 0) + (frequencies) + (n)) + (maphash #'(lambda (key val) + (push (cons key val) frequencies) + (incf count val) + (unless n + (setf n (length key)))) + n-gram) + (if alphabet-size + (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram) + (expt alphabet-size n))) + (values count (hash-table-count n-gram) frequencies))))