annotate utils/n-grams.lisp @ 41:90abdf9adb60

monodising and some n-gram utilities darcs-hash:20070614140028-f76cc-9bdeba6db4097e425b1fee4f58a3327eeb486685.gz
author David Lewis <d.lewis@gold.ac.uk>
date Thu, 14 Jun 2007 15:00:28 +0100
parents
children 49aae39b96dc
rev   line source
d@41 1 (in-package "AMUSE-UTILS")
d@41 2
d@41 3 (defun get-n-grams (state-sequence start-n &optional (finish-n nil))
d@41 4 (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
d@41 5 (pointer 0) (step 0)
d@41 6 (n-grams (make-array (or finish-n start-n)
d@41 7 :initial-element nil))
d@41 8 (current-sequence) (last-time))
d@41 9 (dolist (state state-sequence n-grams)
d@41 10 (setf (aref memory pointer) state
d@41 11 current-sequence nil
d@41 12 last-time nil
d@41 13 step (1+ step))
d@41 14 (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
d@41 15 (last-time (setf current-sequence (reverse current-sequence)))
d@41 16 (when (= pointer i)
d@41 17 (setf last-time t))
d@41 18 (push (aref memory i) current-sequence))
d@41 19 (setf pointer (mod (1+ pointer) (length memory)))
d@41 20 (loop for i from start-n to (or finish-n start-n)
d@41 21 do (when (> step i)
d@41 22 (unless (aref n-grams (1- i))
d@41 23 (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
d@41 24 (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@41 25 (aref n-grams (1- i)))
d@41 26 (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@41 27 (aref n-grams (1- i))))
d@41 28 (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@41 29 (aref n-grams (1- i)))
d@41 30 1)))))))
d@41 31
d@41 32 (defun get-n-gram (state-sequence n)
d@41 33 (aref (get-n-grams state-sequence n) (1- n)))
d@41 34
d@41 35 (defun n-gram-stats (n-gram &key (alphabet-size nil))
d@41 36 (let ((count 0)
d@41 37 (frequencies)
d@41 38 (n))
d@41 39 (maphash #'(lambda (key val)
d@41 40 (push (cons key val) frequencies)
d@41 41 (incf count val)
d@41 42 (unless n
d@41 43 (setf n (length key))))
d@41 44 n-gram)
d@41 45 (if alphabet-size
d@41 46 (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
d@41 47 (expt alphabet-size n)))
d@41 48 (values count (hash-table-count n-gram) frequencies))))