annotate utils/n-grams.lisp @ 52:e0acd4c37121

n-gram fix darcs-hash:20070620160515-f76cc-af8dd2c245deac5551ef880abd8afe8b39f7c1e9.gz
author David Lewis <d.lewis@gold.ac.uk>
date Wed, 20 Jun 2007 17:05:15 +0100
parents 49aae39b96dc
children 5e362d998f29
rev   line source
d@41 1 (in-package "AMUSE-UTILS")
d@41 2
d@50 3 (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil))
d@41 4 (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
d@52 5 (pointer 0) (step 1) (prev-state)
d@41 6 (n-grams (make-array (or finish-n start-n)
d@41 7 :initial-element nil))
d@41 8 (current-sequence) (last-time))
d@41 9 (dolist (state state-sequence n-grams)
d@50 10 (unless (and test-for-duplicates
d@50 11 prev-state
d@50 12 (funcall test-for-duplicates state prev-state))
d@50 13 (setf (aref memory pointer) state
d@50 14 prev-state state
d@50 15 current-sequence nil
d@52 16 last-time nil)
d@50 17 (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
d@50 18 (last-time (setf current-sequence (reverse current-sequence)))
d@50 19 (when (= pointer i)
d@50 20 (setf last-time t))
d@50 21 (push (aref memory i) current-sequence))
d@50 22 (setf pointer (mod (1+ pointer) (length memory)))
d@50 23 (loop for i from start-n to (or finish-n start-n)
d@52 24 do (when (>= step i)
d@50 25 (unless (aref n-grams (1- i))
d@50 26 (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
d@50 27 (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 28 (aref n-grams (1- i)))
d@50 29 (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 30 (aref n-grams (1- i))))
d@50 31 (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
d@50 32 (aref n-grams (1- i)))
d@52 33 1)))))
d@52 34 (incf step))))
d@41 35
d@41 36 (defun get-n-gram (state-sequence n)
d@41 37 (aref (get-n-grams state-sequence n) (1- n)))
d@41 38
d@41 39 (defun n-gram-stats (n-gram &key (alphabet-size nil))
d@41 40 (let ((count 0)
d@41 41 (frequencies)
d@41 42 (n))
d@41 43 (maphash #'(lambda (key val)
d@41 44 (push (cons key val) frequencies)
d@41 45 (incf count val)
d@41 46 (unless n
d@41 47 (setf n (length key))))
d@41 48 n-gram)
d@41 49 (if alphabet-size
d@41 50 (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
d@41 51 (expt alphabet-size n)))
d@41 52 (values count (hash-table-count n-gram) frequencies))))