Mercurial > hg > amuse
view utils/n-grams.lisp @ 52:e0acd4c37121
n-gram fix
darcs-hash:20070620160515-f76cc-af8dd2c245deac5551ef880abd8afe8b39f7c1e9.gz
author | David Lewis <d.lewis@gold.ac.uk> |
---|---|
date | Wed, 20 Jun 2007 17:05:15 +0100 |
parents | 49aae39b96dc |
children | 5e362d998f29 |
line wrap: on
line source
(in-package "AMUSE-UTILS") (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil)) (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK)) (pointer 0) (step 1) (prev-state) (n-grams (make-array (or finish-n start-n) :initial-element nil)) (current-sequence) (last-time)) (dolist (state state-sequence n-grams) (unless (and test-for-duplicates prev-state (funcall test-for-duplicates state prev-state)) (setf (aref memory pointer) state prev-state state current-sequence nil last-time nil) (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory)))) (last-time (setf current-sequence (reverse current-sequence))) (when (= pointer i) (setf last-time t)) (push (aref memory i) current-sequence)) (setf pointer (mod (1+ pointer) (length memory))) (loop for i from start-n to (or finish-n start-n) do (when (>= step i) (unless (aref n-grams (1- i)) (setf (aref n-grams (1- i)) (make-hash-table :test #'equal))) (if (gethash (subseq current-sequence (- (or finish-n start-n) i)) (aref n-grams (1- i))) (incf (gethash (subseq current-sequence (- (or finish-n start-n) i)) (aref n-grams (1- i)))) (setf (gethash (subseq current-sequence (- (or finish-n start-n) i)) (aref n-grams (1- i))) 1))))) (incf step)))) (defun get-n-gram (state-sequence n) (aref (get-n-grams state-sequence n) (1- n))) (defun n-gram-stats (n-gram &key (alphabet-size nil)) (let ((count 0) (frequencies) (n)) (maphash #'(lambda (key val) (push (cons key val) frequencies) (incf count val) (unless n (setf n (length key)))) n-gram) (if alphabet-size (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram) (expt alphabet-size n))) (values count (hash-table-count n-gram) frequencies))))