view utils/n-grams.lisp @ 41:90abdf9adb60

monodising and some n-gram utilities darcs-hash:20070614140028-f76cc-9bdeba6db4097e425b1fee4f58a3327eeb486685.gz
author David Lewis <d.lewis@gold.ac.uk>
date Thu, 14 Jun 2007 15:00:28 +0100
parents
children 49aae39b96dc
line wrap: on
line source
(in-package "AMUSE-UTILS")

(defun get-n-grams (state-sequence start-n &optional (finish-n nil))
  (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
	(pointer 0) (step 0)
	(n-grams (make-array (or finish-n start-n)
			     :initial-element nil))
	(current-sequence) (last-time))
    (dolist (state state-sequence n-grams)
      (setf (aref memory pointer) state
	    current-sequence nil
	    last-time nil
	    step (1+ step))
      (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
	  (last-time (setf current-sequence (reverse current-sequence)))
	(when (= pointer i)
	  (setf last-time t))
	(push (aref memory i) current-sequence))
      (setf pointer (mod (1+ pointer) (length memory)))
      (loop for i from start-n to (or finish-n start-n)
	 do (when (> step i)
	      (unless (aref n-grams (1- i))
		(setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
	      (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
			   (aref n-grams (1- i)))
		  (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
				 (aref n-grams (1- i))))
		  (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
				 (aref n-grams (1- i)))
			1)))))))

(defun get-n-gram (state-sequence n)
  (aref (get-n-grams state-sequence n) (1- n)))

(defun n-gram-stats (n-gram &key (alphabet-size nil))
  (let ((count 0)
	(frequencies)
	(n))
    (maphash #'(lambda (key val)
		 (push (cons key val) frequencies)
		 (incf count val)
		 (unless n
		   (setf n (length key))))
	     n-gram)
    (if alphabet-size
	(values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
							       (expt alphabet-size n)))
	(values count (hash-table-count n-gram) frequencies))))