diff utils/n-grams.lisp @ 41:90abdf9adb60

monodising and some n-gram utilities darcs-hash:20070614140028-f76cc-9bdeba6db4097e425b1fee4f58a3327eeb486685.gz
author David Lewis <d.lewis@gold.ac.uk>
date Thu, 14 Jun 2007 15:00:28 +0100
parents
children 49aae39b96dc
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/n-grams.lisp	Thu Jun 14 15:00:28 2007 +0100
@@ -0,0 +1,48 @@
+(in-package "AMUSE-UTILS")
+
+(defun get-n-grams (state-sequence start-n &optional (finish-n nil))
+  (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
+	(pointer 0) (step 0)
+	(n-grams (make-array (or finish-n start-n)
+			     :initial-element nil))
+	(current-sequence) (last-time))
+    (dolist (state state-sequence n-grams)
+      (setf (aref memory pointer) state
+	    current-sequence nil
+	    last-time nil
+	    step (1+ step))
+      (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
+	  (last-time (setf current-sequence (reverse current-sequence)))
+	(when (= pointer i)
+	  (setf last-time t))
+	(push (aref memory i) current-sequence))
+      (setf pointer (mod (1+ pointer) (length memory)))
+      (loop for i from start-n to (or finish-n start-n)
+	 do (when (> step i)
+	      (unless (aref n-grams (1- i))
+		(setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
+	      (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
+			   (aref n-grams (1- i)))
+		  (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
+				 (aref n-grams (1- i))))
+		  (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
+				 (aref n-grams (1- i)))
+			1)))))))
+
+(defun get-n-gram (state-sequence n)
+  (aref (get-n-grams state-sequence n) (1- n)))
+
+(defun n-gram-stats (n-gram &key (alphabet-size nil))
+  (let ((count 0)
+	(frequencies)
+	(n))
+    (maphash #'(lambda (key val)
+		 (push (cons key val) frequencies)
+		 (incf count val)
+		 (unless n
+		   (setf n (length key))))
+	     n-gram)
+    (if alphabet-size
+	(values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
+							       (expt alphabet-size n)))
+	(values count (hash-table-count n-gram) frequencies))))