d@41: (in-package "AMUSE-UTILS") d@41: d@50: (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil)) d@134: "Takes a state sequence and an order, or the bounding indices d@134: for a range of orders, and returns an array of n-gram-frequency d@134: hash tables. The order of n-gram is used as the index for the d@134: n-grams in the returned array. N-grams are represented as equal d@134: hash tables with lists of states as the keys and counts as d@134: values. If test-for-duplicates is provided, it is assumed that a) d@134: consecutive instances of a state to not count as an interesting d@134: transition for noting and b) that (funcall test-for-duplicates d@134: state1 state2) returns true if the states are the same by a d@134: relevant measure. FIXME: Why are these optionals, not keys?" d@41: (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK)) d@52: (pointer 0) (step 1) (prev-state) d@41: (n-grams (make-array (or finish-n start-n) d@41: :initial-element nil)) d@41: (current-sequence) (last-time)) d@41: (dolist (state state-sequence n-grams) d@50: (unless (and test-for-duplicates d@50: prev-state d@50: (funcall test-for-duplicates state prev-state)) d@50: (setf (aref memory pointer) state d@50: prev-state state d@50: current-sequence nil d@52: last-time nil) d@50: (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory)))) d@50: (last-time (setf current-sequence (reverse current-sequence))) d@50: (when (= pointer i) d@50: (setf last-time t)) d@50: (push (aref memory i) current-sequence)) d@50: (setf pointer (mod (1+ pointer) (length memory))) d@50: (loop for i from start-n to (or finish-n start-n) d@52: do (when (>= step i) d@50: (unless (aref n-grams (1- i)) d@50: (setf (aref n-grams (1- i)) (make-hash-table :test #'equal))) d@50: (if (gethash (subseq current-sequence (- (or finish-n start-n) i)) d@50: (aref n-grams (1- i))) d@50: (incf (gethash (subseq current-sequence (- (or finish-n start-n) i)) d@50: (aref n-grams (1- i)))) d@50: (setf (gethash (subseq current-sequence (- (or finish-n start-n) i)) d@50: (aref n-grams (1- i))) d@52: 1))))) d@52: (incf step)))) d@41: d@41: (defun get-n-gram (state-sequence n) d@134: "Uses get-n-grams to return a single order n n-gram from d@134: state-sequence" d@41: (aref (get-n-grams state-sequence n) (1- n))) d@41: d@41: (defun n-gram-stats (n-gram &key (alphabet-size nil)) d@134: "Summarises the vital statistics of an n-gram, returning values d@134: for the total number of entries, the number of different n-grams d@134: occurring and a list of the hash table contents as a list of d@134: lists. If alphabet-size is provided, an additional value is d@134: returned for the proportion of n-grams occurring as compared with d@134: the number possible with alphabet-size symbols." d@41: (let ((count 0) d@41: (frequencies) d@41: (n)) d@41: (maphash #'(lambda (key val) d@41: (push (cons key val) frequencies) d@41: (incf count val) d@41: (unless n d@41: (setf n (length key)))) d@41: n-gram) d@41: (if alphabet-size d@41: (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram) d@41: (expt alphabet-size n))) d@41: (values count (hash-table-count n-gram) frequencies))))