d@41
|
1 (in-package "AMUSE-UTILS")
|
d@41
|
2
|
d@50
|
3 (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil))
|
d@134
|
4 "Takes a state sequence and an order, or the bounding indices
|
d@134
|
5 for a range of orders, and returns an array of n-gram-frequency
|
d@134
|
6 hash tables. The order of n-gram is used as the index for the
|
d@134
|
7 n-grams in the returned array. N-grams are represented as equal
|
d@134
|
8 hash tables with lists of states as the keys and counts as
|
d@134
|
9 values. If test-for-duplicates is provided, it is assumed that a)
|
d@134
|
10 consecutive instances of a state to not count as an interesting
|
d@134
|
11 transition for noting and b) that (funcall test-for-duplicates
|
d@134
|
12 state1 state2) returns true if the states are the same by a
|
d@134
|
13 relevant measure. FIXME: Why are these optionals, not keys?"
|
d@41
|
14 (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
|
d@52
|
15 (pointer 0) (step 1) (prev-state)
|
d@41
|
16 (n-grams (make-array (or finish-n start-n)
|
d@41
|
17 :initial-element nil))
|
d@41
|
18 (current-sequence) (last-time))
|
d@41
|
19 (dolist (state state-sequence n-grams)
|
d@50
|
20 (unless (and test-for-duplicates
|
d@50
|
21 prev-state
|
d@50
|
22 (funcall test-for-duplicates state prev-state))
|
d@50
|
23 (setf (aref memory pointer) state
|
d@50
|
24 prev-state state
|
d@50
|
25 current-sequence nil
|
d@52
|
26 last-time nil)
|
d@50
|
27 (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
|
d@50
|
28 (last-time (setf current-sequence (reverse current-sequence)))
|
d@50
|
29 (when (= pointer i)
|
d@50
|
30 (setf last-time t))
|
d@50
|
31 (push (aref memory i) current-sequence))
|
d@50
|
32 (setf pointer (mod (1+ pointer) (length memory)))
|
d@50
|
33 (loop for i from start-n to (or finish-n start-n)
|
d@52
|
34 do (when (>= step i)
|
d@50
|
35 (unless (aref n-grams (1- i))
|
d@50
|
36 (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
|
d@50
|
37 (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
38 (aref n-grams (1- i)))
|
d@50
|
39 (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
40 (aref n-grams (1- i))))
|
d@50
|
41 (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
42 (aref n-grams (1- i)))
|
d@52
|
43 1)))))
|
d@52
|
44 (incf step))))
|
d@41
|
45
|
d@41
|
46 (defun get-n-gram (state-sequence n)
|
d@134
|
47 "Uses get-n-grams to return a single order n n-gram from
|
d@134
|
48 state-sequence"
|
d@41
|
49 (aref (get-n-grams state-sequence n) (1- n)))
|
d@41
|
50
|
d@41
|
51 (defun n-gram-stats (n-gram &key (alphabet-size nil))
|
d@134
|
52 "Summarises the vital statistics of an n-gram, returning values
|
d@134
|
53 for the total number of entries, the number of different n-grams
|
d@134
|
54 occurring and a list of the hash table contents as a list of
|
d@134
|
55 lists. If alphabet-size is provided, an additional value is
|
d@134
|
56 returned for the proportion of n-grams occurring as compared with
|
d@134
|
57 the number possible with alphabet-size symbols."
|
d@41
|
58 (let ((count 0)
|
d@41
|
59 (frequencies)
|
d@41
|
60 (n))
|
d@41
|
61 (maphash #'(lambda (key val)
|
d@41
|
62 (push (cons key val) frequencies)
|
d@41
|
63 (incf count val)
|
d@41
|
64 (unless n
|
d@41
|
65 (setf n (length key))))
|
d@41
|
66 n-gram)
|
d@41
|
67 (if alphabet-size
|
d@41
|
68 (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
|
d@41
|
69 (expt alphabet-size n)))
|
d@41
|
70 (values count (hash-table-count n-gram) frequencies))))
|