d@41
|
1 (in-package "AMUSE-UTILS")
|
d@41
|
2
|
d@50
|
3 (defun get-n-grams (state-sequence start-n &optional (finish-n nil) (test-for-duplicates nil))
|
d@41
|
4 (let ((memory (make-array (or finish-n start-n) :initial-element :BLANK))
|
d@52
|
5 (pointer 0) (step 1) (prev-state)
|
d@41
|
6 (n-grams (make-array (or finish-n start-n)
|
d@41
|
7 :initial-element nil))
|
d@41
|
8 (current-sequence) (last-time))
|
d@41
|
9 (dolist (state state-sequence n-grams)
|
d@50
|
10 (unless (and test-for-duplicates
|
d@50
|
11 prev-state
|
d@50
|
12 (funcall test-for-duplicates state prev-state))
|
d@50
|
13 (setf (aref memory pointer) state
|
d@50
|
14 prev-state state
|
d@50
|
15 current-sequence nil
|
d@52
|
16 last-time nil)
|
d@50
|
17 (do ((i (mod (1+ pointer) (length memory)) (mod (1+ i) (length memory))))
|
d@50
|
18 (last-time (setf current-sequence (reverse current-sequence)))
|
d@50
|
19 (when (= pointer i)
|
d@50
|
20 (setf last-time t))
|
d@50
|
21 (push (aref memory i) current-sequence))
|
d@50
|
22 (setf pointer (mod (1+ pointer) (length memory)))
|
d@50
|
23 (loop for i from start-n to (or finish-n start-n)
|
d@52
|
24 do (when (>= step i)
|
d@50
|
25 (unless (aref n-grams (1- i))
|
d@50
|
26 (setf (aref n-grams (1- i)) (make-hash-table :test #'equal)))
|
d@50
|
27 (if (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
28 (aref n-grams (1- i)))
|
d@50
|
29 (incf (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
30 (aref n-grams (1- i))))
|
d@50
|
31 (setf (gethash (subseq current-sequence (- (or finish-n start-n) i))
|
d@50
|
32 (aref n-grams (1- i)))
|
d@52
|
33 1)))))
|
d@52
|
34 (incf step))))
|
d@41
|
35
|
d@41
|
36 (defun get-n-gram (state-sequence n)
|
d@41
|
37 (aref (get-n-grams state-sequence n) (1- n)))
|
d@41
|
38
|
d@41
|
39 (defun n-gram-stats (n-gram &key (alphabet-size nil))
|
d@41
|
40 (let ((count 0)
|
d@41
|
41 (frequencies)
|
d@41
|
42 (n))
|
d@41
|
43 (maphash #'(lambda (key val)
|
d@41
|
44 (push (cons key val) frequencies)
|
d@41
|
45 (incf count val)
|
d@41
|
46 (unless n
|
d@41
|
47 (setf n (length key))))
|
d@41
|
48 n-gram)
|
d@41
|
49 (if alphabet-size
|
d@41
|
50 (values count (hash-table-count n-gram) frequencies (/ (hash-table-count n-gram)
|
d@41
|
51 (expt alphabet-size n)))
|
d@41
|
52 (values count (hash-table-count n-gram) frequencies))))
|