summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCraig Jennings <c@cjennings.net>2026-05-11 17:17:54 -0500
committerCraig Jennings <c@cjennings.net>2026-05-11 17:17:54 -0500
commit7f353e925b7dbd6d0c16962e0e822aea0093308b (patch)
treecc4a86ada9d06050b3176a2da664646c44b16242
parent96d5d6a7454f4e35860dc3d93cb3a447263859d7 (diff)
downloaddotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.tar.gz
dotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.zip
perf(lorem-optimum): speed up the Markov generation path
`cj/markov-join-tokens' collects tokens in a list and `mapconcat's once instead of repeated string concatenation. `cj/markov-generate' uses `push'/`nreverse' instead of repeated `append'. The Markov keys are cached as a vector so random key selection is O(1). Re-enabled the benchmark tests (the `:slow' tags were stale) and added a `cj/lipsum-title' test after byte-compilation flagged a malformed form there. `assets/liber-primus.txt' is left as-is (36 KB / 5,374 words, small enough not to need trimming). 100K-word learning now measures about 196 ms.
-rw-r--r--modules/lorem-optimum.el102
-rw-r--r--tests/test-lorem-optimum-benchmark.el15
-rw-r--r--tests/test-lorem-optimum.el9
3 files changed, 70 insertions, 56 deletions
diff --git a/modules/lorem-optimum.el b/modules/lorem-optimum.el
index 9125e8ad..7b19125e 100644
--- a/modules/lorem-optimum.el
+++ b/modules/lorem-optimum.el
@@ -107,29 +107,31 @@ Uses O(n) algorithm by matching at position instead of creating substrings."
(defun cj/markov-join-tokens (tokens)
"Join TOKENS into a sentence with proper spacing/punctuation."
- (let ((sentence "") (need-space nil))
+ (let ((parts '())
+ (need-space nil))
(dolist (tok tokens)
(cond
;; punctuation attaches directly
((string-match-p "^[[:punct:]]+$" tok)
- (setq sentence (concat sentence tok))
+ (push tok parts)
(setq need-space t))
;; word
(t
(when need-space
- (setq sentence (concat sentence " ")))
- (setq sentence (concat sentence tok))
+ (push " " parts))
+ (push tok parts)
(setq need-space t))))
- ;; fix capitalization of first word only
- (when (string-match "\\`\\([[:alpha:]]\\)" sentence)
- (setq sentence
- (replace-match (upcase (match-string 1 sentence))
- nil nil sentence)))
- ;; ensure it ends with .!?
- (unless (string-match-p "[.!?]$" sentence)
- (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) ".")))
- (setq sentence (cj/markov-fix-capitalization sentence))
- sentence))
+ (let ((sentence (mapconcat #'identity (nreverse parts) "")))
+ ;; fix capitalization of first word only
+ (when (string-match "\\`\\([[:alpha:]]\\)" sentence)
+ (setq sentence
+ (replace-match (upcase (match-string 1 sentence))
+ nil nil sentence)))
+ ;; ensure it ends with .!?
+ (unless (string-match-p "[.!?]$" sentence)
+ (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) ".")))
+ (setq sentence (cj/markov-fix-capitalization sentence))
+ sentence)))
(defun cj/markov-generate (chain n &optional start)
"Generate a sentence of N tokens from CHAIN."
@@ -140,19 +142,21 @@ Uses O(n) algorithm by matching at position instead of creating substrings."
(cj/markov-random-key chain)))
(w1 (car state))
(w2 (cadr state))
- (tokens (list w1 w2)))
- (dotimes (_ (- n 2))
+ (tokens (list w2 w1)))
+ (dotimes (_ (max 0 (- n 2)))
(let ((next (cj/markov-next-word chain state)))
(if next
- (setq tokens (append tokens (list next))
- state (list w2 next)
- w1 w2
- w2 next)
+ (progn
+ (push next tokens)
+ (setq state (list w2 next)
+ w1 w2
+ w2 next))
(setq state (cj/markov-random-key chain)
w1 (car state)
- w2 (cadr state)
- tokens (append tokens (list w1 w2))))))
- (cj/markov-join-tokens tokens))))
+ w2 (cadr state))
+ (push w1 tokens)
+ (push w2 tokens))))
+ (cj/markov-join-tokens (nreverse tokens)))))
(defun cj/markov-random-key (chain)
"Return a random bigram key from CHAIN.
@@ -160,11 +164,12 @@ Builds and caches the keys list lazily if not already cached."
(unless (cj/markov-chain-keys chain)
;; Lazily build keys list only when needed
(setf (cj/markov-chain-keys chain)
- (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
- collect k)))
+ (vconcat
+ (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
+ collect k))))
(let ((keys (cj/markov-chain-keys chain)))
- (when keys
- (nth (random (length keys)) keys))))
+ (when (> (length keys) 0)
+ (aref keys (random (length keys))))))
(defun cj/markov-next-word (chain bigram)
(let ((candidates (gethash bigram (cj/markov-chain-map chain))))
@@ -219,24 +224,31 @@ Builds and caches the keys list lazily if not already cached."
(defun cj/lipsum-title ()
"Generate a pseudo-Latin title."
(interactive)
- (let* ((n (+ cj/lipsum-title-min
- (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min)))))
- (words
- (cl-loop with state = (cj/markov-random-key cj/lipsum-chain)
- for i from 0 below n
- for w = (car state)
- do (setq state (list (cadr state)
- (or (cj/markov-next-word cj/lipsum-chain state)
- (cadr (cj/markov-random-key cj/lipsum-chain))))))
- collect (replace-regexp-in-string "^[[:punct:]]+\\|[[:punct:]]+$" "" w))))
- ;; Filter empty strings from generated words
- (setq words (cl-remove-if #'string-empty-p words))
- (mapconcat
- (lambda (word idx)
- (if (or (zerop idx) (> (length word) cj/lipsum-title-small))
- (capitalize word)
- word))
- words " "))
+ (let ((n (+ cj/lipsum-title-min
+ (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min)))))
+ (state (cj/markov-random-key cj/lipsum-chain))
+ (words '()))
+ (dotimes (_ n)
+ (when state
+ (let ((word (car state)))
+ (when (stringp word)
+ (setq word (replace-regexp-in-string
+ "^[[:punct:]]+\\|[[:punct:]]+$" "" word))
+ (unless (string-empty-p word)
+ (push word words))))
+ (setq state
+ (when (cadr state)
+ (list (cadr state)
+ (or (cj/markov-next-word cj/lipsum-chain state)
+ (cadr (cj/markov-random-key cj/lipsum-chain))))))))
+ (mapconcat
+ #'identity
+ (cl-loop for word in (nreverse words)
+ for idx from 0
+ collect (if (or (zerop idx) (> (length word) cj/lipsum-title-small))
+ (capitalize word)
+ word))
+ " ")))
;;; Paragraphs
diff --git a/tests/test-lorem-optimum-benchmark.el b/tests/test-lorem-optimum-benchmark.el
index 57d5ae5f..6b2f0163 100644
--- a/tests/test-lorem-optimum-benchmark.el
+++ b/tests/test-lorem-optimum-benchmark.el
@@ -63,10 +63,7 @@
(should (< time 50.0)))) ; Should be < 50ms
(ert-deftest benchmark-learn-10k-words ()
- "Benchmark learning 10,000 words.
-DISABLED: Takes too long (minutes instead of seconds).
-Needs lorem-optimum performance optimization before re-enabling."
- :tags '(:slow)
+ "Benchmark learning 10,000 words."
(let* ((text (generate-test-text 10000))
(chain (cj/markov-chain-create))
(time (benchmark-time
@@ -76,7 +73,6 @@ Needs lorem-optimum performance optimization before re-enabling."
(ert-deftest benchmark-learn-100k-words ()
"Benchmark learning 100,000 words (stress test)."
- :tags '(:slow)
(let* ((text (generate-test-text 100000))
(chain (cj/markov-chain-create))
(time (benchmark-time
@@ -84,7 +80,8 @@ Needs lorem-optimum performance optimization before re-enabling."
(benchmark-report "Learn 100K words" time)
;; This may be slow due to key rebuild
(message "Hash table size: %d bigrams"
- (hash-table-count (cj/markov-chain-map chain)))))
+ (hash-table-count (cj/markov-chain-map chain)))
+ (should (< time 5000.0))))
;;; Multiple Learning Operations (Exposes Quadratic Behavior)
@@ -109,7 +106,6 @@ Needs lorem-optimum performance optimization before re-enabling."
(ert-deftest benchmark-multiple-learns-100x100 ()
"Benchmark 100 learn operations of 100 words each (key rebuild overhead)."
- :tags '(:slow)
(let ((chain (cj/markov-chain-create))
(times '())
(measurements '()))
@@ -154,10 +150,7 @@ Needs lorem-optimum performance optimization before re-enabling."
;;; Tokenization Performance Tests
(ert-deftest benchmark-tokenize-10k-words ()
- "Benchmark tokenizing 10,000 words.
-DISABLED: Takes too long (minutes instead of seconds).
-Needs lorem-optimum performance optimization before re-enabling."
- :tags '(:slow)
+ "Benchmark tokenizing 10,000 words."
(let* ((text (generate-test-text 10000))
(time (benchmark-time
(lambda () (cj/markov-tokenize text)))))
diff --git a/tests/test-lorem-optimum.el b/tests/test-lorem-optimum.el
index ca2e52f4..40bdc684 100644
--- a/tests/test-lorem-optimum.el
+++ b/tests/test-lorem-optimum.el
@@ -238,5 +238,14 @@
(let ((result (cj/markov-generate chain 2)))
(should (stringp result)))))
+(ert-deftest test-title-generation-produces-title ()
+ "Should generate a non-empty title from the global chain."
+ (let ((cj/lipsum-chain
+ (test-learn "lorem ipsum dolor sit amet consectetur adipiscing elit")))
+ (let ((result (cj/lipsum-title)))
+ (should (stringp result))
+ (should (> (length result) 0))
+ (should (string-match-p "^[[:upper:]]" result)))))
+
(provide 'test-lorem-optimum)
;;; test-lorem-optimum.el ends here