perf(lorem-optimum): speed up the Markov generation path

`cj/markov-join-tokens' collects tokens in a list and `mapconcat's once instead of repeated string concatenation. `cj/markov-generate' uses `push'/`nreverse' instead of repeated `append'. The Markov keys are cached as a vector so random key selection is O(1). Re-enabled the benchmark tests (the `:slow' tags were stale) and added a `cj/lipsum-title' test after byte-compilation flagged a malformed form there. `assets/liber-primus.txt' is left as-is (36 KB / 5,374 words, small enough not to need trimming). 100K-word learning now measures about 196 ms.
author: Craig Jennings <c@cjennings.net> 2026-05-11 17:17:54 -0500
committer: Craig Jennings <c@cjennings.net> 2026-05-11 17:17:54 -0500
commit: 7f353e925b7dbd6d0c16962e0e822aea0093308b (patch)
tree: cc4a86ada9d06050b3176a2da664646c44b16242
parent: 96d5d6a7454f4e35860dc3d93cb3a447263859d7 (diff)
download: dotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.tar.gz
dotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.zip
3 files changed, 70 insertions, 56 deletions
diff --git a/modules/lorem-optimum.el b/modules/lorem-optimum.el
index 9125e8ad..7b19125e 100644
--- a/modules/lorem-optimum.el
+++ b/modules/lorem-optimum.el
@@ -107,29 +107,31 @@ Uses O(n) algorithm by matching at position instead of creating substrings."
 
 (defun cj/markov-join-tokens (tokens)
   "Join TOKENS into a sentence with proper spacing/punctuation."
-  (let ((sentence "") (need-space nil))
+  (let ((parts '())
+        (need-space nil))
 	(dolist (tok tokens)
 	  (cond
 	   ;; punctuation attaches directly
 	   ((string-match-p "^[[:punct:]]+$" tok)
-		(setq sentence (concat sentence tok))
+		(push tok parts)
 		(setq need-space t))
 	   ;; word
 	   (t
 		(when need-space
-		  (setq sentence (concat sentence " ")))
-		(setq sentence (concat sentence tok))
+		  (push " " parts))
+		(push tok parts)
 		(setq need-space t))))
-	;; fix capitalization of first word only
-	(when (string-match "\\`\\([[:alpha:]]\\)" sentence)
-	  (setq sentence
-			(replace-match (upcase (match-string 1 sentence))
-						   nil nil sentence)))
-	;; ensure it ends with .!?
-	(unless (string-match-p "[.!?]$" sentence)
-	  (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) ".")))
-  (setq sentence (cj/markov-fix-capitalization sentence))
-		sentence))
+    (let ((sentence (mapconcat #'identity (nreverse parts) "")))
+	  ;; fix capitalization of first word only
+	  (when (string-match "\\`\\([[:alpha:]]\\)" sentence)
+	    (setq sentence
+			  (replace-match (upcase (match-string 1 sentence))
+						     nil nil sentence)))
+	  ;; ensure it ends with .!?
+	  (unless (string-match-p "[.!?]$" sentence)
+	    (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) ".")))
+      (setq sentence (cj/markov-fix-capitalization sentence))
+	  sentence)))
 
 (defun cj/markov-generate (chain n &optional start)
   "Generate a sentence of N tokens from CHAIN."
@@ -140,19 +142,21 @@ Uses O(n) algorithm by matching at position instead of creating substrings."
 					  (cj/markov-random-key chain)))
 		   (w1 (car state))
 		   (w2 (cadr state))
-		   (tokens (list w1 w2)))
-	  (dotimes (_ (- n 2))
+		   (tokens (list w2 w1)))
+	  (dotimes (_ (max 0 (- n 2)))
 		(let ((next (cj/markov-next-word chain state)))
 		  (if next
-			  (setq tokens (append tokens (list next))
-					state (list w2 next)
-					w1 w2
-					w2 next)
+			  (progn
+			    (push next tokens)
+			    (setq state (list w2 next)
+				      w1 w2
+				      w2 next))
 			(setq state (cj/markov-random-key chain)
 				  w1 (car state)
-				  w2 (cadr state)
-				  tokens (append tokens (list w1 w2))))))
-	  (cj/markov-join-tokens tokens))))
+				  w2 (cadr state))
+			(push w1 tokens)
+			(push w2 tokens))))
+	  (cj/markov-join-tokens (nreverse tokens)))))
 
 (defun cj/markov-random-key (chain)
   "Return a random bigram key from CHAIN.
@@ -160,11 +164,12 @@ Builds and caches the keys list lazily if not already cached."
   (unless (cj/markov-chain-keys chain)
     ;; Lazily build keys list only when needed
     (setf (cj/markov-chain-keys chain)
-          (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
-                   collect k)))
+          (vconcat
+           (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
+                    collect k))))
   (let ((keys (cj/markov-chain-keys chain)))
-    (when keys
-      (nth (random (length keys)) keys))))
+    (when (> (length keys) 0)
+      (aref keys (random (length keys))))))
 
 (defun cj/markov-next-word (chain bigram)
   (let ((candidates (gethash bigram (cj/markov-chain-map chain))))
@@ -219,24 +224,31 @@ Builds and caches the keys list lazily if not already cached."
 (defun cj/lipsum-title ()
   "Generate a pseudo-Latin title."
   (interactive)
-  (let* ((n (+ cj/lipsum-title-min
-			   (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min)))))
-		 (words
-		  (cl-loop with state = (cj/markov-random-key cj/lipsum-chain)
-				   for i from 0 below n
-				   for w = (car state)
-				   do (setq state (list (cadr state)
-										(or (cj/markov-next-word cj/lipsum-chain state)
-											(cadr (cj/markov-random-key cj/lipsum-chain))))))
-				   collect (replace-regexp-in-string "^[[:punct:]]+\\|[[:punct:]]+$" "" w))))
-	;; Filter empty strings from generated words
-	(setq words (cl-remove-if #'string-empty-p words))
-	(mapconcat
-	 (lambda (word idx)
-	   (if (or (zerop idx) (> (length word) cj/lipsum-title-small))
-		   (capitalize word)
-		 word))
-	 words " "))
+  (let ((n (+ cj/lipsum-title-min
+			  (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min)))))
+        (state (cj/markov-random-key cj/lipsum-chain))
+        (words '()))
+    (dotimes (_ n)
+      (when state
+        (let ((word (car state)))
+          (when (stringp word)
+            (setq word (replace-regexp-in-string
+                        "^[[:punct:]]+\\|[[:punct:]]+$" "" word))
+            (unless (string-empty-p word)
+              (push word words))))
+        (setq state
+              (when (cadr state)
+                (list (cadr state)
+                      (or (cj/markov-next-word cj/lipsum-chain state)
+                          (cadr (cj/markov-random-key cj/lipsum-chain))))))))
+    (mapconcat
+     #'identity
+     (cl-loop for word in (nreverse words)
+              for idx from 0
+              collect (if (or (zerop idx) (> (length word) cj/lipsum-title-small))
+		                  (capitalize word)
+		                word))
+     " ")))
 
 ;;; Paragraphs
 
diff --git a/tests/test-lorem-optimum-benchmark.el b/tests/test-lorem-optimum-benchmark.el
index 57d5ae5f..6b2f0163 100644
--- a/tests/test-lorem-optimum-benchmark.el
+++ b/tests/test-lorem-optimum-benchmark.el
@@ -63,10 +63,7 @@
     (should (< time 50.0))))  ; Should be < 50ms
 
 (ert-deftest benchmark-learn-10k-words ()
-  "Benchmark learning 10,000 words.
-DISABLED: Takes too long (minutes instead of seconds).
-Needs lorem-optimum performance optimization before re-enabling."
-  :tags '(:slow)
+  "Benchmark learning 10,000 words."
   (let* ((text (generate-test-text 10000))
          (chain (cj/markov-chain-create))
          (time (benchmark-time
@@ -76,7 +73,6 @@ Needs lorem-optimum performance optimization before re-enabling."
 
 (ert-deftest benchmark-learn-100k-words ()
   "Benchmark learning 100,000 words (stress test)."
-  :tags '(:slow)
   (let* ((text (generate-test-text 100000))
          (chain (cj/markov-chain-create))
          (time (benchmark-time
@@ -84,7 +80,8 @@ Needs lorem-optimum performance optimization before re-enabling."
     (benchmark-report "Learn 100K words" time)
     ;; This may be slow due to key rebuild
     (message "Hash table size: %d bigrams"
-             (hash-table-count (cj/markov-chain-map chain)))))
+             (hash-table-count (cj/markov-chain-map chain)))
+    (should (< time 5000.0))))
 
 ;;; Multiple Learning Operations (Exposes Quadratic Behavior)
 
@@ -109,7 +106,6 @@ Needs lorem-optimum performance optimization before re-enabling."
 
 (ert-deftest benchmark-multiple-learns-100x100 ()
   "Benchmark 100 learn operations of 100 words each (key rebuild overhead)."
-  :tags '(:slow)
   (let ((chain (cj/markov-chain-create))
         (times '())
         (measurements '()))
@@ -154,10 +150,7 @@ Needs lorem-optimum performance optimization before re-enabling."
 ;;; Tokenization Performance Tests
 
 (ert-deftest benchmark-tokenize-10k-words ()
-  "Benchmark tokenizing 10,000 words.
-DISABLED: Takes too long (minutes instead of seconds).
-Needs lorem-optimum performance optimization before re-enabling."
-  :tags '(:slow)
+  "Benchmark tokenizing 10,000 words."
   (let* ((text (generate-test-text 10000))
          (time (benchmark-time
                 (lambda () (cj/markov-tokenize text)))))
diff --git a/tests/test-lorem-optimum.el b/tests/test-lorem-optimum.el
index ca2e52f4..40bdc684 100644
--- a/tests/test-lorem-optimum.el
+++ b/tests/test-lorem-optimum.el
@@ -238,5 +238,14 @@
     (let ((result (cj/markov-generate chain 2)))
       (should (stringp result)))))
 
+(ert-deftest test-title-generation-produces-title ()
+  "Should generate a non-empty title from the global chain."
+  (let ((cj/lipsum-chain
+         (test-learn "lorem ipsum dolor sit amet consectetur adipiscing elit")))
+    (let ((result (cj/lipsum-title)))
+      (should (stringp result))
+      (should (> (length result) 0))
+      (should (string-match-p "^[[:upper:]]" result)))))
+
 (provide 'test-lorem-optimum)
 ;;; test-lorem-optimum.el ends here
author	Craig Jennings <c@cjennings.net>	2026-05-11 17:17:54 -0500
committer	Craig Jennings <c@cjennings.net>	2026-05-11 17:17:54 -0500
commit	7f353e925b7dbd6d0c16962e0e822aea0093308b (patch)
tree	cc4a86ada9d06050b3176a2da664646c44b16242
parent	96d5d6a7454f4e35860dc3d93cb3a447263859d7 (diff)
download	dotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.tar.gz dotemacs-7f353e925b7dbd6d0c16962e0e822aea0093308b.zip