From 8af6ef2f8618687b414f9e6b064cf77b8333d73c Mon Sep 17 00:00:00 2001
From: Craig Jennings <c@cjennings.net>
Date: Tue, 3 Feb 2026 08:13:01 -0600
Subject: =?UTF-8?q?perf(lorem-optimum):=20fix=20O(n=C2=B2)=20tokenization?=
 =?UTF-8?q?=20algorithm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tokenizer was creating substring copies on every iteration:
- (substring text pos (1+ pos)) for whitespace check
- (substring text pos) for regex matching - copies ALL remaining text

This caused 10K word tokenization to take 727ms instead of 6ms.

Fix: Use string-match with start position parameter and check
characters directly with aref instead of creating substrings.

Performance improvement:
- Tokenize 10K words: 727ms → 6ms (120x faster)
- Learn 10K words: 873ms → 15ms (59x faster)
- Learn 100K words: 70s → 208ms (341x faster)
---
 modules/lorem-optimum.el | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

(limited to 'modules/lorem-optimum.el')

diff --git a/modules/lorem-optimum.el b/modules/lorem-optimum.el
index 6ccca55f..a8ae3aba 100644
--- a/modules/lorem-optimum.el
+++ b/modules/lorem-optimum.el
@@ -45,27 +45,33 @@ or sentences.  By default it points to the file specified in
 
 (defun cj/markov-tokenize (text)
   "Split TEXT into tokens: words and punctuation separately.
-Returns a list of words and punctuation marks as separate tokens."
+Returns a list of words and punctuation marks as separate tokens.
+
+Uses O(n) algorithm by matching at position instead of creating substrings."
   (let ((tokens '())
         (pos 0)
         (len (length text)))
     (while (< pos len)
-      (cond
-       ;; Skip whitespace
-       ((string-match-p "[[:space:]]" (substring text pos (1+ pos)))
-        (setq pos (1+ pos)))
-       ;; Match word (sequence of alphanumeric characters)
-       ((string-match "\\`\\([[:alnum:]]+\\)" (substring text pos))
-        (let ((word (match-string 1 (substring text pos))))
-          (push word tokens)
-          (setq pos (+ pos (length word)))))
-       ;; Match punctuation (single character)
-       ((string-match "\\`\\([[:punct:]]\\)" (substring text pos))
-        (let ((punct (match-string 1 (substring text pos))))
-          (push punct tokens)
-          (setq pos (+ pos (length punct)))))
-       ;; Skip any other character
-       (t (setq pos (1+ pos)))))
+      (let ((char (aref text pos)))
+        (cond
+         ;; Skip whitespace (check char directly, no substring)
+         ((memq char '(?\s ?\t ?\n ?\r ?\f))
+          (setq pos (1+ pos)))
+         ;; Match word at position (no substring needed)
+         ((and (or (<= ?a char ?z)
+                   (<= ?A char ?Z)
+                   (<= ?0 char ?9))
+               (string-match "\\([[:alnum:]]+\\)" text pos)
+               (= (match-beginning 0) pos))
+          (push (match-string 1 text) tokens)
+          (setq pos (match-end 0)))
+         ;; Match punctuation at position
+         ((and (string-match "\\([[:punct:]]\\)" text pos)
+               (= (match-beginning 0) pos))
+          (push (match-string 1 text) tokens)
+          (setq pos (match-end 0)))
+         ;; Skip any other character
+         (t (setq pos (1+ pos))))))
     (nreverse tokens)))
 (defun cj/markov-learn (chain text)
   "Add TEXT into the Markov CHAIN with tokenized input."
-- 
cgit v1.2.3