author     Craig Jennings <c@cjennings.net>  2026-02-03 08:13:01 -0600
committer  Craig Jennings <c@cjennings.net>  2026-02-03 08:13:01 -0600
commit     8af6ef2f8618687b414f9e6b064cf77b8333d73c (patch)
tree       b4b1cf82b435e0d0b30cf12ba4ee9c47b43be4d7 /modules
parent     09cfcfd6826f9bc8b379dde88e1d9ca719c1bdb2 (diff)
perf(lorem-optimum): fix O(n²) tokenization algorithm
The tokenizer was creating substring copies on every iteration:

- (substring text pos (1+ pos)) for the whitespace check
- (substring text pos) for regex matching, which copies ALL remaining text

This caused 10K-word tokenization to take 727ms instead of 6ms.

Fix: use string-match with its start-position parameter and check characters
directly with aref instead of creating substrings.

Performance improvement:
- Tokenize 10K words:  727ms → 6ms (120x faster)
- Learn 10K words:     873ms → 15ms (59x faster)
- Learn 100K words:    70s → 208ms (341x faster)
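For anyone wanting to reproduce timings like the ones above, the sketch below
uses Emacs's built-in benchmark-run macro. It is a minimal sketch and not part
of this commit: sample-text is a hypothetical ~10K-word string built inline,
and absolute numbers will vary by machine.

(require 'benchmark)

;; Build a throwaway ~10K-word string (2000 repetitions of a 5-word phrase).
(let ((sample-text (mapconcat #'identity
                              (make-list 2000 "lorem ipsum dolor sit amet,")
                              " ")))
  ;; benchmark-run returns (elapsed-seconds gc-runs gc-elapsed)
  ;; for a single tokenization pass.
  (benchmark-run 1
    (cj/markov-tokenize sample-text)))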
Diffstat (limited to 'modules')
-rw-r--r--  modules/lorem-optimum.el | 40
1 file changed, 23 insertions(+), 17 deletions(-)
diff --git a/modules/lorem-optimum.el b/modules/lorem-optimum.el
index 6ccca55f..a8ae3aba 100644
--- a/modules/lorem-optimum.el
+++ b/modules/lorem-optimum.el
@@ -45,27 +45,33 @@ or sentences. By default it points to the file specified in
(defun cj/markov-tokenize (text)
"Split TEXT into tokens: words and punctuation separately.
-Returns a list of words and punctuation marks as separate tokens."
+Returns a list of words and punctuation marks as separate tokens.
+
+Uses O(n) algorithm by matching at position instead of creating substrings."
(let ((tokens '())
(pos 0)
(len (length text)))
(while (< pos len)
- (cond
- ;; Skip whitespace
- ((string-match-p "[[:space:]]" (substring text pos (1+ pos)))
- (setq pos (1+ pos)))
- ;; Match word (sequence of alphanumeric characters)
- ((string-match "\\`\\([[:alnum:]]+\\)" (substring text pos))
- (let ((word (match-string 1 (substring text pos))))
- (push word tokens)
- (setq pos (+ pos (length word)))))
- ;; Match punctuation (single character)
- ((string-match "\\`\\([[:punct:]]\\)" (substring text pos))
- (let ((punct (match-string 1 (substring text pos))))
- (push punct tokens)
- (setq pos (+ pos (length punct)))))
- ;; Skip any other character
- (t (setq pos (1+ pos)))))
+ (let ((char (aref text pos)))
+ (cond
+ ;; Skip whitespace (check char directly, no substring)
+ ((memq char '(?\s ?\t ?\n ?\r ?\f))
+ (setq pos (1+ pos)))
+ ;; Match word at position (no substring needed)
+ ((and (or (<= ?a char ?z)
+ (<= ?A char ?Z)
+ (<= ?0 char ?9))
+ (string-match "\\([[:alnum:]]+\\)" text pos)
+ (= (match-beginning 0) pos))
+ (push (match-string 1 text) tokens)
+ (setq pos (match-end 0)))
+ ;; Match punctuation at position
+ ((and (string-match "\\([[:punct:]]\\)" text pos)
+ (= (match-beginning 0) pos))
+ (push (match-string 1 text) tokens)
+ (setq pos (match-end 0)))
+ ;; Skip any other character
+ (t (setq pos (1+ pos))))))
(nreverse tokens)))
(defun cj/markov-learn (chain text)
"Add TEXT into the Markov CHAIN with tokenized input."