diff options
| author | Craig Jennings <c@cjennings.net> | 2026-02-03 08:13:01 -0600 |
|---|---|---|
| committer | Craig Jennings <c@cjennings.net> | 2026-02-03 08:13:01 -0600 |
| commit | 8af6ef2f8618687b414f9e6b064cf77b8333d73c (patch) | |
| tree | b4b1cf82b435e0d0b30cf12ba4ee9c47b43be4d7 | |
| parent | 09cfcfd6826f9bc8b379dde88e1d9ca719c1bdb2 (diff) | |
perf(lorem-optimum): fix O(n²) tokenization algorithm
The tokenizer was creating substring copies on every iteration:
- (substring text pos (1+ pos)) for whitespace check
- (substring text pos) for regex matching - copies ALL remaining text
This caused 10K word tokenization to take 727ms instead of 6ms.
Fix: Use string-match with start position parameter and check
characters directly with aref instead of creating substrings.
Performance improvement:
- Tokenize 10K words: 727ms → 6ms (120x faster)
- Learn 10K words: 873ms → 15ms (59x faster)
- Learn 100K words: 70s → 208ms (341x faster)
| -rw-r--r-- | modules/lorem-optimum.el | 40 |
1 files changed, 23 insertions, 17 deletions
diff --git a/modules/lorem-optimum.el b/modules/lorem-optimum.el index 6ccca55f..a8ae3aba 100644 --- a/modules/lorem-optimum.el +++ b/modules/lorem-optimum.el @@ -45,27 +45,33 @@ or sentences. By default it points to the file specified in (defun cj/markov-tokenize (text) "Split TEXT into tokens: words and punctuation separately. -Returns a list of words and punctuation marks as separate tokens." +Returns a list of words and punctuation marks as separate tokens. + +Uses O(n) algorithm by matching at position instead of creating substrings." (let ((tokens '()) (pos 0) (len (length text))) (while (< pos len) - (cond - ;; Skip whitespace - ((string-match-p "[[:space:]]" (substring text pos (1+ pos))) - (setq pos (1+ pos))) - ;; Match word (sequence of alphanumeric characters) - ((string-match "\\`\\([[:alnum:]]+\\)" (substring text pos)) - (let ((word (match-string 1 (substring text pos)))) - (push word tokens) - (setq pos (+ pos (length word))))) - ;; Match punctuation (single character) - ((string-match "\\`\\([[:punct:]]\\)" (substring text pos)) - (let ((punct (match-string 1 (substring text pos)))) - (push punct tokens) - (setq pos (+ pos (length punct))))) - ;; Skip any other character - (t (setq pos (1+ pos))))) + (let ((char (aref text pos))) + (cond + ;; Skip whitespace (check char directly, no substring) + ((memq char '(?\s ?\t ?\n ?\r ?\f)) + (setq pos (1+ pos))) + ;; Match word at position (no substring needed) + ((and (or (<= ?a char ?z) + (<= ?A char ?Z) + (<= ?0 char ?9)) + (string-match "\\([[:alnum:]]+\\)" text pos) + (= (match-beginning 0) pos)) + (push (match-string 1 text) tokens) + (setq pos (match-end 0))) + ;; Match punctuation at position + ((and (string-match "\\([[:punct:]]\\)" text pos) + (= (match-beginning 0) pos)) + (push (match-string 1 text) tokens) + (setq pos (match-end 0))) + ;; Skip any other character + (t (setq pos (1+ pos)))))) (nreverse tokens))) (defun cj/markov-learn (chain text) "Add TEXT into the Markov CHAIN with tokenized 
input." |
