summaryrefslogtreecommitdiff
path: root/modules
diff options
context:
space:
mode:
Diffstat (limited to 'modules')
-rw-r--r--modules/lorem-optimum.el (renamed from modules/lorem-generator.el)93
1 files changed, 59 insertions, 34 deletions
diff --git a/modules/lorem-generator.el b/modules/lorem-optimum.el
index 6f0520c6..6ccca55f 100644
--- a/modules/lorem-generator.el
+++ b/modules/lorem-optimum.el
@@ -1,4 +1,4 @@
-;;; lorem-generator.el --- Fake Latin Text Generator -*- coding: utf-8; lexical-binding: t; -*-
+;;; lorem-optimum.el --- Fake Latin Text Generator -*- coding: utf-8; lexical-binding: t; -*-
;;
;; Author: Craig Jennings
;; Version: 0.5
@@ -24,6 +24,19 @@
(require 'cl-lib)
+;;; Configuration
+
+(defvar cj/lipsum-training-file "assets/liber-primus.txt"
+ "Default training file name (relative to `user-emacs-directory`).")
+
+(defvar cj/lipsum-default-file
+ (expand-file-name cj/lipsum-training-file user-emacs-directory)
+ "Default training file for cj-lipsum.
+
+This should be a plain UTF-8 text file with hundreds of Latin words
+or sentences. By default it points to the file specified in
+`cj/lipsum-training-file` relative to `user-emacs-directory`.")
+
(cl-defstruct (cj/markov-chain
(:constructor cj/markov-chain-create))
"An order-two Markov chain."
@@ -31,25 +44,45 @@
(keys nil))
(defun cj/markov-tokenize (text)
- "Split TEXT into tokens: words and punctuation separately."
- (let ((case-fold-search nil))
- (split-string text "\\b" t "[[:space:]\n]+")))
-
+ "Split TEXT into tokens: words and punctuation separately.
+Returns a list of words and punctuation marks as separate tokens."
+ (let ((tokens '())
+ (pos 0)
+ (len (length text)))
+ (while (< pos len)
+ (cond
+ ;; Skip whitespace
+ ((string-match-p "[[:space:]]" (substring text pos (1+ pos)))
+ (setq pos (1+ pos)))
+ ;; Match word (sequence of alphanumeric characters)
+ ((string-match "\\`\\([[:alnum:]]+\\)" (substring text pos))
+ (let ((word (match-string 1 (substring text pos))))
+ (push word tokens)
+ (setq pos (+ pos (length word)))))
+ ;; Match punctuation (single character)
+ ((string-match "\\`\\([[:punct:]]\\)" (substring text pos))
+ (let ((punct (match-string 1 (substring text pos))))
+ (push punct tokens)
+ (setq pos (+ pos (length punct)))))
+ ;; Skip any other character
+ (t (setq pos (1+ pos)))))
+ (nreverse tokens)))
(defun cj/markov-learn (chain text)
"Add TEXT into the Markov CHAIN with tokenized input."
- (let* ((words (cj/markov-tokenize text))
+ (let* ((word-list (cj/markov-tokenize text))
+ ;; Convert to vector for O(1) access instead of O(n) with nth
+ (words (vconcat word-list))
(len (length words)))
(cl-loop for i from 0 to (- len 3)
- for a = (nth i words)
- for b = (nth (1+ i) words)
- for c = (nth (+ i 2) words)
+ for a = (aref words i)
+ for b = (aref words (1+ i))
+ for c = (aref words (+ i 2))
do (let* ((bigram (list a b))
(nexts (gethash bigram (cj/markov-chain-map chain))))
(puthash bigram (cons c nexts)
(cj/markov-chain-map chain)))))
- (setf (cj/markov-chain-keys chain)
- (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
- collect k)))
+ ;; Invalidate cached keys after learning new data
+ (setf (cj/markov-chain-keys chain) nil))
(defun cj/markov-fix-capitalization (sentence)
"Capitalize the first word and the first word after .!? in SENTENCE."
@@ -94,7 +127,7 @@
(defun cj/markov-generate (chain n &optional start)
"Generate a sentence of N tokens from CHAIN."
- (when (cj/markov-chain-keys chain)
+ (when (> (hash-table-count (cj/markov-chain-map chain)) 0)
(let* ((state (or (and start
(gethash start (cj/markov-chain-map chain))
start)
@@ -116,8 +149,16 @@
(cj/markov-join-tokens tokens))))
(defun cj/markov-random-key (chain)
- (nth (random (length (cj/markov-chain-keys chain)))
- (cj/markov-chain-keys chain)))
+ "Return a random bigram key from CHAIN.
+Builds and caches the keys list lazily if not already cached."
+ (unless (cj/markov-chain-keys chain)
+ ;; Lazily build keys list only when needed
+ (setf (cj/markov-chain-keys chain)
+ (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
+ collect k)))
+ (let ((keys (cj/markov-chain-keys chain)))
+ (when keys
+ (nth (random (length keys)) keys))))
(defun cj/markov-next-word (chain bigram)
(let ((candidates (gethash bigram (cj/markov-chain-map chain))))
@@ -182,6 +223,7 @@
(or (cj/markov-next-word cj/lipsum-chain state)
(cadr (cj/markov-random-key cj/lipsum-chain))))))
collect (replace-regexp-in-string "^[[:punct:]]+\\|[[:punct:]]+$" "" w))))
+ ;; Filter empty strings from generated words
(setq words (cl-remove-if #'string-empty-p words))
(mapconcat
(lambda (word idx)
@@ -204,23 +246,6 @@ Defaults: MIN=30, MAX=80."
(let ((len (+ min (random (1+ (- max min))))))
(insert (cj/lipsum len) "\n\n")))))
-;;; Customization
-
-(defgroup cj-lipsum nil
- "Pseudo-Latin lorem ipsum text generator."
- :prefix "cj/lipsum-"
- :group 'text)
-
-(defcustom cj/lipsum-default-file
- (expand-file-name "latin.txt"
- (file-name-directory (or load-file-name buffer-file-name)))
- "Default training file for cj-lipsum.
-
-This should be a plain UTF-8 text file with hundreds of Latin words
-or sentences. By default it points to the bundled `latin.txt`."
- :type 'file
- :group 'cj-lipsum)
-
;;; Initialization: train on default file
(defun cj/lipsum--init ()
"Initialize cj-lipsum by learning from `cj/lipsum-default-file`."
@@ -231,5 +256,5 @@ or sentences. By default it points to the bundled `latin.txt`."
(cj/lipsum--init)
-(provide 'lorem-generator)
-;;; lorem-generator.el ends here.
+(provide 'lorem-optimum)
+;;; lorem-optimum.el ends here.