summaryrefslogtreecommitdiff
path: root/modules/lorem-optimum.el
blob: 6ccca55f1a03b43be4dc0c1b971a7ba3ef15fb81 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
;;; lorem-optimum.el --- Fake Latin Text Generator -*- coding: utf-8; lexical-binding: t; -*-
;;
;; Author: Craig Jennings
;; Version: 0.5
;; Package-Requires: ((emacs "26.1") (cl-lib "0.6"))
;; Keywords: text, lorem-ipsum, dummy, filler, markov
;; URL: https://github.com/yourname/cj-lipsum

;;; Commentary:
;; Generate pseudo-Latin placeholder text using a simple order-two
;; Markov chain.  You can train the chain on region, buffer, or file.
;; By default, it learns from a bundled Latin wordlist, which you can
;; change via customization.
;;
;; Interactive commands:
;;   M-x cj/lipsum             – Return N words
;;   M-x cj/lipsum-insert      – Insert N words at point
;;   M-x cj/lipsum-title       – Generate a pseudo-Latin heading
;;   M-x cj/lipsum-paragraphs  – Insert multiple paragraphs
;;   M-x cj/lipsum-learn-*     – Train the chain
;;   M-x cj/lipsum-reset       – Clear chain

;;; Code:

(require 'cl-lib)

;;; Configuration

(defvar cj/lipsum-training-file "assets/liber-primus.txt"
  "Default training file name (relative to `user-emacs-directory`).")

(defvar cj/lipsum-default-file
  (expand-file-name cj/lipsum-training-file user-emacs-directory)
  "Default training file for cj-lipsum.

This should be a plain UTF-8 text file with hundreds of Latin words
or sentences.  By default it points to the file specified in
`cj/lipsum-training-file` relative to `user-emacs-directory`.")

(cl-defstruct (cj/markov-chain
			   (:constructor cj/markov-chain-create))
  "An order-two Markov chain."
  (map (make-hash-table :test 'equal))
  (keys nil))

(defun cj/markov-tokenize (text)
  "Split TEXT into tokens: words and punctuation separately.
Returns a list of words and punctuation marks as separate tokens."
  (let ((tokens '())
        (pos 0)
        (len (length text)))
    (while (< pos len)
      (cond
       ;; Skip whitespace
       ((string-match-p "[[:space:]]" (substring text pos (1+ pos)))
        (setq pos (1+ pos)))
       ;; Match word (sequence of alphanumeric characters)
       ((string-match "\\`\\([[:alnum:]]+\\)" (substring text pos))
        (let ((word (match-string 1 (substring text pos))))
          (push word tokens)
          (setq pos (+ pos (length word)))))
       ;; Match punctuation (single character)
       ((string-match "\\`\\([[:punct:]]\\)" (substring text pos))
        (let ((punct (match-string 1 (substring text pos))))
          (push punct tokens)
          (setq pos (+ pos (length punct)))))
       ;; Skip any other character
       (t (setq pos (1+ pos)))))
    (nreverse tokens)))
(defun cj/markov-learn (chain text)
  "Add TEXT into the Markov CHAIN with tokenized input."
  (let* ((word-list (cj/markov-tokenize text))
         ;; Convert to vector for O(1) access instead of O(n) with nth
         (words (vconcat word-list))
		 (len (length words)))
	(cl-loop for i from 0 to (- len 3)
			 for a = (aref words i)
			 for b = (aref words (1+ i))
			 for c = (aref words (+ i 2))
			 do (let* ((bigram (list a b))
					   (nexts (gethash bigram (cj/markov-chain-map chain))))
				  (puthash bigram (cons c nexts)
						   (cj/markov-chain-map chain)))))
  ;; Invalidate cached keys after learning new data
  (setf (cj/markov-chain-keys chain) nil))

(defun cj/markov-fix-capitalization (sentence)
  "Capitalize the first word and the first word after .!? in SENTENCE."
  (let* ((tokens (split-string sentence "\\b" t)))
	(cl-loop with capitalize-next = t
			 for i from 0 below (length tokens)
			 for tok = (nth i tokens)
			 do (when (and capitalize-next (string-match-p "^[[:alpha:]]" tok))
				  (setf (nth i tokens)
						(concat (upcase (substring tok 0 1))
								(substring tok 1)))
				  (setq capitalize-next nil))
			 do (when (string-match-p "[.!?]" tok)  ; <-- Changed: removed $ anchor
				  (setq capitalize-next t)))
	(mapconcat #'identity tokens "")))

(defun cj/markov-join-tokens (tokens)
  "Join TOKENS into a sentence with proper spacing/punctuation."
  (let ((sentence "") (need-space nil))
	(dolist (tok tokens)
	  (cond
	   ;; punctuation attaches directly
	   ((string-match-p "^[[:punct:]]+$" tok)
		(setq sentence (concat sentence tok))
		(setq need-space t))
	   ;; word
	   (t
		(when need-space
		  (setq sentence (concat sentence " ")))
		(setq sentence (concat sentence tok))
		(setq need-space t))))
	;; fix capitalization of first word only
	(when (string-match "\\`\\([[:alpha:]]\\)" sentence)
	  (setq sentence
			(replace-match (upcase (match-string 1 sentence))
						   nil nil sentence)))
	;; ensure it ends with .!?
	(unless (string-match-p "[.!?]$" sentence)
	  (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) ".")))
  (setq sentence (cj/markov-fix-capitalization sentence))
		sentence))

(defun cj/markov-generate (chain n &optional start)
  "Generate a sentence of N tokens from CHAIN."
  (when (> (hash-table-count (cj/markov-chain-map chain)) 0)
	(let* ((state (or (and start
						   (gethash start (cj/markov-chain-map chain))
						   start)
					  (cj/markov-random-key chain)))
		   (w1 (car state))
		   (w2 (cadr state))
		   (tokens (list w1 w2)))
	  (dotimes (_ (- n 2))
		(let ((next (cj/markov-next-word chain state)))
		  (if next
			  (setq tokens (append tokens (list next))
					state (list w2 next)
					w1 w2
					w2 next)
			(setq state (cj/markov-random-key chain)
				  w1 (car state)
				  w2 (cadr state)
				  tokens (append tokens (list w1 w2))))))
	  (cj/markov-join-tokens tokens))))

(defun cj/markov-random-key (chain)
  "Return a random bigram key from CHAIN.
Builds and caches the keys list lazily if not already cached."
  (unless (cj/markov-chain-keys chain)
    ;; Lazily build keys list only when needed
    (setf (cj/markov-chain-keys chain)
          (cl-loop for k being the hash-keys of (cj/markov-chain-map chain)
                   collect k)))
  (let ((keys (cj/markov-chain-keys chain)))
    (when keys
      (nth (random (length keys)) keys))))

(defun cj/markov-next-word (chain bigram)
  (let ((candidates (gethash bigram (cj/markov-chain-map chain))))
	(when candidates
	  (nth (random (length candidates)) candidates))))

(defvar cj/lipsum-chain (cj/markov-chain-create)
  "Global Markov chain for lipsum generation.")

(defun cj/lipsum-reset ()
  "Reset the global lipsum Markov chain."
  (interactive)
  (setq cj/lipsum-chain (cj/markov-chain-create))
  (message "cj/lipsum-chain reset."))

(defun cj/lipsum-learn-region (beg end)
  "Learn text from region."
  (interactive "r")
  (cj/markov-learn cj/lipsum-chain (buffer-substring-no-properties beg end))
  (message "Learned from region."))

(defun cj/lipsum-learn-buffer ()
  "Learn from entire buffer."
  (interactive)
  (cj/markov-learn cj/lipsum-chain
				   (buffer-substring-no-properties (point-min) (point-max)))
  (message "Learned from buffer."))

(defun cj/lipsum-learn-file (file)
  "Learn from FILE containing plain text."
  (interactive "fTrain from file: ")
  (with-temp-buffer
	(insert-file-contents file)
	(cj/markov-learn cj/lipsum-chain (buffer-string)))
  (message "Learned from file: %s" file))

(defun cj/lipsum (n)
  "Return N words of lorem ipsum."
  (cj/markov-generate cj/lipsum-chain n '("Lorem" "ipsum")))

(defun cj/lipsum-insert (n)
  "Insert N words of lorem ipsum at point."
  (interactive "nNumber of words: ")
  (insert (cj/lipsum n)))

;;; Title generation

(defconst cj/lipsum-title-min 3)
(defconst cj/lipsum-title-max 8)
(defconst cj/lipsum-title-small 3)

(defun cj/lipsum-title ()
  "Generate a pseudo-Latin title."
  (interactive)
  (let* ((n (+ cj/lipsum-title-min
			   (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min)))))
		 (words
		  (cl-loop with state = (cj/markov-random-key cj/lipsum-chain)
				   for i from 0 below n
				   for w = (car state)
				   do (setq state (list (cadr state)
										(or (cj/markov-next-word cj/lipsum-chain state)
											(cadr (cj/markov-random-key cj/lipsum-chain))))))
				   collect (replace-regexp-in-string "^[[:punct:]]+\\|[[:punct:]]+$" "" w))))
	;; Filter empty strings from generated words
	(setq words (cl-remove-if #'string-empty-p words))
	(mapconcat
	 (lambda (word idx)
	   (if (or (zerop idx) (> (length word) cj/lipsum-title-small))
		   (capitalize word)
		 word))
	 words " "))

;;; Paragraphs

(defun cj/lipsum-paragraphs (count &optional min max)
  "Insert COUNT paragraphs of lipsum.

Each paragraph has a random length between MIN and MAX words.
Defaults: MIN=30, MAX=80."
  (interactive "nNumber of paragraphs: ")
  (let ((min (or min 30))
        (max (or max 80)))
    (dotimes (_ count)
      (let ((len (+ min (random (1+ (- max min))))))
		(insert (cj/lipsum len) "\n\n")))))

;;; Initialization: train on default file
(defun cj/lipsum--init ()
  "Initialize cj-lipsum by learning from `cj/lipsum-default-file`."
  (when (and cj/lipsum-default-file
			 (file-readable-p cj/lipsum-default-file))
	(cj/lipsum-reset)
	(cj/lipsum-learn-file cj/lipsum-default-file)))

(cj/lipsum--init)

(provide 'lorem-optimum)
;;; lorem-optimum.el ends here.