;;; lorem-optimum.el --- Fake Latin Text Generator -*- coding: utf-8; lexical-binding: t; -*- ;; ;; Author: Craig Jennings ;; Version: 0.5 ;; Package-Requires: ((emacs "26.1") (cl-lib "0.6")) ;; Keywords: text, lorem-ipsum, dummy, filler, markov ;; URL: https://github.com/yourname/cj-lipsum ;;; Commentary: ;; ;; Layer: 4 (Optional). ;; Category: O/L. ;; Load shape: eager. ;; Eager reason: none; placeholder-text generator, a command-loaded deferral ;; candidate. ;; Top-level side effects: none. ;; Runtime requires: cl-lib. ;; Direct test load: yes. ;; ;; Generate pseudo-Latin placeholder text using a simple order-two ;; Markov chain. You can train the chain on region, buffer, or file. ;; By default, it learns from a bundled Latin wordlist, which you can ;; change via customization. ;; ;; Interactive commands: ;; M-x cj/lipsum – Return N words ;; M-x cj/lipsum-insert – Insert N words at point ;; M-x cj/lipsum-title – Generate a pseudo-Latin heading ;; M-x cj/lipsum-paragraphs – Insert multiple paragraphs ;; M-x cj/lipsum-learn-* – Train the chain ;; M-x cj/lipsum-reset – Clear chain ;;; Code: (require 'cl-lib) ;;; Configuration (defvar cj/lipsum-training-file "assets/liber-primus.txt" "Default training file name (relative to `user-emacs-directory`).") (defvar cj/lipsum-default-file (expand-file-name cj/lipsum-training-file user-emacs-directory) "Default training file for cj-lipsum. This should be a plain UTF-8 text file with hundreds of Latin words or sentences. By default it points to the file specified in `cj/lipsum-training-file` relative to `user-emacs-directory`.") (cl-defstruct (cj/markov-chain (:constructor cj/markov-chain-create)) "An order-two Markov chain." (map (make-hash-table :test 'equal)) (keys nil)) (defun cj/markov-tokenize (text) "Split TEXT into tokens: words and punctuation separately. Returns a list of words and punctuation marks as separate tokens. Uses O(n) algorithm by matching at position instead of creating substrings." (let ((tokens '()) (pos 0) (len (length text))) (while (< pos len) (let ((char (aref text pos))) (cond ;; Skip whitespace (check char directly, no substring) ((memq char '(?\s ?\t ?\n ?\r ?\f)) (setq pos (1+ pos))) ;; Match word at position (no substring needed) ((and (or (<= ?a char ?z) (<= ?A char ?Z) (<= ?0 char ?9)) (string-match "\\([[:alnum:]]+\\)" text pos) (= (match-beginning 0) pos)) (push (match-string 1 text) tokens) (setq pos (match-end 0))) ;; Match punctuation at position ((and (string-match "\\([[:punct:]]\\)" text pos) (= (match-beginning 0) pos)) (push (match-string 1 text) tokens) (setq pos (match-end 0))) ;; Skip any other character (t (setq pos (1+ pos)))))) (nreverse tokens))) (defun cj/markov-learn (chain text) "Add TEXT into the Markov CHAIN with tokenized input." (let* ((word-list (cj/markov-tokenize text)) ;; Convert to vector for O(1) access instead of O(n) with nth (words (vconcat word-list)) (len (length words))) (cl-loop for i from 0 to (- len 3) for a = (aref words i) for b = (aref words (1+ i)) for c = (aref words (+ i 2)) do (let* ((bigram (list a b)) (nexts (gethash bigram (cj/markov-chain-map chain)))) (puthash bigram (cons c nexts) (cj/markov-chain-map chain))))) ;; Invalidate cached keys after learning new data (setf (cj/markov-chain-keys chain) nil)) (defun cj/markov-fix-capitalization (sentence) "Capitalize the first word and the first word after .!? in SENTENCE." (let* ((tokens (split-string sentence "\\b" t))) (cl-loop with capitalize-next = t for i from 0 below (length tokens) for tok = (nth i tokens) do (when (and capitalize-next (string-match-p "^[[:alpha:]]" tok)) (setf (nth i tokens) (concat (upcase (substring tok 0 1)) (substring tok 1))) (setq capitalize-next nil)) do (when (string-match-p "[.!?]" tok) ; <-- Changed: removed $ anchor (setq capitalize-next t))) (mapconcat #'identity tokens ""))) (defun cj/markov-join-tokens (tokens) "Join TOKENS into a sentence with proper spacing/punctuation." (let ((parts '()) (need-space nil)) (dolist (tok tokens) (cond ;; punctuation attaches directly ((string-match-p "^[[:punct:]]+$" tok) (push tok parts) (setq need-space t)) ;; word (t (when need-space (push " " parts)) (push tok parts) (setq need-space t)))) (let ((sentence (mapconcat #'identity (nreverse parts) ""))) ;; fix capitalization of first word only (when (string-match "\\`\\([[:alpha:]]\\)" sentence) (setq sentence (replace-match (upcase (match-string 1 sentence)) nil nil sentence))) ;; ensure it ends with .!? (unless (string-match-p "[.!?]$" sentence) (setq sentence (concat (replace-regexp-in-string "[[:punct:]]+$" "" sentence) "."))) (setq sentence (cj/markov-fix-capitalization sentence)) sentence))) (defun cj/markov-generate (chain n &optional start) "Generate a sentence of N tokens from CHAIN." (when (> (hash-table-count (cj/markov-chain-map chain)) 0) (let* ((state (or (and start (gethash start (cj/markov-chain-map chain)) start) (cj/markov-random-key chain))) (w1 (car state)) (w2 (cadr state)) ;; tokens is built with `push' (prepend) and `nreverse'd at the end, ;; so seed it reversed -- w2 then w1 -- to get w1 w2 ... after the flip. (tokens (list w2 w1))) (dotimes (_ (max 0 (- n 2))) (let ((next (cj/markov-next-word chain state))) (if next (progn (push next tokens) (setq state (list w2 next) w1 w2 w2 next)) (setq state (cj/markov-random-key chain) w1 (car state) w2 (cadr state)) (push w1 tokens) (push w2 tokens)))) (cj/markov-join-tokens (nreverse tokens))))) (defun cj/markov-random-key (chain) "Return a random bigram key from CHAIN. Builds and caches the keys list lazily if not already cached." (unless (cj/markov-chain-keys chain) ;; Lazily build keys list only when needed (setf (cj/markov-chain-keys chain) (vconcat (cl-loop for k being the hash-keys of (cj/markov-chain-map chain) collect k)))) (let ((keys (cj/markov-chain-keys chain))) (when (> (length keys) 0) (aref keys (random (length keys)))))) (defun cj/markov-next-word (chain bigram) (let ((candidates (gethash bigram (cj/markov-chain-map chain)))) (when candidates (nth (random (length candidates)) candidates)))) (defvar cj/lipsum-chain (cj/markov-chain-create) "Global Markov chain for lipsum generation.") (defun cj/lipsum-reset () "Reset the global lipsum Markov chain." (interactive) (setq cj/lipsum-chain (cj/markov-chain-create)) (message "cj/lipsum-chain reset.")) (defun cj/lipsum-learn-region (beg end) "Learn text from region." (interactive "r") (cj/markov-learn cj/lipsum-chain (buffer-substring-no-properties beg end)) (message "Learned from region.")) (defun cj/lipsum-learn-buffer () "Learn from entire buffer." (interactive) (cj/markov-learn cj/lipsum-chain (buffer-substring-no-properties (point-min) (point-max))) (message "Learned from buffer.")) (defun cj/lipsum-learn-file (file) "Learn from FILE containing plain text." (interactive "fTrain from file: ") (with-temp-buffer (insert-file-contents file) (cj/markov-learn cj/lipsum-chain (buffer-string))) (message "Lorem-optimum learned from file: %s" file)) (defun cj/lipsum (n) "Return N words of lorem ipsum." (cj/markov-generate cj/lipsum-chain n '("Lorem" "ipsum"))) (defun cj/lipsum-insert (n) "Insert N words of lorem ipsum at point." (interactive "nNumber of words: ") (insert (cj/lipsum n))) ;;; Title generation (defconst cj/lipsum-title-min 3) (defconst cj/lipsum-title-max 8) (defconst cj/lipsum-title-small 3) (defun cj/lipsum-title () "Generate a pseudo-Latin title." (interactive) (let ((n (+ cj/lipsum-title-min (random (1+ (- cj/lipsum-title-max cj/lipsum-title-min))))) (state (cj/markov-random-key cj/lipsum-chain)) (words '())) (dotimes (_ n) (when state (let ((word (car state))) (when (stringp word) (setq word (replace-regexp-in-string "^[[:punct:]]+\\|[[:punct:]]+$" "" word)) (unless (string-empty-p word) (push word words)))) (setq state (when (cadr state) (list (cadr state) (or (cj/markov-next-word cj/lipsum-chain state) (cadr (cj/markov-random-key cj/lipsum-chain)))))))) (mapconcat #'identity (cl-loop for word in (nreverse words) for idx from 0 collect (if (or (zerop idx) (> (length word) cj/lipsum-title-small)) (capitalize word) word)) " "))) ;;; Paragraphs (defun cj/lipsum-paragraphs (count &optional min max) "Insert COUNT paragraphs of lipsum. Each paragraph has a random length between MIN and MAX words. Defaults: MIN=30, MAX=80." (interactive "nNumber of paragraphs: ") (let ((min (or min 30)) (max (or max 80))) (dotimes (_ count) (let ((len (+ min (random (1+ (- max min)))))) (insert (cj/lipsum len) "\n\n"))))) ;;; Initialization: train on default file (defun cj/lipsum--init () "Initialize cj-lipsum by learning from `cj/lipsum-default-file`." (when (and cj/lipsum-default-file (file-readable-p cj/lipsum-default-file)) (cj/lipsum-learn-file cj/lipsum-default-file))) (cj/lipsum--init) (provide 'lorem-optimum) ;;; lorem-optimum.el ends here.