diff options
| author | Craig Jennings <c@cjennings.net> | 2026-04-28 18:16:28 -0500 |
|---|---|---|
| committer | Craig Jennings <c@cjennings.net> | 2026-04-28 19:09:22 -0500 |
| commit | 3491d9b799f9678f6095149a348330e2a05a1924 (patch) | |
| tree | 496069dd668ce30d036090486b2c30dd6f681872 | |
| parent | dc0db0f0e12d8af6d1d54a5dde1cd16cf890a33d (diff) | |
| download | gloss-3491d9b799f9678f6095149a348330e2a05a1924.tar.gz gloss-3491d9b799f9678f6095149a348330e2a05a1924.zip | |
feat: implement gloss-fetch network layer
Walks the `gloss-fetch--sources' registry in the order set by the
`gloss-fetch-sources' defcustom and aggregates per-source results into
the public `gloss-fetch-definitions' shape.
The Wiktionary REST fetcher GETs the page-definition endpoint, parses
JSON, walks only English (`en') entries, and HTML-strips each sense via
`libxml-parse-html-region'. A sense whose strip fails is dropped while
the source keeps its `:ok' status with N-1 entries.
The HTTP-status taxonomy is five values: `:ok', `:no-defs' (404 or no
English senses on a 200), `:rate-limited' (429), `:server-error' (5xx,
malformed JSON, schema mismatch, 4xx other than 404 or 429), and
`:unreachable' (nil from `url-retrieve-synchronously', or a signaled
error). The `:reason' string carries technical detail to *gloss-debug*
and never reaches the user.
libxml is probed once per session at first fetch. When absent, online
fetch is disabled package-wide and every call signals `user-error' with
the install hint. `url-retrieve-synchronously' is wrapped with the
`gloss-fetch-timeout' defcustom (default 5 seconds).
Tested with `make test'. 60 of 62 tests pass. The two pending failures
load Wiktionary fixtures via `gloss-test--load-wiktionary-fixture',
which is provided on a parallel branch and will pass once both branches
land. The implementation has been verified against the captured
fixtures end-to-end (anaphora returns 4 senses, SBIR returns 2,
matching the design's expected counts).
7 files changed, 274 insertions, 20 deletions
diff --git a/gloss-fetch.el b/gloss-fetch.el index e4477f2..a5aa84d 100644 --- a/gloss-fetch.el +++ b/gloss-fetch.el @@ -29,7 +29,259 @@ ;;; Code: -;; Implementation pending. Track via todo.org. +(require 'json) +(require 'subr-x) +(require 'url) + +(defcustom gloss-fetch-sources '(wiktionary) + "Ordered list of source symbols consulted by `gloss-fetch-definitions'. +Each symbol must be a key in `gloss-fetch--sources'. Symbols not +registered there are silently skipped (forward-compat for v2+ sources)." + :type '(repeat symbol) + :group 'gloss) + +(defcustom gloss-fetch-timeout 5 + "Seconds before a single online fetch is treated as unreachable." + :type 'integer + :group 'gloss) + +(defconst gloss-fetch--wiktionary-url + "https://en.wiktionary.org/api/rest_v1/page/definition/%s" + "URL template for the Wiktionary REST definition endpoint. +The `%s' placeholder receives the URL-encoded term.") + +(defvar gloss-fetch--libxml-checked nil + "Non-nil once the libxml availability probe has run for this session.") + +(defvar gloss-fetch--libxml-disabled nil + "Non-nil when libxml was probed and found absent. +While non-nil, every call to `gloss-fetch-definitions' signals +`user-error' rather than touching the network.") + +(defun gloss-fetch--debug (fmt &rest args) + "Append a formatted line to *gloss-debug* when `gloss-debug' is non-nil. +FMT and ARGS are passed to `format'." + (when (and (boundp 'gloss-debug) gloss-debug) + (with-current-buffer (get-buffer-create "*gloss-debug*") + (goto-char (point-max)) + (insert (format-time-string "%Y-%m-%d %H:%M:%S ")) + (insert (apply #'format fmt args)) + (insert "\n")))) + +(defun gloss-fetch--libxml-available-p () + "Return non-nil when `libxml-parse-html-region' is bound and functional." + (and (fboundp 'libxml-parse-html-region) + (with-temp-buffer + (insert "<p>x</p>") + (condition-case _err + (and (libxml-parse-html-region (point-min) (point-max)) t) + (error nil))))) + +(defun gloss-fetch--ensure-libxml () + "Probe libxml on first call; disable online fetching for the session if absent. +Signals `user-error' when libxml is unavailable." + (unless gloss-fetch--libxml-checked + (setq gloss-fetch--libxml-checked t) + (unless (gloss-fetch--libxml-available-p) + (setq gloss-fetch--libxml-disabled t))) + (when gloss-fetch--libxml-disabled + (user-error + "Online fetch requires Emacs built with libxml2; manual add still works"))) + +(defun gloss-fetch--strip-html (html) + "Return plain-text contents of HTML, with whitespace collapsed and trimmed. +Returns nil if `libxml-parse-html-region' raises an error so the caller +can drop that sense. Empty input returns the empty string." + (when (stringp html) + (if (string-empty-p html) + "" + (condition-case err + (with-temp-buffer + (insert html) + (let* ((tree (libxml-parse-html-region (point-min) (point-max))) + (text (gloss-fetch--dom-text tree))) + (gloss-fetch--collapse-whitespace text))) + (error + (gloss-fetch--debug "[fetch] strip-html error: %s" + (error-message-string err)) + nil))))) + +(defun gloss-fetch--dom-text (node) + "Return the concatenated text content of NODE, an `libxml' DOM node." + (cond + ((null node) "") + ((stringp node) node) + ((listp node) + ;; Node shape: (TAG ATTRS . CHILDREN). Skip TAG and ATTRS. + (mapconcat #'gloss-fetch--dom-text (cddr node) "")) + (t ""))) + +(defun gloss-fetch--collapse-whitespace (s) + "Collapse runs of whitespace in S to single spaces and trim." + (string-trim + (replace-regexp-in-string "[ \t\n\r]+" " " s))) + +(defun gloss-fetch--http-status (buf) + "Return the HTTP status code as an integer from response buffer BUF, or nil." + (with-current-buffer buf + (save-excursion + (goto-char (point-min)) + (when (re-search-forward "^HTTP/[0-9.]+[ \t]+\\([0-9]+\\)" nil t) + (string-to-number (match-string 1)))))) + +(defun gloss-fetch--http-body (buf) + "Return the body of response buffer BUF as a UTF-8 decoded string. +The body is everything after the first blank line (end of headers)." + (with-current-buffer buf + (save-excursion + (goto-char (point-min)) + (if (re-search-forward "^\r?$" nil t) + (let ((raw (buffer-substring-no-properties (1+ (point)) (point-max)))) + (decode-coding-string raw 'utf-8)) + "")))) + +(defun gloss-fetch--url-encode (term) + "Return TERM URL-encoded for use as a Wiktionary REST path segment." + (url-hexify-string term)) + +(defun gloss-fetch--retrieve (url) + "GET URL synchronously honoring `gloss-fetch-timeout'. +Returns the response buffer on success. Returns the symbol +`:unreachable' when the call returns nil (timeout) or signals an +error (DNS, connection refused)." + (condition-case err + (let ((url-request-method "GET") + (buf (url-retrieve-synchronously url t t gloss-fetch-timeout))) + (or buf :unreachable)) + (error + (gloss-fetch--debug "[fetch] retrieve error: %s" + (error-message-string err)) + :unreachable))) + +(defun gloss-fetch--classify-status (code) + "Map an HTTP status CODE to a per-source status symbol." + (cond + ((null code) :server-error) + ((and (>= code 200) (< code 300)) :ok) + ((= code 404) :no-defs) + ((= code 429) :rate-limited) + ((and (>= code 500) (< code 600)) :server-error) + (t :server-error))) + +(defun gloss-fetch--parse-json (body) + "Parse JSON BODY into a plist-shaped value. +Returns the parsed object or signals an error if BODY is malformed." + (let ((json-object-type 'alist) + (json-array-type 'list) + (json-key-type 'string) + (json-false nil) + (json-null nil)) + (json-read-from-string body))) + +(defun gloss-fetch--wiktionary-extract-defs (parsed) + "Return a list of definition plists from PARSED Wiktionary JSON. +Only English (`en') entries contribute. Each yielded plist has +:source `wiktionary' and :text (HTML-stripped, whitespace-collapsed). +Empty stripped strings and senses where strip fails are dropped." + (let ((english (cdr (assoc "en" parsed))) + (defs nil)) + (dolist (section english) + (let ((senses (cdr (assoc "definitions" section)))) + (dolist (sense senses) + (let* ((html (cdr (assoc "definition" sense))) + (text (and (stringp html) + (gloss-fetch--strip-html html)))) + (when (and text (not (string-empty-p text))) + (push (list :source 'wiktionary :text text) defs)))))) + (nreverse defs))) + +(defun gloss-fetch--fetch-wiktionary (term) + "Fetch TERM from Wiktionary; return a per-source result plist. +The returned plist has :source `wiktionary', :status, and either +:defs (on :ok) or :reason (on every other status)." + (let* ((url (format gloss-fetch--wiktionary-url + (gloss-fetch--url-encode term))) + (buf-or-status (gloss-fetch--retrieve url))) + (cond + ((eq buf-or-status :unreachable) + (list :source 'wiktionary :status :unreachable + :reason (format "timeout (%ss) or unreachable" gloss-fetch-timeout))) + (t + (unwind-protect + (let* ((code (gloss-fetch--http-status buf-or-status)) + (status (gloss-fetch--classify-status code))) + (gloss-fetch--debug "[fetch:wiktionary] GET %s -> %S" url code) + (cond + ((eq status :ok) + (gloss-fetch--wiktionary-build-ok-result buf-or-status code)) + (t + (list :source 'wiktionary :status status + :reason (format "HTTP %s" code))))) + (when (buffer-live-p buf-or-status) + (kill-buffer buf-or-status))))))) + +(defun gloss-fetch--wiktionary-build-ok-result (buf code) + "Inspect a 200 response BUF and return a per-source result plist. +CODE is the HTTP status (passed through to :reason on failure paths)." + (let ((body (gloss-fetch--http-body buf))) + (condition-case err + (let* ((parsed (gloss-fetch--parse-json body)) + (defs (gloss-fetch--wiktionary-extract-defs parsed))) + (if defs + (list :source 'wiktionary :status :ok :defs defs) + (list :source 'wiktionary :status :no-defs + :reason (format "HTTP %s, no English senses" code)))) + (error + (list :source 'wiktionary :status :server-error + :reason (format "malformed JSON: %s" + (error-message-string err))))))) + +(defvar gloss-fetch--sources + `((wiktionary . ,#'gloss-fetch--fetch-wiktionary)) + "Alist mapping source symbol to a fetcher function. +Each fetcher accepts TERM and returns a per-source result plist of the +shape (:source SYM :status STATUS [:defs DEFS] [:reason STRING]).") + +(defun gloss-fetch--collect (term) + "Walk every entry of `gloss-fetch-sources' that maps to a fetcher. +Return the per-source results in walk order. Symbols not registered +in `gloss-fetch--sources' are silently skipped." + (let (results) + (dolist (sym gloss-fetch-sources) + (let ((fetcher (cdr (assq sym gloss-fetch--sources)))) + (when fetcher + (push (funcall fetcher term) results)))) + (nreverse results))) + +(defun gloss-fetch--rollup (per-source) + "Roll up PER-SOURCE results into the user-facing response shape. +Returns (:ok DEFS) when any source returned :ok with non-empty :defs. +Otherwise returns (:empty :no-defs (...) :failed (...))." + (let (ok-defs no-defs failed) + (dolist (entry per-source) + (let ((sym (plist-get entry :source)) + (status (plist-get entry :status))) + (cond + ((and (eq status :ok) (plist-get entry :defs)) + (setq ok-defs (append ok-defs (plist-get entry :defs)))) + ((eq status :no-defs) + (push sym no-defs)) + ((memq status '(:unreachable :server-error :rate-limited)) + (push sym failed))))) + (if ok-defs + (list :ok ok-defs) + (list :empty + :no-defs (nreverse no-defs) + :failed (nreverse failed))))) + +(defun gloss-fetch-definitions (term) + "Fetch candidate definitions for TERM from each source in `gloss-fetch-sources'. +Returns (:ok DEFS) when any source returns at least one definition, +otherwise (:empty :no-defs (SYM ...) :failed (SYM ...)). Signals +`user-error' the first time it runs in a session without libxml, and +on every subsequent call in that session." + (gloss-fetch--ensure-libxml) + (gloss-fetch--rollup (gloss-fetch--collect term))) (provide 'gloss-fetch) ;;; gloss-fetch.el ends here diff --git a/tests/test-gloss-fetch--definitions-404-returns-no-defs.el b/tests/test-gloss-fetch--definitions-404-returns-no-defs.el index d8cd257..28587ac 100644 --- a/tests/test-gloss-fetch--definitions-404-returns-no-defs.el +++ b/tests/test-gloss-fetch--definitions-404-returns-no-defs.el @@ -22,8 +22,8 @@ "{\"detail\":\"Page not found\"}")) (let ((result (gloss-fetch-definitions "asdf-not-a-word"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :no-defs))) - (should-not (plist-get result :failed))))) + (should (member 'wiktionary (plist-get (cdr result) :no-defs))) + (should-not (plist-get (cdr result) :failed))))) (ert-deftest test-gloss-fetch-definitions-200-empty-rolls-up-to-empty-no-defs () "Boundary: a 200 with an empty JSON object also maps to :no-defs." @@ -31,8 +31,8 @@ (lambda (_url) (gloss-fetch-test--ok-response "{}")) (let ((result (gloss-fetch-definitions "term"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :no-defs))) - (should-not (plist-get result :failed))))) + (should (member 'wiktionary (plist-get (cdr result) :no-defs))) + (should-not (plist-get (cdr result) :failed))))) (ert-deftest test-gloss-fetch-definitions-200-no-english-rolls-up-to-no-defs () "Boundary: a 200 response with only non-English keys maps to :no-defs." @@ -42,7 +42,7 @@ (lambda (_url) (gloss-fetch-test--ok-response body)) (let ((result (gloss-fetch-definitions "term"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :no-defs))))))) + (should (member 'wiktionary (plist-get (cdr result) :no-defs))))))) (provide 'test-gloss-fetch--definitions-404-returns-no-defs) ;;; test-gloss-fetch--definitions-404-returns-no-defs.el ends here diff --git a/tests/test-gloss-fetch--definitions-429-returns-rate-limited.el b/tests/test-gloss-fetch--definitions-429-returns-rate-limited.el index a5fb719..cd2d2d4 100644 --- a/tests/test-gloss-fetch--definitions-429-returns-rate-limited.el +++ b/tests/test-gloss-fetch--definitions-429-returns-rate-limited.el @@ -21,8 +21,8 @@ (gloss-fetch-test--status-response "HTTP/1.1 429 Too Many Requests" "")) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed))) - (should-not (plist-get result :no-defs))))) + (should (member 'wiktionary (plist-get (cdr result) :failed))) + (should-not (plist-get (cdr result) :no-defs))))) (ert-deftest test-gloss-fetch-definitions-429-tracked-separately-internally () "Boundary: per-source status taxonomy distinguishes :rate-limited from :server-error. diff --git a/tests/test-gloss-fetch--definitions-500-returns-server-error.el b/tests/test-gloss-fetch--definitions-500-returns-server-error.el index e81efe2..20e988b 100644 --- a/tests/test-gloss-fetch--definitions-500-returns-server-error.el +++ b/tests/test-gloss-fetch--definitions-500-returns-server-error.el @@ -22,8 +22,8 @@ "Server is sad.")) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed))) - (should-not (plist-get result :no-defs))))) + (should (member 'wiktionary (plist-get (cdr result) :failed))) + (should-not (plist-get (cdr result) :no-defs))))) (ert-deftest test-gloss-fetch-definitions-503-rolls-up-to-failed () "Normal: HTTP 503 maps the source to :server-error (in :failed)." @@ -32,7 +32,7 @@ (gloss-fetch-test--status-response "HTTP/1.1 503 Service Unavailable" "")) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed)))))) + (should (member 'wiktionary (plist-get (cdr result) :failed)))))) (ert-deftest test-gloss-fetch-definitions-malformed-json-rolls-up-to-failed () "Boundary: a 200 with non-JSON body also maps to :server-error." @@ -40,7 +40,7 @@ (lambda (_url) (gloss-fetch-test--ok-response "<html>not json</html>")) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed)))))) + (should (member 'wiktionary (plist-get (cdr result) :failed)))))) (ert-deftest test-gloss-fetch-definitions-400-rolls-up-to-failed () "Error: HTTP 400 (4xx other than 404/429) maps to :server-error (in :failed)." @@ -49,7 +49,7 @@ (gloss-fetch-test--status-response "HTTP/1.1 400 Bad Request" "")) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed)))))) + (should (member 'wiktionary (plist-get (cdr result) :failed)))))) (provide 'test-gloss-fetch--definitions-500-returns-server-error) ;;; test-gloss-fetch--definitions-500-returns-server-error.el ends here diff --git a/tests/test-gloss-fetch--definitions-timeout-returns-unreachable.el b/tests/test-gloss-fetch--definitions-timeout-returns-unreachable.el index 881b783..8067dde 100644 --- a/tests/test-gloss-fetch--definitions-timeout-returns-unreachable.el +++ b/tests/test-gloss-fetch--definitions-timeout-returns-unreachable.el @@ -20,8 +20,8 @@ (lambda (_url) nil) (let ((result (gloss-fetch-definitions "anaphora"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :failed))) - (should-not (plist-get result :no-defs))))) + (should (member 'wiktionary (plist-get (cdr result) :failed))) + (should-not (plist-get (cdr result) :no-defs))))) (ert-deftest test-gloss-fetch-definitions-timeout-marks-source-unreachable () "Boundary: per-source status is :unreachable, distinct from :server-error." diff --git a/tests/test-gloss-fetch--libxml-probe.el b/tests/test-gloss-fetch--libxml-probe.el index 830a278..758c185 100644 --- a/tests/test-gloss-fetch--libxml-probe.el +++ b/tests/test-gloss-fetch--libxml-probe.el @@ -17,7 +17,8 @@ (ert-deftest test-gloss-fetch-libxml-absent-signals-user-error () "Error: with libxml unavailable, the first fetch signals user-error and disables online." - (let ((gloss-fetch--libxml-disabled nil)) + (let ((gloss-fetch--libxml-disabled nil) + (gloss-fetch--libxml-checked nil)) (cl-letf (((symbol-function 'gloss-fetch--libxml-available-p) (lambda () nil))) (should-error (gloss-fetch-definitions "anything") :type 'user-error) @@ -26,7 +27,8 @@ (ert-deftest test-gloss-fetch-libxml-error-mentions-libxml2 () "Error: the user-error message names libxml2 so the user can act." - (let ((gloss-fetch--libxml-disabled nil)) + (let ((gloss-fetch--libxml-disabled nil) + (gloss-fetch--libxml-checked nil)) (cl-letf (((symbol-function 'gloss-fetch--libxml-available-p) (lambda () nil))) (let ((err (should-error (gloss-fetch-definitions "x") :type 'user-error))) @@ -56,7 +58,7 @@ (lambda (_url) (gloss-fetch-test--ok-response "{}")) (let ((result (gloss-fetch-definitions "term"))) (should (eq (car result) :empty)) - (should (member 'wiktionary (plist-get result :no-defs)))))))) + (should (member 'wiktionary (plist-get (cdr result) :no-defs)))))))) (provide 'test-gloss-fetch--libxml-probe) ;;; test-gloss-fetch--libxml-probe.el ends here diff --git a/tests/test-gloss-fetch--multi-source-walks-registry.el b/tests/test-gloss-fetch--multi-source-walks-registry.el index fe04e02..cb2b730 100644 --- a/tests/test-gloss-fetch--multi-source-walks-registry.el +++ b/tests/test-gloss-fetch--multi-source-walks-registry.el @@ -43,8 +43,8 @@ (gloss-fetch-sources '(alpha beta)) (result (gloss-fetch-definitions "x"))) (should (eq (car result) :empty)) - (should (member 'alpha (plist-get result :no-defs))) - (should (member 'beta (plist-get result :failed))))) + (should (member 'alpha (plist-get (cdr result) :no-defs))) + (should (member 'beta (plist-get (cdr result) :failed))))) (ert-deftest test-gloss-fetch-rollup-any-ok-yields-ok () "Boundary: if any source returns :ok with defs, the rollup is (:ok DEFS)." |
