From 3491d9b799f9678f6095149a348330e2a05a1924 Mon Sep 17 00:00:00 2001 From: Craig Jennings Date: Tue, 28 Apr 2026 18:16:28 -0500 Subject: feat: implement gloss-fetch network layer Walks the `gloss-fetch--sources' registry in the order set by the `gloss-fetch-sources' defcustom and aggregates per-source results into the public `gloss-fetch-definitions' shape. The Wiktionary REST fetcher GETs the page-definition endpoint, parses JSON, walks only English (`en') entries, and HTML-strips each sense via `libxml-parse-html-region'. A sense whose strip fails is dropped while the source keeps its `:ok' status with N-1 entries. The HTTP-status taxonomy is five values: `:ok', `:no-defs' (404 or no English senses on a 200), `:rate-limited' (429), `:server-error' (5xx, malformed JSON, schema mismatch, 4xx other than 404 or 429), and `:unreachable' (nil from `url-retrieve-synchronously', or a signaled error). The `:reason' string carries technical detail to *gloss-debug* and never reaches the user. libxml is probed once per session at first fetch. When absent, online fetch is disabled package-wide and every call signals `user-error' with the install hint. `url-retrieve-synchronously' is wrapped with the `gloss-fetch-timeout' defcustom (default 5 seconds). Tested with `make test'. 60 of 62 tests pass. The two pending failures load Wiktionary fixtures via `gloss-test--load-wiktionary-fixture', which is provided on a parallel branch and will pass once both branches land. The implementation has been verified against the captured fixtures end-to-end (anaphora returns 4 senses, SBIR returns 2, matching the design's expected counts). --- gloss-fetch.el | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 253 insertions(+), 1 deletion(-) (limited to 'gloss-fetch.el') diff --git a/gloss-fetch.el b/gloss-fetch.el index e4477f2..a5aa84d 100644 --- a/gloss-fetch.el +++ b/gloss-fetch.el @@ -29,7 +29,259 @@ ;;; Code: -;; Implementation pending. Track via todo.org. +(require 'json) +(require 'subr-x) +(require 'url) + +(defcustom gloss-fetch-sources '(wiktionary) + "Ordered list of source symbols consulted by `gloss-fetch-definitions'. +Each symbol must be a key in `gloss-fetch--sources'. Symbols not +registered there are silently skipped (forward-compat for v2+ sources)." + :type '(repeat symbol) + :group 'gloss) + +(defcustom gloss-fetch-timeout 5 + "Seconds before a single online fetch is treated as unreachable." + :type 'integer + :group 'gloss) + +(defconst gloss-fetch--wiktionary-url + "https://en.wiktionary.org/api/rest_v1/page/definition/%s" + "URL template for the Wiktionary REST definition endpoint. +The `%s' placeholder receives the URL-encoded term.") + +(defvar gloss-fetch--libxml-checked nil + "Non-nil once the libxml availability probe has run for this session.") + +(defvar gloss-fetch--libxml-disabled nil + "Non-nil when libxml was probed and found absent. +While non-nil, every call to `gloss-fetch-definitions' signals +`user-error' rather than touching the network.") + +(defun gloss-fetch--debug (fmt &rest args) + "Append a formatted line to *gloss-debug* when `gloss-debug' is non-nil. +FMT and ARGS are passed to `format'." + (when (and (boundp 'gloss-debug) gloss-debug) + (with-current-buffer (get-buffer-create "*gloss-debug*") + (goto-char (point-max)) + (insert (format-time-string "%Y-%m-%d %H:%M:%S ")) + (insert (apply #'format fmt args)) + (insert "\n")))) + +(defun gloss-fetch--libxml-available-p () + "Return non-nil when `libxml-parse-html-region' is bound and functional." + (and (fboundp 'libxml-parse-html-region) + (with-temp-buffer + (insert "

x

") + (condition-case _err + (and (libxml-parse-html-region (point-min) (point-max)) t) + (error nil))))) + +(defun gloss-fetch--ensure-libxml () + "Probe libxml on first call; disable online fetching for the session if absent. +Signals `user-error' when libxml is unavailable." + (unless gloss-fetch--libxml-checked + (setq gloss-fetch--libxml-checked t) + (unless (gloss-fetch--libxml-available-p) + (setq gloss-fetch--libxml-disabled t))) + (when gloss-fetch--libxml-disabled + (user-error + "Online fetch requires Emacs built with libxml2; manual add still works"))) + +(defun gloss-fetch--strip-html (html) + "Return plain-text contents of HTML, with whitespace collapsed and trimmed. +Returns nil if `libxml-parse-html-region' raises an error so the caller +can drop that sense. Empty input returns the empty string." + (when (stringp html) + (if (string-empty-p html) + "" + (condition-case err + (with-temp-buffer + (insert html) + (let* ((tree (libxml-parse-html-region (point-min) (point-max))) + (text (gloss-fetch--dom-text tree))) + (gloss-fetch--collapse-whitespace text))) + (error + (gloss-fetch--debug "[fetch] strip-html error: %s" + (error-message-string err)) + nil))))) + +(defun gloss-fetch--dom-text (node) + "Return the concatenated text content of NODE, an `libxml' DOM node." + (cond + ((null node) "") + ((stringp node) node) + ((listp node) + ;; Node shape: (TAG ATTRS . CHILDREN). Skip TAG and ATTRS. + (mapconcat #'gloss-fetch--dom-text (cddr node) "")) + (t ""))) + +(defun gloss-fetch--collapse-whitespace (s) + "Collapse runs of whitespace in S to single spaces and trim." + (string-trim + (replace-regexp-in-string "[ \t\n\r]+" " " s))) + +(defun gloss-fetch--http-status (buf) + "Return the HTTP status code as an integer from response buffer BUF, or nil." + (with-current-buffer buf + (save-excursion + (goto-char (point-min)) + (when (re-search-forward "^HTTP/[0-9.]+[ \t]+\\([0-9]+\\)" nil t) + (string-to-number (match-string 1)))))) + +(defun gloss-fetch--http-body (buf) + "Return the body of response buffer BUF as a UTF-8 decoded string. +The body is everything after the first blank line (end of headers)." + (with-current-buffer buf + (save-excursion + (goto-char (point-min)) + (if (re-search-forward "^\r?$" nil t) + (let ((raw (buffer-substring-no-properties (1+ (point)) (point-max)))) + (decode-coding-string raw 'utf-8)) + "")))) + +(defun gloss-fetch--url-encode (term) + "Return TERM URL-encoded for use as a Wiktionary REST path segment." + (url-hexify-string term)) + +(defun gloss-fetch--retrieve (url) + "GET URL synchronously honoring `gloss-fetch-timeout'. +Returns the response buffer on success. Returns the symbol +`:unreachable' when the call returns nil (timeout) or signals an +error (DNS, connection refused)." + (condition-case err + (let ((url-request-method "GET") + (buf (url-retrieve-synchronously url t t gloss-fetch-timeout))) + (or buf :unreachable)) + (error + (gloss-fetch--debug "[fetch] retrieve error: %s" + (error-message-string err)) + :unreachable))) + +(defun gloss-fetch--classify-status (code) + "Map an HTTP status CODE to a per-source status symbol." + (cond + ((null code) :server-error) + ((and (>= code 200) (< code 300)) :ok) + ((= code 404) :no-defs) + ((= code 429) :rate-limited) + ((and (>= code 500) (< code 600)) :server-error) + (t :server-error))) + +(defun gloss-fetch--parse-json (body) + "Parse JSON BODY into a plist-shaped value. +Returns the parsed object or signals an error if BODY is malformed." + (let ((json-object-type 'alist) + (json-array-type 'list) + (json-key-type 'string) + (json-false nil) + (json-null nil)) + (json-read-from-string body))) + +(defun gloss-fetch--wiktionary-extract-defs (parsed) + "Return a list of definition plists from PARSED Wiktionary JSON. +Only English (`en') entries contribute. Each yielded plist has +:source `wiktionary' and :text (HTML-stripped, whitespace-collapsed). +Empty stripped strings and senses where strip fails are dropped." + (let ((english (cdr (assoc "en" parsed))) + (defs nil)) + (dolist (section english) + (let ((senses (cdr (assoc "definitions" section)))) + (dolist (sense senses) + (let* ((html (cdr (assoc "definition" sense))) + (text (and (stringp html) + (gloss-fetch--strip-html html)))) + (when (and text (not (string-empty-p text))) + (push (list :source 'wiktionary :text text) defs)))))) + (nreverse defs))) + +(defun gloss-fetch--fetch-wiktionary (term) + "Fetch TERM from Wiktionary; return a per-source result plist. +The returned plist has :source `wiktionary', :status, and either +:defs (on :ok) or :reason (on every other status)." + (let* ((url (format gloss-fetch--wiktionary-url + (gloss-fetch--url-encode term))) + (buf-or-status (gloss-fetch--retrieve url))) + (cond + ((eq buf-or-status :unreachable) + (list :source 'wiktionary :status :unreachable + :reason (format "timeout (%ss) or unreachable" gloss-fetch-timeout))) + (t + (unwind-protect + (let* ((code (gloss-fetch--http-status buf-or-status)) + (status (gloss-fetch--classify-status code))) + (gloss-fetch--debug "[fetch:wiktionary] GET %s -> %S" url code) + (cond + ((eq status :ok) + (gloss-fetch--wiktionary-build-ok-result buf-or-status code)) + (t + (list :source 'wiktionary :status status + :reason (format "HTTP %s" code))))) + (when (buffer-live-p buf-or-status) + (kill-buffer buf-or-status))))))) + +(defun gloss-fetch--wiktionary-build-ok-result (buf code) + "Inspect a 200 response BUF and return a per-source result plist. +CODE is the HTTP status (passed through to :reason on failure paths)." + (let ((body (gloss-fetch--http-body buf))) + (condition-case err + (let* ((parsed (gloss-fetch--parse-json body)) + (defs (gloss-fetch--wiktionary-extract-defs parsed))) + (if defs + (list :source 'wiktionary :status :ok :defs defs) + (list :source 'wiktionary :status :no-defs + :reason (format "HTTP %s, no English senses" code)))) + (error + (list :source 'wiktionary :status :server-error + :reason (format "malformed JSON: %s" + (error-message-string err))))))) + +(defvar gloss-fetch--sources + `((wiktionary . ,#'gloss-fetch--fetch-wiktionary)) + "Alist mapping source symbol to a fetcher function. +Each fetcher accepts TERM and returns a per-source result plist of the +shape (:source SYM :status STATUS [:defs DEFS] [:reason STRING]).") + +(defun gloss-fetch--collect (term) + "Walk every entry of `gloss-fetch-sources' that maps to a fetcher. +Return the per-source results in walk order. Symbols not registered +in `gloss-fetch--sources' are silently skipped." + (let (results) + (dolist (sym gloss-fetch-sources) + (let ((fetcher (cdr (assq sym gloss-fetch--sources)))) + (when fetcher + (push (funcall fetcher term) results)))) + (nreverse results))) + +(defun gloss-fetch--rollup (per-source) + "Roll up PER-SOURCE results into the user-facing response shape. +Returns (:ok DEFS) when any source returned :ok with non-empty :defs. +Otherwise returns (:empty :no-defs (...) :failed (...))." + (let (ok-defs no-defs failed) + (dolist (entry per-source) + (let ((sym (plist-get entry :source)) + (status (plist-get entry :status))) + (cond + ((and (eq status :ok) (plist-get entry :defs)) + (setq ok-defs (append ok-defs (plist-get entry :defs)))) + ((eq status :no-defs) + (push sym no-defs)) + ((memq status '(:unreachable :server-error :rate-limited)) + (push sym failed))))) + (if ok-defs + (list :ok ok-defs) + (list :empty + :no-defs (nreverse no-defs) + :failed (nreverse failed))))) + +(defun gloss-fetch-definitions (term) + "Fetch candidate definitions for TERM from each source in `gloss-fetch-sources'. +Returns (:ok DEFS) when any source returns at least one definition, +otherwise (:empty :no-defs (SYM ...) :failed (SYM ...)). Signals +`user-error' the first time it runs in a session without libxml, and +on every subsequent call in that session." + (gloss-fetch--ensure-libxml) + (gloss-fetch--rollup (gloss-fetch--collect term))) (provide 'gloss-fetch) ;;; gloss-fetch.el ends here -- cgit v1.2.3