summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCraig Jennings <c@cjennings.net>2026-05-16 05:17:21 -0500
committerCraig Jennings <c@cjennings.net>2026-05-16 05:17:21 -0500
commit99d93203a867294addf4927ceec5644b9d3bf322 (patch)
treef579382868960c77cd38eb350fba4dfdaab13c21
parentceeae9b5e2625e23e6e3792d06a6c8122a36d18b (diff)
downloaddotemacs-99d93203a867294addf4927ceec5644b9d3bf322.tar.gz
dotemacs-99d93203a867294addf4927ceec5644b9d3bf322.zip
feat(gptel-tools): wire web_fetch as a local tool
Fourth ADOPT entry from `docs/design/gptel-tools-shortlist.org'. Lets gptel pull a URL into the conversation so the model can read docs / current API shapes / etc. without me copy-pasting. Shape: - URL must be `http://' or `https://' (file://, ftp://, javascript:, scheme-less, etc. are rejected at the validator). - HTML responses go through `pandoc -f html -t plain' so the model gets a reading shape that isn't full of markup; falls back to `w3m -dump -T text/html' if pandoc isn't on PATH; signals `user-error' if neither is. Pass `raw=t' to skip stripping. - Output capped at 200KB by default, hard cap 1MB; `max_bytes' argument lets the caller pick a lower cap. Truncation reported inline. - 4xx / 5xx response codes signal `error' with the code -- the alternative is returning an error page body, which the model would treat as content. `:confirm t' on the tool because every call is a real outbound network request. The tool's description warns that URLs go wherever the user-agent points, including internal networks if that's what the URL names. `tests/test-gptel-tools-web-fetch.el' -- 20 tests across Normal / Boundary / Error. URL validator covers http / https / non-string / empty / non-http schemes. `--effective-max-bytes' covers default / low-clamp / hard-cap / passthrough. Truncate helper covers under-cap / at-cap / over-cap with the marker. HTML stripper runs against real pandoc / w3m (both installed in dev env, neither should mangle simple markup). Orchestrator stubs `cj/gptel-web-fetch--retrieve' via `cl-letf' to cover normal / raw / 4xx / 5xx / oversize / bad-scheme paths. Wired into `cj/gptel-local-tool-features' so gptel exposes the tool on next restart. Note: `call-process-region' invocation flattened to a single `with-temp-buffer' with DELETE=t -- the initial draft nested a second temp buffer and routed output to the inner one, which got killed before `buffer-string' on the outer ran. Test caught it.
-rw-r--r--gptel-tools/web_fetch.el150
-rw-r--r--modules/ai-config.el3
-rw-r--r--tests/test-gptel-tools-web-fetch.el158
3 files changed, 310 insertions, 1 deletions
diff --git a/gptel-tools/web_fetch.el b/gptel-tools/web_fetch.el
new file mode 100644
index 00000000..1f950a31
--- /dev/null
+++ b/gptel-tools/web_fetch.el
@@ -0,0 +1,150 @@
+;;; web_fetch.el --- Web fetch tool for gptel -*- coding: utf-8; lexical-binding: t; -*-
+
+;; Author: Craig Jennings <c@cjennings.net>
+;; Keywords: convenience, tools, web
+
+;; This file is not part of GNU Emacs.
+
+;;; Commentary:
+
+;; Gptel tool that fetches an HTTP/HTTPS URL and returns its body.
+;; HTML is piped through `pandoc -f html -t plain' (falling back to
+;; `w3m -dump -T text/html') so the model gets a reading shape that
+;; isn't full of markup; pass RAW=t to skip stripping and get the
+;; verbatim response. Output is capped at 200KB by default (hard cap
+;; 1MB) and the cap is reported inline when triggered.
+;;
+;; This tool is `:confirm t' because it makes outbound network
+;; requests -- the user sees every URL before the fetch happens. The
+;; URL goes wherever the user-agent points it, including internal
+;; networks if the URL names one; consider the network posture before
+;; approving sensitive endpoints.
+
+;;; Code:
+
+(require 'gptel)
+(require 'url)
+
+(defconst cj/gptel-web-fetch--default-max-bytes (* 200 1024)
+ "Default cap on returned body size. ~200KB.")
+
+(defconst cj/gptel-web-fetch--hard-max-bytes (* 1024 1024)
+ "Hard upper bound on the user-controllable byte cap. 1MB.")
+
+(defun cj/gptel-web-fetch--validate-url (url)
+ "Validate URL as an http or https request target. Return URL on success.
+Signals `user-error' for non-string, empty, or non-http/https URLs."
+ (unless (and (stringp url) (not (string-empty-p url)))
+ (user-error "web_fetch: expected non-empty URL string, got %S" url))
+ (unless (string-match-p "\\`https?://[^[:space:]]+\\'" url)
+ (user-error "web_fetch: URL must be http:// or https://, got %S" url))
+ url)
+
+(defun cj/gptel-web-fetch--effective-max-bytes (n)
+ "Return the byte cap to use given caller-supplied N.
+Nil / non-integer / out-of-range → default. Above hard cap → hard cap."
+ (cond
+ ((not (integerp n)) cj/gptel-web-fetch--default-max-bytes)
+ ((< n 1) cj/gptel-web-fetch--default-max-bytes)
+ ((> n cj/gptel-web-fetch--hard-max-bytes) cj/gptel-web-fetch--hard-max-bytes)
+ (t n)))
+
+(defun cj/gptel-web-fetch--retrieve (url)
+ "Synchronously GET URL. Return a cons (STATUS-CODE . BODY).
+Signals on network failure. STATUS-CODE is an integer when parseable
+from the response status line, or nil when the line is unrecognized."
+ (let ((buf (url-retrieve-synchronously url t t 30)))
+ (unless buf
+ (error "web_fetch: no response from %s" url))
+ (unwind-protect
+ (with-current-buffer buf
+ (goto-char (point-min))
+ (let* ((status (when (re-search-forward
+ "^HTTP/[0-9.]+ \\([0-9]+\\)" (point-max) t)
+ (string-to-number (match-string 1))))
+ (body-start (when (re-search-forward "\n\n" nil t)
+ (point))))
+ (cons status
+ (if body-start
+ (buffer-substring-no-properties body-start (point-max))
+ (buffer-substring-no-properties (point-min) (point-max))))))
+ (kill-buffer buf))))
+
+(defun cj/gptel-web-fetch--html-to-text (html)
+ "Strip HTML to plain text. Returns the stripped string.
+Tries `pandoc -f html -t plain' first, falls back to
+`w3m -dump -T text/html'. Signals `user-error' if neither is
+on PATH."
+ (let* ((coding-system-for-write 'utf-8)
+ (coding-system-for-read 'utf-8)
+ (tool (cond
+ ((executable-find "pandoc")
+ (list "pandoc" "-f" "html" "-t" "plain"))
+ ((executable-find "w3m")
+ (list "w3m" "-dump" "-T" "text/html"))
+ (t nil))))
+ (unless tool
+ (user-error
+ "web_fetch: HTML stripping needs pandoc or w3m on PATH; pass raw=t to bypass"))
+ ;; `call-process-region' with DELETE=t and OUTPUT=t replaces the
+ ;; input range with the tool's output, so `buffer-string' returns
+ ;; the stripped text.
+ (with-temp-buffer
+ (insert html)
+ (let ((exit (apply #'call-process-region
+ (point-min) (point-max) (car tool)
+ t t nil (cdr tool))))
+ (if (zerop exit)
+ (buffer-string)
+ (error "web_fetch: %s exited with %d" (car tool) exit))))))
+
+(defun cj/gptel-web-fetch--truncate (text max-bytes)
+ "Truncate TEXT to MAX-BYTES. Returns TEXT unchanged when under the cap."
+ (if (<= (length text) max-bytes)
+ text
+ (concat (substring text 0 max-bytes)
+ (format
+ "\n\n[truncated: response exceeded %d bytes; %d bytes total]"
+ max-bytes (length text)))))
+
+(defun cj/gptel-web-fetch--run (url &optional raw max-bytes)
+ "Fetch URL and return its body.
+When RAW is nil (the default) HTML responses are stripped to plain
+text via pandoc or w3m. MAX-BYTES caps the returned size; nil /
+out-of-range falls back to the default 200KB cap."
+ (let* ((validated (cj/gptel-web-fetch--validate-url url))
+ (cap (cj/gptel-web-fetch--effective-max-bytes max-bytes))
+ (response (cj/gptel-web-fetch--retrieve validated))
+ (status (car response))
+ (body (cdr response)))
+ (when (and status (>= status 400))
+ (error "web_fetch: HTTP %d from %s" status validated))
+ (let ((text (if raw body
+ (cj/gptel-web-fetch--html-to-text body))))
+ (cj/gptel-web-fetch--truncate text cap))))
+
+(with-eval-after-load 'gptel
+ (gptel-make-tool
+ :name "web_fetch"
+ :function (lambda (url &optional raw max_bytes)
+ (cj/gptel-web-fetch--run url raw max_bytes))
+ :description "Fetch an http:// or https:// URL and return its body. HTML responses are stripped to plain text via pandoc (or w3m as a fallback); pass raw=true to skip stripping. Output is capped at 200KB by default (max 1MB); the cap is reported inline when triggered. Network call: the URL goes wherever the user-agent points, including internal networks if specified."
+ :args (list '(:name "url"
+ :type string
+ :description "HTTP or HTTPS URL to fetch. Non-http schemes are rejected.")
+ '(:name "raw"
+ :type boolean
+ :description "When true, return the response body verbatim without HTML stripping. Default false."
+ :optional t)
+ '(:name "max_bytes"
+ :type integer
+ :description "Output size cap in bytes. Defaults to 200000; hard-capped at 1048576."
+ :optional t))
+ :category "web"
+ :confirm t
+ :include t)
+
+ (add-to-list 'gptel-tools (gptel-get-tool '("web" "web_fetch"))))
+
+(provide 'web_fetch)
+;;; web_fetch.el ends here
diff --git a/modules/ai-config.el b/modules/ai-config.el
index a04a32a0..02c85e5c 100644
--- a/modules/ai-config.el
+++ b/modules/ai-config.el
@@ -62,7 +62,8 @@
move_to_trash
git_status
git_log
- git_diff)
+ git_diff
+ web_fetch)
"Feature symbols for optional local GPTel tool modules."
:type '(repeat symbol)
:group 'cj)
diff --git a/tests/test-gptel-tools-web-fetch.el b/tests/test-gptel-tools-web-fetch.el
new file mode 100644
index 00000000..0206af3f
--- /dev/null
+++ b/tests/test-gptel-tools-web-fetch.el
@@ -0,0 +1,158 @@
+;;; test-gptel-tools-web-fetch.el --- Tests for web_fetch gptel tool -*- lexical-binding: t; -*-
+
+;;; Commentary:
+;; Validators and helpers tested directly. The orchestrator's network
+;; call is stubbed via `cl-letf' on `url-retrieve-synchronously' / the
+;; module's `--retrieve' helper; HTML stripping runs against real
+;; pandoc / w3m (both are installed in this dev environment, and
+;; verifying they don't mangle inputs is the point).
+
+;;; Code:
+
+(require 'ert)
+(require 'cl-lib)
+
+(eval-and-compile
+ (add-to-list 'load-path (expand-file-name "tests" user-emacs-directory))
+ (add-to-list 'load-path (expand-file-name "gptel-tools" user-emacs-directory))
+ (setq load-prefer-newer t)
+ (unless (featurep 'gptel)
+ (defvar gptel-tools nil)
+ (defun gptel-make-tool (&rest _args) nil)
+ (defun gptel-get-tool (&rest _args) nil)
+ (provide 'gptel)))
+
+(require 'web_fetch)
+
+;; ---------- validate-url
+
+(ert-deftest test-gptel-tools-web-fetch-validate-url-http ()
+ "Normal: http URL passes."
+ (should (equal (cj/gptel-web-fetch--validate-url "http://example.com")
+ "http://example.com")))
+
+(ert-deftest test-gptel-tools-web-fetch-validate-url-https ()
+ "Normal: https URL passes."
+ (should (equal (cj/gptel-web-fetch--validate-url "https://example.com/path")
+ "https://example.com/path")))
+
+(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-string ()
+ "Error: non-string URL signals."
+ (should-error (cj/gptel-web-fetch--validate-url nil))
+ (should-error (cj/gptel-web-fetch--validate-url 42)))
+
+(ert-deftest test-gptel-tools-web-fetch-validate-url-error-empty ()
+ "Error: empty URL signals."
+ (should-error (cj/gptel-web-fetch--validate-url "")))
+
+(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-http-scheme ()
+ "Error: schemes other than http/https are rejected."
+ (should-error (cj/gptel-web-fetch--validate-url "file:///etc/hostname"))
+ (should-error (cj/gptel-web-fetch--validate-url "ftp://example.com"))
+ (should-error (cj/gptel-web-fetch--validate-url "javascript:alert(1)"))
+ (should-error (cj/gptel-web-fetch--validate-url "example.com"))) ; no scheme
+
+;; ---------- effective-max-bytes
+
+(ert-deftest test-gptel-tools-web-fetch-max-bytes-default-on-nil ()
+ "Boundary: nil falls back to the default cap."
+ (should (= (cj/gptel-web-fetch--effective-max-bytes nil)
+ cj/gptel-web-fetch--default-max-bytes)))
+
+(ert-deftest test-gptel-tools-web-fetch-max-bytes-clamp-low ()
+ "Boundary: zero / negative fall back to the default."
+ (should (= (cj/gptel-web-fetch--effective-max-bytes 0)
+ cj/gptel-web-fetch--default-max-bytes))
+ (should (= (cj/gptel-web-fetch--effective-max-bytes -1)
+ cj/gptel-web-fetch--default-max-bytes)))
+
+(ert-deftest test-gptel-tools-web-fetch-max-bytes-cap-high ()
+ "Boundary: values above the hard cap are clamped."
+ (should (= (cj/gptel-web-fetch--effective-max-bytes (* 10 1024 1024))
+ cj/gptel-web-fetch--hard-max-bytes)))
+
+(ert-deftest test-gptel-tools-web-fetch-max-bytes-normal ()
+ "Normal: a sensible value passes through."
+ (should (= (cj/gptel-web-fetch--effective-max-bytes 50000) 50000)))
+
+;; ---------- truncate
+
+(ert-deftest test-gptel-tools-web-fetch-truncate-under-cap ()
+ "Normal: small input returns unchanged."
+ (should (equal (cj/gptel-web-fetch--truncate "short" 1000) "short")))
+
+(ert-deftest test-gptel-tools-web-fetch-truncate-at-cap ()
+ "Boundary: input exactly at cap returns unchanged."
+ (let ((s (make-string 10 ?x)))
+ (should (equal (cj/gptel-web-fetch--truncate s 10) s))))
+
+(ert-deftest test-gptel-tools-web-fetch-truncate-over-cap ()
+ "Boundary: oversize input is truncated and marked."
+ (let* ((s (make-string 1000 ?x))
+ (out (cj/gptel-web-fetch--truncate s 100)))
+ (should (string-match-p "\\[truncated:" out))
+ (should (string-match-p "1000 bytes total" out))))
+
+;; ---------- html-to-text
+
+(ert-deftest test-gptel-tools-web-fetch-html-to-text-strips-tags ()
+ "Normal: pandoc / w3m strip HTML tags from real markup."
+ (let ((out (cj/gptel-web-fetch--html-to-text
+ "<html><body><h1>Hello</h1><p>World</p></body></html>")))
+ (should (string-match-p "Hello" out))
+ (should (string-match-p "World" out))
+ (should-not (string-match-p "<h1>" out))
+ (should-not (string-match-p "<p>" out))))
+
+(ert-deftest test-gptel-tools-web-fetch-html-to-text-error-when-neither-on-path ()
+ "Error: when neither pandoc nor w3m is on PATH, signals user-error."
+ (cl-letf (((symbol-function 'executable-find) (lambda (_) nil)))
+ (should-error (cj/gptel-web-fetch--html-to-text "<p>x</p>"))))
+
+;; ---------- run (orchestrator)
+
+(ert-deftest test-gptel-tools-web-fetch-run-normal-strips-html ()
+ "Normal: orchestrator returns stripped text by default."
+ (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve)
+ (lambda (_url)
+ (cons 200 "<html><body><p>fetched</p></body></html>"))))
+ (let ((out (cj/gptel-web-fetch--run "https://example.com")))
+ (should (string-match-p "fetched" out))
+ (should-not (string-match-p "<p>" out)))))
+
+(ert-deftest test-gptel-tools-web-fetch-run-raw-returns-body-verbatim ()
+ "Normal: raw=t returns the response body without HTML stripping."
+ (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve)
+ (lambda (_url)
+ (cons 200 "<html><body><p>raw</p></body></html>"))))
+ (let ((out (cj/gptel-web-fetch--run "https://example.com" t)))
+ (should (string-match-p "<p>raw</p>" out)))))
+
+(ert-deftest test-gptel-tools-web-fetch-run-error-on-4xx ()
+ "Error: HTTP 4xx response signals."
+ (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve)
+ (lambda (_url) (cons 404 "not found"))))
+ (should-error (cj/gptel-web-fetch--run "https://example.com"))))
+
+(ert-deftest test-gptel-tools-web-fetch-run-error-on-5xx ()
+ "Error: HTTP 5xx response signals."
+ (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve)
+ (lambda (_url) (cons 503 "service unavailable"))))
+ (should-error (cj/gptel-web-fetch--run "https://example.com"))))
+
+(ert-deftest test-gptel-tools-web-fetch-run-truncates-oversized-body ()
+ "Boundary: an oversize body is truncated by the run wrapper."
+ (let ((big (concat "<html><body>"
+ (make-string 1000 ?x)
+ "</body></html>")))
+ (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve)
+ (lambda (_url) (cons 200 big))))
+ (let ((out (cj/gptel-web-fetch--run "https://example.com" t 200)))
+ (should (string-match-p "\\[truncated:" out))))))
+
+(ert-deftest test-gptel-tools-web-fetch-run-error-on-bad-scheme ()
+ "Error: non-http URL fails fast at the validator."
+ (should-error (cj/gptel-web-fetch--run "file:///etc/passwd")))
+
+(provide 'test-gptel-tools-web-fetch)
+;;; test-gptel-tools-web-fetch.el ends here