diff options
| author | Craig Jennings <c@cjennings.net> | 2026-05-16 05:17:21 -0500 |
|---|---|---|
| committer | Craig Jennings <c@cjennings.net> | 2026-05-16 05:17:21 -0500 |
| commit | 99d93203a867294addf4927ceec5644b9d3bf322 (patch) | |
| tree | f579382868960c77cd38eb350fba4dfdaab13c21 | |
| parent | ceeae9b5e2625e23e6e3792d06a6c8122a36d18b (diff) | |
| download | dotemacs-99d93203a867294addf4927ceec5644b9d3bf322.tar.gz dotemacs-99d93203a867294addf4927ceec5644b9d3bf322.zip | |
feat(gptel-tools): wire web_fetch as a local tool
Fourth ADOPT entry from `docs/design/gptel-tools-shortlist.org'.
Lets gptel pull a URL into the conversation so the model can read
docs / current API shapes / etc. without me copy-pasting.
Shape:
- URL must be `http://' or `https://' (file://, ftp://, javascript:,
scheme-less, etc. are rejected at the validator).
- HTML responses go through `pandoc -f html -t plain' so the model
gets a reading shape that isn't full of markup; falls back to
`w3m -dump -T text/html' if pandoc isn't on PATH; signals
`user-error' if neither is. Pass `raw=t' to skip stripping.
- Output capped at 200KB by default, hard cap 1MB; `max_bytes'
argument lets the caller pick a lower cap. Truncation reported
inline.
- 4xx / 5xx response codes signal `error' with the code -- the
alternative is returning an error page body, which the model
would treat as content.
`:confirm t' on the tool because every call is a real outbound
network request. The tool's description warns that URLs go
wherever the user-agent points, including internal networks if
that's what the URL names.
`tests/test-gptel-tools-web-fetch.el' -- 20 tests across Normal /
Boundary / Error. URL validator covers http / https / non-string
/ empty / non-http schemes. `--effective-max-bytes' covers default
/ low-clamp / hard-cap / passthrough. Truncate helper covers
under-cap / at-cap / over-cap with the marker. HTML stripper runs
against real pandoc / w3m (both installed in dev env, neither
should mangle simple markup). Orchestrator stubs
`cj/gptel-web-fetch--retrieve' via `cl-letf' to cover normal /
raw / 4xx / 5xx / oversize / bad-scheme paths.
Wired into `cj/gptel-local-tool-features' so gptel exposes the
tool on next restart.
Note: `call-process-region' invocation flattened to a single
`with-temp-buffer' with DELETE=t -- the initial draft nested a
second temp buffer and routed output to the inner one, which got
killed before `buffer-string' on the outer ran. Test caught it.
| -rw-r--r-- | gptel-tools/web_fetch.el | 150 | ||||
| -rw-r--r-- | modules/ai-config.el | 3 | ||||
| -rw-r--r-- | tests/test-gptel-tools-web-fetch.el | 158 |
3 files changed, 310 insertions, 1 deletions
diff --git a/gptel-tools/web_fetch.el b/gptel-tools/web_fetch.el new file mode 100644 index 00000000..1f950a31 --- /dev/null +++ b/gptel-tools/web_fetch.el @@ -0,0 +1,150 @@ +;;; web_fetch.el --- Web fetch tool for gptel -*- coding: utf-8; lexical-binding: t; -*- + +;; Author: Craig Jennings <c@cjennings.net> +;; Keywords: convenience, tools, web + +;; This file is not part of GNU Emacs. + +;;; Commentary: + +;; Gptel tool that fetches an HTTP/HTTPS URL and returns its body. +;; HTML is piped through `pandoc -f html -t plain' (falling back to +;; `w3m -dump -T text/html') so the model gets a reading shape that +;; isn't full of markup; pass RAW=t to skip stripping and get the +;; verbatim response. Output is capped at 200KB by default (hard cap +;; 1MB) and the cap is reported inline when triggered. +;; +;; This tool is `:confirm t' because it makes outbound network +;; requests -- the user sees every URL before the fetch happens. The +;; URL goes wherever the user-agent points it, including internal +;; networks if the URL names one; consider the network posture before +;; approving sensitive endpoints. + +;;; Code: + +(require 'gptel) +(require 'url) + +(defconst cj/gptel-web-fetch--default-max-bytes (* 200 1024) + "Default cap on returned body size. ~200KB.") + +(defconst cj/gptel-web-fetch--hard-max-bytes (* 1024 1024) + "Hard upper bound on the user-controllable byte cap. 1MB.") + +(defun cj/gptel-web-fetch--validate-url (url) + "Validate URL as an http or https request target. Return URL on success. +Signals `user-error' for non-string, empty, or non-http/https URLs." + (unless (and (stringp url) (not (string-empty-p url))) + (user-error "web_fetch: expected non-empty URL string, got %S" url)) + (unless (string-match-p "\\`https?://[^[:space:]]+\\'" url) + (user-error "web_fetch: URL must be http:// or https://, got %S" url)) + url) + +(defun cj/gptel-web-fetch--effective-max-bytes (n) + "Return the byte cap to use given caller-supplied N. +Nil / non-integer / out-of-range → default. Above hard cap → hard cap." + (cond + ((not (integerp n)) cj/gptel-web-fetch--default-max-bytes) + ((< n 1) cj/gptel-web-fetch--default-max-bytes) + ((> n cj/gptel-web-fetch--hard-max-bytes) cj/gptel-web-fetch--hard-max-bytes) + (t n))) + +(defun cj/gptel-web-fetch--retrieve (url) + "Synchronously GET URL. Return a cons (STATUS-CODE . BODY). +Signals on network failure. STATUS-CODE is an integer when parseable +from the response status line, or nil when the line is unrecognized." + (let ((buf (url-retrieve-synchronously url t t 30))) + (unless buf + (error "web_fetch: no response from %s" url)) + (unwind-protect + (with-current-buffer buf + (goto-char (point-min)) + (let* ((status (when (re-search-forward + "^HTTP/[0-9.]+ \\([0-9]+\\)" (point-max) t) + (string-to-number (match-string 1)))) + (body-start (when (re-search-forward "\n\n" nil t) + (point)))) + (cons status + (if body-start + (buffer-substring-no-properties body-start (point-max)) + (buffer-substring-no-properties (point-min) (point-max)))))) + (kill-buffer buf)))) + +(defun cj/gptel-web-fetch--html-to-text (html) + "Strip HTML to plain text. Returns the stripped string. +Tries `pandoc -f html -t plain' first, falls back to +`w3m -dump -T text/html'. Signals `user-error' if neither is +on PATH." + (let* ((coding-system-for-write 'utf-8) + (coding-system-for-read 'utf-8) + (tool (cond + ((executable-find "pandoc") + (list "pandoc" "-f" "html" "-t" "plain")) + ((executable-find "w3m") + (list "w3m" "-dump" "-T" "text/html")) + (t nil)))) + (unless tool + (user-error + "web_fetch: HTML stripping needs pandoc or w3m on PATH; pass raw=t to bypass")) + ;; `call-process-region' with DELETE=t and OUTPUT=t replaces the + ;; input range with the tool's output, so `buffer-string' returns + ;; the stripped text. + (with-temp-buffer + (insert html) + (let ((exit (apply #'call-process-region + (point-min) (point-max) (car tool) + t t nil (cdr tool)))) + (if (zerop exit) + (buffer-string) + (error "web_fetch: %s exited with %d" (car tool) exit)))))) + +(defun cj/gptel-web-fetch--truncate (text max-bytes) + "Truncate TEXT to MAX-BYTES. Returns TEXT unchanged when under the cap." + (if (<= (length text) max-bytes) + text + (concat (substring text 0 max-bytes) + (format + "\n\n[truncated: response exceeded %d bytes; %d bytes total]" + max-bytes (length text))))) + +(defun cj/gptel-web-fetch--run (url &optional raw max-bytes) + "Fetch URL and return its body. +When RAW is nil (the default) HTML responses are stripped to plain +text via pandoc or w3m. MAX-BYTES caps the returned size; nil / +out-of-range falls back to the default 200KB cap." + (let* ((validated (cj/gptel-web-fetch--validate-url url)) + (cap (cj/gptel-web-fetch--effective-max-bytes max-bytes)) + (response (cj/gptel-web-fetch--retrieve validated)) + (status (car response)) + (body (cdr response))) + (when (and status (>= status 400)) + (error "web_fetch: HTTP %d from %s" status validated)) + (let ((text (if raw body + (cj/gptel-web-fetch--html-to-text body)))) + (cj/gptel-web-fetch--truncate text cap)))) + +(with-eval-after-load 'gptel + (gptel-make-tool + :name "web_fetch" + :function (lambda (url &optional raw max_bytes) + (cj/gptel-web-fetch--run url raw max_bytes)) + :description "Fetch an http:// or https:// URL and return its body. HTML responses are stripped to plain text via pandoc (or w3m as a fallback); pass raw=true to skip stripping. Output is capped at 200KB by default (max 1MB); the cap is reported inline when triggered. Network call: the URL goes wherever the user-agent points, including internal networks if specified." + :args (list '(:name "url" + :type string + :description "HTTP or HTTPS URL to fetch. Non-http schemes are rejected.") + '(:name "raw" + :type boolean + :description "When true, return the response body verbatim without HTML stripping. Default false." + :optional t) + '(:name "max_bytes" + :type integer + :description "Output size cap in bytes. Defaults to 200000; hard-capped at 1048576." + :optional t)) + :category "web" + :confirm t + :include t) + + (add-to-list 'gptel-tools (gptel-get-tool '("web" "web_fetch")))) + +(provide 'web_fetch) +;;; web_fetch.el ends here diff --git a/modules/ai-config.el b/modules/ai-config.el index a04a32a0..02c85e5c 100644 --- a/modules/ai-config.el +++ b/modules/ai-config.el @@ -62,7 +62,8 @@ move_to_trash git_status git_log - git_diff) + git_diff + web_fetch) "Feature symbols for optional local GPTel tool modules." :type '(repeat symbol) :group 'cj) diff --git a/tests/test-gptel-tools-web-fetch.el b/tests/test-gptel-tools-web-fetch.el new file mode 100644 index 00000000..0206af3f --- /dev/null +++ b/tests/test-gptel-tools-web-fetch.el @@ -0,0 +1,158 @@ +;;; test-gptel-tools-web-fetch.el --- Tests for web_fetch gptel tool -*- lexical-binding: t; -*- + +;;; Commentary: +;; Validators and helpers tested directly. The orchestrator's network +;; call is stubbed via `cl-letf' on `url-retrieve-synchronously' / the +;; module's `--retrieve' helper; HTML stripping runs against real +;; pandoc / w3m (both are installed in this dev environment, and +;; verifying they don't mangle inputs is the point). + +;;; Code: + +(require 'ert) +(require 'cl-lib) + +(eval-and-compile + (add-to-list 'load-path (expand-file-name "tests" user-emacs-directory)) + (add-to-list 'load-path (expand-file-name "gptel-tools" user-emacs-directory)) + (setq load-prefer-newer t) + (unless (featurep 'gptel) + (defvar gptel-tools nil) + (defun gptel-make-tool (&rest _args) nil) + (defun gptel-get-tool (&rest _args) nil) + (provide 'gptel))) + +(require 'web_fetch) + +;; ---------- validate-url + +(ert-deftest test-gptel-tools-web-fetch-validate-url-http () + "Normal: http URL passes." + (should (equal (cj/gptel-web-fetch--validate-url "http://example.com") + "http://example.com"))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-https () + "Normal: https URL passes." + (should (equal (cj/gptel-web-fetch--validate-url "https://example.com/path") + "https://example.com/path"))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-string () + "Error: non-string URL signals." + (should-error (cj/gptel-web-fetch--validate-url nil)) + (should-error (cj/gptel-web-fetch--validate-url 42))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-empty () + "Error: empty URL signals." + (should-error (cj/gptel-web-fetch--validate-url ""))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-http-scheme () + "Error: schemes other than http/https are rejected." + (should-error (cj/gptel-web-fetch--validate-url "file:///etc/hostname")) + (should-error (cj/gptel-web-fetch--validate-url "ftp://example.com")) + (should-error (cj/gptel-web-fetch--validate-url "javascript:alert(1)")) + (should-error (cj/gptel-web-fetch--validate-url "example.com"))) ; no scheme + +;; ---------- effective-max-bytes + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-default-on-nil () + "Boundary: nil falls back to the default cap." + (should (= (cj/gptel-web-fetch--effective-max-bytes nil) + cj/gptel-web-fetch--default-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-clamp-low () + "Boundary: zero / negative fall back to the default." + (should (= (cj/gptel-web-fetch--effective-max-bytes 0) + cj/gptel-web-fetch--default-max-bytes)) + (should (= (cj/gptel-web-fetch--effective-max-bytes -1) + cj/gptel-web-fetch--default-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-cap-high () + "Boundary: values above the hard cap are clamped." + (should (= (cj/gptel-web-fetch--effective-max-bytes (* 10 1024 1024)) + cj/gptel-web-fetch--hard-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-normal () + "Normal: a sensible value passes through." + (should (= (cj/gptel-web-fetch--effective-max-bytes 50000) 50000))) + +;; ---------- truncate + +(ert-deftest test-gptel-tools-web-fetch-truncate-under-cap () + "Normal: small input returns unchanged." + (should (equal (cj/gptel-web-fetch--truncate "short" 1000) "short"))) + +(ert-deftest test-gptel-tools-web-fetch-truncate-at-cap () + "Boundary: input exactly at cap returns unchanged." + (let ((s (make-string 10 ?x))) + (should (equal (cj/gptel-web-fetch--truncate s 10) s)))) + +(ert-deftest test-gptel-tools-web-fetch-truncate-over-cap () + "Boundary: oversize input is truncated and marked." + (let* ((s (make-string 1000 ?x)) + (out (cj/gptel-web-fetch--truncate s 100))) + (should (string-match-p "\\[truncated:" out)) + (should (string-match-p "1000 bytes total" out)))) + +;; ---------- html-to-text + +(ert-deftest test-gptel-tools-web-fetch-html-to-text-strips-tags () + "Normal: pandoc / w3m strip HTML tags from real markup." + (let ((out (cj/gptel-web-fetch--html-to-text + "<html><body><h1>Hello</h1><p>World</p></body></html>"))) + (should (string-match-p "Hello" out)) + (should (string-match-p "World" out)) + (should-not (string-match-p "<h1>" out)) + (should-not (string-match-p "<p>" out)))) + +(ert-deftest test-gptel-tools-web-fetch-html-to-text-error-when-neither-on-path () + "Error: when neither pandoc nor w3m is on PATH, signals user-error." + (cl-letf (((symbol-function 'executable-find) (lambda (_) nil))) + (should-error (cj/gptel-web-fetch--html-to-text "<p>x</p>")))) + +;; ---------- run (orchestrator) + +(ert-deftest test-gptel-tools-web-fetch-run-normal-strips-html () + "Normal: orchestrator returns stripped text by default." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) + (cons 200 "<html><body><p>fetched</p></body></html>")))) + (let ((out (cj/gptel-web-fetch--run "https://example.com"))) + (should (string-match-p "fetched" out)) + (should-not (string-match-p "<p>" out))))) + +(ert-deftest test-gptel-tools-web-fetch-run-raw-returns-body-verbatim () + "Normal: raw=t returns the response body without HTML stripping." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) + (cons 200 "<html><body><p>raw</p></body></html>")))) + (let ((out (cj/gptel-web-fetch--run "https://example.com" t))) + (should (string-match-p "<p>raw</p>" out))))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-4xx () + "Error: HTTP 4xx response signals." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 404 "not found")))) + (should-error (cj/gptel-web-fetch--run "https://example.com")))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-5xx () + "Error: HTTP 5xx response signals." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 503 "service unavailable")))) + (should-error (cj/gptel-web-fetch--run "https://example.com")))) + +(ert-deftest test-gptel-tools-web-fetch-run-truncates-oversized-body () + "Boundary: an oversize body is truncated by the run wrapper." + (let ((big (concat "<html><body>" + (make-string 1000 ?x) + "</body></html>"))) + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 200 big)))) + (let ((out (cj/gptel-web-fetch--run "https://example.com" t 200))) + (should (string-match-p "\\[truncated:" out)))))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-bad-scheme () + "Error: non-http URL fails fast at the validator." + (should-error (cj/gptel-web-fetch--run "file:///etc/passwd"))) + +(provide 'test-gptel-tools-web-fetch) +;;; test-gptel-tools-web-fetch.el ends here |
