From 99d93203a867294addf4927ceec5644b9d3bf322 Mon Sep 17 00:00:00 2001 From: Craig Jennings Date: Sat, 16 May 2026 05:17:21 -0500 Subject: feat(gptel-tools): wire web_fetch as a local tool Fourth ADOPT entry from `docs/design/gptel-tools-shortlist.org'. Lets gptel pull a URL into the conversation so the model can read docs / current API shapes / etc. without me copy-pasting. Shape: - URL must be `http://' or `https://' (file://, ftp://, javascript:, scheme-less, etc. are rejected at the validator). - HTML responses go through `pandoc -f html -t plain' so the model gets a reading shape that isn't full of markup; falls back to `w3m -dump -T text/html' if pandoc isn't on PATH; signals `user-error' if neither is. Pass `raw=t' to skip stripping. - Output capped at 200KB by default, hard cap 1MB; `max_bytes' argument lets the caller pick a lower cap. Truncation reported inline. - 4xx / 5xx response codes signal `error' with the code -- the alternative is returning an error page body, which the model would treat as content. `:confirm t' on the tool because every call is a real outbound network request. The tool's description warns that URLs go wherever the user-agent points, including internal networks if that's what the URL names. `tests/test-gptel-tools-web-fetch.el' -- 20 tests across Normal / Boundary / Error. URL validator covers http / https / non-string / empty / non-http schemes. `--effective-max-bytes' covers default / low-clamp / hard-cap / passthrough. Truncate helper covers under-cap / at-cap / over-cap with the marker. HTML stripper runs against real pandoc / w3m (both installed in dev env, neither should mangle simple markup). Orchestrator stubs `cj/gptel-web-fetch--retrieve' via `cl-letf' to cover normal / raw / 4xx / 5xx / oversize / bad-scheme paths. Wired into `cj/gptel-local-tool-features' so gptel exposes the tool on next restart. Note: `call-process-region' invocation flattened to a single `with-temp-buffer' with DELETE=t -- the initial draft nested a second temp buffer and routed output to the inner one, which got killed before `buffer-string' on the outer ran. Test caught it. --- tests/test-gptel-tools-web-fetch.el | 158 ++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tests/test-gptel-tools-web-fetch.el (limited to 'tests') diff --git a/tests/test-gptel-tools-web-fetch.el b/tests/test-gptel-tools-web-fetch.el new file mode 100644 index 00000000..0206af3f --- /dev/null +++ b/tests/test-gptel-tools-web-fetch.el @@ -0,0 +1,158 @@ +;;; test-gptel-tools-web-fetch.el --- Tests for web_fetch gptel tool -*- lexical-binding: t; -*- + +;;; Commentary: +;; Validators and helpers tested directly. The orchestrator's network +;; call is stubbed via `cl-letf' on `url-retrieve-synchronously' / the +;; module's `--retrieve' helper; HTML stripping runs against real +;; pandoc / w3m (both are installed in this dev environment, and +;; verifying they don't mangle inputs is the point). + +;;; Code: + +(require 'ert) +(require 'cl-lib) + +(eval-and-compile + (add-to-list 'load-path (expand-file-name "tests" user-emacs-directory)) + (add-to-list 'load-path (expand-file-name "gptel-tools" user-emacs-directory)) + (setq load-prefer-newer t) + (unless (featurep 'gptel) + (defvar gptel-tools nil) + (defun gptel-make-tool (&rest _args) nil) + (defun gptel-get-tool (&rest _args) nil) + (provide 'gptel))) + +(require 'web_fetch) + +;; ---------- validate-url + +(ert-deftest test-gptel-tools-web-fetch-validate-url-http () + "Normal: http URL passes." + (should (equal (cj/gptel-web-fetch--validate-url "http://example.com") + "http://example.com"))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-https () + "Normal: https URL passes." + (should (equal (cj/gptel-web-fetch--validate-url "https://example.com/path") + "https://example.com/path"))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-string () + "Error: non-string URL signals." + (should-error (cj/gptel-web-fetch--validate-url nil)) + (should-error (cj/gptel-web-fetch--validate-url 42))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-empty () + "Error: empty URL signals." + (should-error (cj/gptel-web-fetch--validate-url ""))) + +(ert-deftest test-gptel-tools-web-fetch-validate-url-error-non-http-scheme () + "Error: schemes other than http/https are rejected." + (should-error (cj/gptel-web-fetch--validate-url "file:///etc/hostname")) + (should-error (cj/gptel-web-fetch--validate-url "ftp://example.com")) + (should-error (cj/gptel-web-fetch--validate-url "javascript:alert(1)")) + (should-error (cj/gptel-web-fetch--validate-url "example.com"))) ; no scheme + +;; ---------- effective-max-bytes + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-default-on-nil () + "Boundary: nil falls back to the default cap." + (should (= (cj/gptel-web-fetch--effective-max-bytes nil) + cj/gptel-web-fetch--default-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-clamp-low () + "Boundary: zero / negative fall back to the default." + (should (= (cj/gptel-web-fetch--effective-max-bytes 0) + cj/gptel-web-fetch--default-max-bytes)) + (should (= (cj/gptel-web-fetch--effective-max-bytes -1) + cj/gptel-web-fetch--default-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-cap-high () + "Boundary: values above the hard cap are clamped." + (should (= (cj/gptel-web-fetch--effective-max-bytes (* 10 1024 1024)) + cj/gptel-web-fetch--hard-max-bytes))) + +(ert-deftest test-gptel-tools-web-fetch-max-bytes-normal () + "Normal: a sensible value passes through." + (should (= (cj/gptel-web-fetch--effective-max-bytes 50000) 50000))) + +;; ---------- truncate + +(ert-deftest test-gptel-tools-web-fetch-truncate-under-cap () + "Normal: small input returns unchanged." + (should (equal (cj/gptel-web-fetch--truncate "short" 1000) "short"))) + +(ert-deftest test-gptel-tools-web-fetch-truncate-at-cap () + "Boundary: input exactly at cap returns unchanged." + (let ((s (make-string 10 ?x))) + (should (equal (cj/gptel-web-fetch--truncate s 10) s)))) + +(ert-deftest test-gptel-tools-web-fetch-truncate-over-cap () + "Boundary: oversize input is truncated and marked." + (let* ((s (make-string 1000 ?x)) + (out (cj/gptel-web-fetch--truncate s 100))) + (should (string-match-p "\\[truncated:" out)) + (should (string-match-p "1000 bytes total" out)))) + +;; ---------- html-to-text + +(ert-deftest test-gptel-tools-web-fetch-html-to-text-strips-tags () + "Normal: pandoc / w3m strip HTML tags from real markup." + (let ((out (cj/gptel-web-fetch--html-to-text + "

Hello

World

"))) + (should (string-match-p "Hello" out)) + (should (string-match-p "World" out)) + (should-not (string-match-p "

" out)) + (should-not (string-match-p "

" out)))) + +(ert-deftest test-gptel-tools-web-fetch-html-to-text-error-when-neither-on-path () + "Error: when neither pandoc nor w3m is on PATH, signals user-error." + (cl-letf (((symbol-function 'executable-find) (lambda (_) nil))) + (should-error (cj/gptel-web-fetch--html-to-text "

x

")))) + +;; ---------- run (orchestrator) + +(ert-deftest test-gptel-tools-web-fetch-run-normal-strips-html () + "Normal: orchestrator returns stripped text by default." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) + (cons 200 "

fetched

")))) + (let ((out (cj/gptel-web-fetch--run "https://example.com"))) + (should (string-match-p "fetched" out)) + (should-not (string-match-p "

" out))))) + +(ert-deftest test-gptel-tools-web-fetch-run-raw-returns-body-verbatim () + "Normal: raw=t returns the response body without HTML stripping." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) + (cons 200 "

raw

")))) + (let ((out (cj/gptel-web-fetch--run "https://example.com" t))) + (should (string-match-p "

raw

" out))))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-4xx () + "Error: HTTP 4xx response signals." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 404 "not found")))) + (should-error (cj/gptel-web-fetch--run "https://example.com")))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-5xx () + "Error: HTTP 5xx response signals." + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 503 "service unavailable")))) + (should-error (cj/gptel-web-fetch--run "https://example.com")))) + +(ert-deftest test-gptel-tools-web-fetch-run-truncates-oversized-body () + "Boundary: an oversize body is truncated by the run wrapper." + (let ((big (concat "" + (make-string 1000 ?x) + ""))) + (cl-letf (((symbol-function 'cj/gptel-web-fetch--retrieve) + (lambda (_url) (cons 200 big)))) + (let ((out (cj/gptel-web-fetch--run "https://example.com" t 200))) + (should (string-match-p "\\[truncated:" out)))))) + +(ert-deftest test-gptel-tools-web-fetch-run-error-on-bad-scheme () + "Error: non-http URL fails fast at the validator." + (should-error (cj/gptel-web-fetch--run "file:///etc/passwd"))) + +(provide 'test-gptel-tools-web-fetch) +;;; test-gptel-tools-web-fetch.el ends here -- cgit v1.2.3