aboutsummaryrefslogtreecommitdiff
path: root/gptel-tools
diff options
context:
space:
mode:
authorCraig Jennings <c@cjennings.net>2026-05-16 05:17:21 -0500
committerCraig Jennings <c@cjennings.net>2026-05-16 05:17:21 -0500
commit99d93203a867294addf4927ceec5644b9d3bf322 (patch)
treef579382868960c77cd38eb350fba4dfdaab13c21 /gptel-tools
parentceeae9b5e2625e23e6e3792d06a6c8122a36d18b (diff)
downloaddotemacs-99d93203a867294addf4927ceec5644b9d3bf322.tar.gz
dotemacs-99d93203a867294addf4927ceec5644b9d3bf322.zip
feat(gptel-tools): wire web_fetch as a local tool
Fourth ADOPT entry from `docs/design/gptel-tools-shortlist.org'. Lets gptel pull a URL into the conversation so the model can read docs / current API shapes / etc. without me copy-pasting. Shape: - URL must be `http://' or `https://' (file://, ftp://, javascript:, scheme-less, etc. are rejected at the validator). - HTML responses go through `pandoc -f html -t plain' so the model gets a reading shape that isn't full of markup; falls back to `w3m -dump -T text/html' if pandoc isn't on PATH; signals `user-error' if neither is. Pass `raw=t' to skip stripping. - Output capped at 200KB by default, hard cap 1MB; `max_bytes' argument lets the caller pick a lower cap. Truncation reported inline. - 4xx / 5xx response codes signal `error' with the code -- the alternative is returning an error page body, which the model would treat as content. `:confirm t' on the tool because every call is a real outbound network request. The tool's description warns that URLs go wherever the user-agent points, including internal networks if that's what the URL names. `tests/test-gptel-tools-web-fetch.el' -- 20 tests across Normal / Boundary / Error. URL validator covers http / https / non-string / empty / non-http schemes. `--effective-max-bytes' covers default / low-clamp / hard-cap / passthrough. Truncate helper covers under-cap / at-cap / over-cap with the marker. HTML stripper runs against real pandoc / w3m (both installed in dev env, neither should mangle simple markup). Orchestrator stubs `cj/gptel-web-fetch--retrieve' via `cl-letf' to cover normal / raw / 4xx / 5xx / oversize / bad-scheme paths. Wired into `cj/gptel-local-tool-features' so gptel exposes the tool on next restart. Note: `call-process-region' invocation flattened to a single `with-temp-buffer' with DELETE=t -- the initial draft nested a second temp buffer and routed output to the inner one, which got killed before `buffer-string' on the outer ran. Test caught it.
Diffstat (limited to 'gptel-tools')
-rw-r--r--gptel-tools/web_fetch.el150
1 files changed, 150 insertions, 0 deletions
diff --git a/gptel-tools/web_fetch.el b/gptel-tools/web_fetch.el
new file mode 100644
index 00000000..1f950a31
--- /dev/null
+++ b/gptel-tools/web_fetch.el
@@ -0,0 +1,150 @@
+;;; web_fetch.el --- Web fetch tool for gptel -*- coding: utf-8; lexical-binding: t; -*-
+
+;; Author: Craig Jennings <c@cjennings.net>
+;; Keywords: convenience, tools, web
+
+;; This file is not part of GNU Emacs.
+
+;;; Commentary:
+
+;; Gptel tool that fetches an HTTP/HTTPS URL and returns its body.
+;; HTML is piped through `pandoc -f html -t plain' (falling back to
+;; `w3m -dump -T text/html') so the model gets a reading shape that
+;; isn't full of markup; pass RAW=t to skip stripping and get the
+;; verbatim response. Output is capped at 200KB by default (hard cap
+;; 1MB) and the cap is reported inline when triggered.
+;;
+;; This tool is `:confirm t' because it makes outbound network
+;; requests -- the user sees every URL before the fetch happens. The
+;; URL goes wherever the user-agent points it, including internal
+;; networks if the URL names one; consider the network posture before
+;; approving sensitive endpoints.
+
+;;; Code:
+
+(require 'gptel)
+(require 'url)
+
+(defconst cj/gptel-web-fetch--default-max-bytes (* 200 1024)
+ "Default cap on returned body size. ~200KB.")
+
+(defconst cj/gptel-web-fetch--hard-max-bytes (* 1024 1024)
+ "Hard upper bound on the user-controllable byte cap. 1MB.")
+
+(defun cj/gptel-web-fetch--validate-url (url)
+ "Validate URL as an http or https request target. Return URL on success.
+Signals `user-error' for non-string, empty, or non-http/https URLs."
+ (unless (and (stringp url) (not (string-empty-p url)))
+ (user-error "web_fetch: expected non-empty URL string, got %S" url))
+ (unless (string-match-p "\\`https?://[^[:space:]]+\\'" url)
+ (user-error "web_fetch: URL must be http:// or https://, got %S" url))
+ url)
+
+(defun cj/gptel-web-fetch--effective-max-bytes (n)
+ "Return the byte cap to use given caller-supplied N.
+Nil / non-integer / out-of-range → default. Above hard cap → hard cap."
+ (cond
+ ((not (integerp n)) cj/gptel-web-fetch--default-max-bytes)
+ ((< n 1) cj/gptel-web-fetch--default-max-bytes)
+ ((> n cj/gptel-web-fetch--hard-max-bytes) cj/gptel-web-fetch--hard-max-bytes)
+ (t n)))
+
+(defun cj/gptel-web-fetch--retrieve (url)
+ "Synchronously GET URL. Return a cons (STATUS-CODE . BODY).
+Signals on network failure. STATUS-CODE is an integer when parseable
+from the response status line, or nil when the line is unrecognized."
+ (let ((buf (url-retrieve-synchronously url t t 30)))
+ (unless buf
+ (error "web_fetch: no response from %s" url))
+ (unwind-protect
+ (with-current-buffer buf
+ (goto-char (point-min))
+ (let* ((status (when (re-search-forward
+ "^HTTP/[0-9.]+ \\([0-9]+\\)" (point-max) t)
+ (string-to-number (match-string 1))))
+ (body-start (when (re-search-forward "\n\n" nil t)
+ (point))))
+ (cons status
+ (if body-start
+ (buffer-substring-no-properties body-start (point-max))
+ (buffer-substring-no-properties (point-min) (point-max))))))
+ (kill-buffer buf))))
+
+(defun cj/gptel-web-fetch--html-to-text (html)
+ "Strip HTML to plain text. Returns the stripped string.
+Tries `pandoc -f html -t plain' first, falls back to
+`w3m -dump -T text/html'. Signals `user-error' if neither is
+on PATH."
+ (let* ((coding-system-for-write 'utf-8)
+ (coding-system-for-read 'utf-8)
+ (tool (cond
+ ((executable-find "pandoc")
+ (list "pandoc" "-f" "html" "-t" "plain"))
+ ((executable-find "w3m")
+ (list "w3m" "-dump" "-T" "text/html"))
+ (t nil))))
+ (unless tool
+ (user-error
+ "web_fetch: HTML stripping needs pandoc or w3m on PATH; pass raw=t to bypass"))
+ ;; `call-process-region' with DELETE=t and OUTPUT=t replaces the
+ ;; input range with the tool's output, so `buffer-string' returns
+ ;; the stripped text.
+ (with-temp-buffer
+ (insert html)
+ (let ((exit (apply #'call-process-region
+ (point-min) (point-max) (car tool)
+ t t nil (cdr tool))))
+ (if (zerop exit)
+ (buffer-string)
+ (error "web_fetch: %s exited with %d" (car tool) exit))))))
+
+(defun cj/gptel-web-fetch--truncate (text max-bytes)
+ "Truncate TEXT to MAX-BYTES. Returns TEXT unchanged when under the cap."
+ (if (<= (length text) max-bytes)
+ text
+ (concat (substring text 0 max-bytes)
+ (format
+ "\n\n[truncated: response exceeded %d bytes; %d bytes total]"
+ max-bytes (length text)))))
+
+(defun cj/gptel-web-fetch--run (url &optional raw max-bytes)
+ "Fetch URL and return its body.
+When RAW is nil (the default) HTML responses are stripped to plain
+text via pandoc or w3m. MAX-BYTES caps the returned size; nil /
+out-of-range falls back to the default 200KB cap."
+ (let* ((validated (cj/gptel-web-fetch--validate-url url))
+ (cap (cj/gptel-web-fetch--effective-max-bytes max-bytes))
+ (response (cj/gptel-web-fetch--retrieve validated))
+ (status (car response))
+ (body (cdr response)))
+ (when (and status (>= status 400))
+ (error "web_fetch: HTTP %d from %s" status validated))
+ (let ((text (if raw body
+ (cj/gptel-web-fetch--html-to-text body))))
+ (cj/gptel-web-fetch--truncate text cap))))
+
+(with-eval-after-load 'gptel
+ (gptel-make-tool
+ :name "web_fetch"
+ :function (lambda (url &optional raw max_bytes)
+ (cj/gptel-web-fetch--run url raw max_bytes))
+ :description "Fetch an http:// or https:// URL and return its body. HTML responses are stripped to plain text via pandoc (or w3m as a fallback); pass raw=true to skip stripping. Output is capped at 200KB by default (max 1MB); the cap is reported inline when triggered. Network call: the URL goes wherever the user-agent points, including internal networks if specified."
+ :args (list '(:name "url"
+ :type string
+ :description "HTTP or HTTPS URL to fetch. Non-http schemes are rejected.")
+ '(:name "raw"
+ :type boolean
+ :description "When true, return the response body verbatim without HTML stripping. Default false."
+ :optional t)
+ '(:name "max_bytes"
+ :type integer
+ :description "Output size cap in bytes. Defaults to 200000; hard-capped at 1048576."
+ :optional t))
+ :category "web"
+ :confirm t
+ :include t)
+
+ (add-to-list 'gptel-tools (gptel-get-tool '("web" "web_fetch"))))
+
+(provide 'web_fetch)
+;;; web_fetch.el ends here