;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*- ;; Copyright (C) 2025 cjennings ;; Author: cjennings and AI assistant ;;; Commentary: ;; ;; fetch_page: In-memory web fetch-and-parse tool for gptel tools. ;; ;; Features ;; - Fetch a URL (http/https), follow redirects. ;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links. ;; - Size/time guards: max_bytes, max_chars, timeout. ;; - User-Agent presets; Accept/Accept-Language overrides. ;; - Predictable, LLM-friendly outputs and explicit error suggestions. ;; ;; Notes ;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML. ;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests ;; using a download tool (e.g., download_url) instead. ;; ;; Installation ;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web"). ;; ;;; Code: (require 'url) (require 'url-http) ;; for url-http-end-of-headers, response status, etc. (require 'subr-x) (eval-when-compile (require 'cl-lib)) (require 'dom) (require 'shr) ;; Internal helpers are prefixed gptel-fetch-page-- (defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024)) (defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024)) (defconst gptel-fetch-page--default-max-chars 300000) (defconst gptel-fetch-page--default-timeout 30) (defconst gptel-fetch-page--ua-presets '(("desktop-chrome" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0") ("desktop-safari" . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15") ("edge-windows" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0") ("mobile-iphone" . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1") ("mobile-android" . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36") ("emacs-default" . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") ) "Preset User-Agent strings.") (defun gptel-fetch-page--coerce-bool (v) (cond ((eq v :json-false) nil) ((stringp v) (not (string-blank-p v))) (t (and v t)))) (defun gptel-fetch-page--coerce-int (v &optional default min max) (let ((n (cond ((numberp v) v) ((stringp v) (string-to-number v)) (t nil)))) (setq n (or n default)) (when (and min n (< n min)) (setq n min)) (when (and max n (> n max)) (setq n max)) n)) (defun gptel-fetch-page--ua (ua) "Return a User-Agent string from UA preset or use UA directly." (cond ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))) ((stringp ua) (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua)) (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))))) (defun gptel-fetch-page--ok-scheme-p (url) (let ((u (ignore-errors (url-generic-parse-url url)))) (and u (member (url-type u) '("http" "https"))))) (defun gptel-fetch-page--parse-headers () "Parse headers in current buffer up to `url-http-end-of-headers'." (let (hdrs) (save-excursion (goto-char (point-min)) (while (re-search-forward "^\\([^:]+\\):[ \t]*\\(.*\\)$" url-http-end-of-headers t) (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs))) (nreverse hdrs))) (defun gptel-fetch-page--content-type (headers) "Return (mime charset) from HEADERS alist." (let* ((ct (cdr (assoc "content-type" headers))) (mime (and ct (car (split-string ct ";" t "[ \t]*")))) (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct) (downcase (match-string 1 ct)))))) (list (or mime "unknown") charset))) (defun gptel-fetch-page--header-bytes (headers) "Extract and return the Content-Length from HEADERS as a number. HEADERS is an alist representing HTTP headers. This function looks up the 'Content-Length' header, safely converting its value to a number, or returning nil if the header is absent or conversion fails." (let ((cl (cdr (assoc "content-length" headers)))) (and cl (ignore-errors (string-to-number cl))))) (defun gptel-fetch-page--region-string (beg end) (save-excursion (goto-char beg) (buffer-substring-no-properties beg end))) (defun gptel-fetch-page--html-to-dom (html) (with-temp-buffer (insert html) (condition-case err (if (fboundp 'libxml-parse-html-region) (libxml-parse-html-region (point-min) (point-max)) (signal 'error '(libxml-missing))) (error (signal (car err) (cdr err)))))) (defun gptel-fetch-page--dom-to-text (dom) (with-temp-buffer ;; Use shr to render; keep output lean and deterministic. (let ((shr-use-fonts nil) (shr-inhibit-images t) (shr-width (min 120 (window-body-width)))) (shr-insert-document dom)) (buffer-string))) (defun gptel-fetch-page--dom-base (dom final-url) (or (when-let* ((node (car (dom-by-tag dom 'base))) (href (dom-attr node 'href))) href) final-url)) (defun gptel-fetch-page--resolve (href base) (condition-case _ (if (and href (not (string-empty-p href))) (url-expand-file-name href base) nil) (error nil))) (defun gptel-fetch-page--collect-links (dom final-url include-titles) (let* ((base (gptel-fetch-page--dom-base dom final-url)) (seen (make-hash-table :test 'equal)) (out '())) (cl-labels ((push-link (u ttitle) (when (and u (not (gethash u seen))) (puthash u t seen) (push (if include-titles (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) "")) u) out)))) ;; and (dolist (tag '(a area)) (dolist (n (dom-by-tag dom tag)) (let ((href (dom-attr n 'href))) (push-link (gptel-fetch-page--resolve href base) (ignore-errors (string-trim (dom-texts n))))))) ;; included by request (dolist (n (dom-by-tag dom 'link)) (let* ((rel (downcase (or (dom-attr n 'rel) ""))) (is-style (string-match-p "stylesheet" rel)) (href (dom-attr n 'href))) (when is-style (push-link (gptel-fetch-page--resolve href base) (or (dom-attr n 'title) rel)))))) (mapconcat #'identity (nreverse out) "\n"))) (defun gptel-fetch-page--truncate (s max-chars) (if (and max-chars (> (length s) max-chars)) (concat (substring s 0 max-chars) "\nnote: output truncated at " (number-to-string max-chars) " chars") s)) (defun gptel-fetch-page--format-error (&rest kv) (let ((summary (plist-get kv :summary)) (status (or (plist-get kv :status) "unknown")) (url (or (plist-get kv :url) "unknown")) (type (or (plist-get kv :type) "unknown")) (limit (plist-get kv :limit)) (suggest (plist-get kv :suggest))) (string-join (delq nil (list (format "ERROR: %s" summary) (format "details: status=%s url=%s type=%s" status url type) (and limit (format "limit: %s" limit)) (and suggest (format "suggest: %s" suggest)))) "\n"))) (defun gptel-fetch-page--format-header (status final-url type size) (format "[%s] final-url: %s | type: %s | size: %s B" status final-url type (or size "unknown"))) ;; ---------------------------- ;; Refactored orchestrator bits ;; ---------------------------- (defun gptel-fetch-page--normalize-args (url out-format timeout max-bytes max-chars include-titles user-agent accept accept-language debug) "Coerce and validate arguments into a request plist." (let* ((out-format (or out-format "text")) (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil)) (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes 1 gptel-fetch-page--max-allowed-bytes)) (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil)) (include-titles (gptel-fetch-page--coerce-bool include-titles)) (debug (gptel-fetch-page--coerce-bool debug)) (ua (gptel-fetch-page--ua user-agent)) (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")) (accept-language (or accept-language "en-US,en;q=0.9"))) (list :url url :out-format out-format :timeout timeout :max-bytes max-bytes :max-chars max-chars :include-titles include-titles :debug debug :ua ua :accept accept :accept-language accept-language))) (defun gptel-fetch-page--build-request-headers (req) "Build url.el headers alist from REQ." (let ((max-bytes (plist-get req :max-bytes))) `(("User-Agent" . ,(plist-get req :ua)) ("Accept" . ,(plist-get req :accept)) ("Accept-Language" . ,(plist-get req :accept-language)) ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes)))))))) (defun gptel-fetch-page--start-timeout (req callback done-cell) "Start a timer based on REQ's :timeout that calls CALLBACK with an error. DONE-CELL is a cons cell used as shared mutable state; its car is set non-nil once done." (let ((timeout (plist-get req :timeout)) (orig-url (plist-get req :url))) (run-at-time timeout nil (lambda () (unless (car done-cell) (setcar done-cell t) (funcall callback (gptel-fetch-page--format-error :summary (format "Request timed out after %ss" timeout) :url orig-url :suggest "retry with a larger timeout or probe connectivity"))))))) (defun gptel-fetch-page--parse-response-buffer () "Parse the current url-retrieve buffer into a response plist." (save-excursion (goto-char (point-min)) (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position))) (http-status (or (and (boundp 'url-http-response-status) url-http-response-status) (and (string-match "HTTP/[0-9.]+ +\\([0-9]+\\)" status-line) (string-to-number (match-string 1 status-line))))) (headers (gptel-fetch-page--parse-headers)) (final-url (or (and (boundp 'url-current-object) (ignore-errors (url-recreate-url url-current-object))) "unknown")) (ctype (gptel-fetch-page--content-type headers)) (mime (car ctype)) (charset (cadr ctype)) (declared-bytes (gptel-fetch-page--header-bytes headers)) (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max))) (actual-bytes (string-bytes body))) (list :status http-status :status-line status-line :headers headers :final-url final-url :mime mime :charset charset :declared-bytes declared-bytes :body body :actual-bytes actual-bytes)))) (defun gptel-fetch-page--enforce-limits (resp req) "Check response sizes and return either (:ok t RESP) or (:error STRING)." (let* ((http-status (plist-get resp :status)) (final-url (plist-get resp :final-url)) (mime (plist-get resp :mime)) (declared (plist-get resp :declared-bytes)) (actual (plist-get resp :actual-bytes)) (max-bytes (plist-get req :max-bytes)) (max-chars (plist-get req :max-chars)) (timeout (plist-get req :timeout))) (cond ((and declared max-bytes (> declared max-bytes)) (list :error (gptel-fetch-page--format-error :summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes) :status http-status :url final-url :type mime :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes))))) ((and max-bytes (> actual max-bytes)) (list :error (gptel-fetch-page--format-error :summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes) :status http-status :url final-url :type mime :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes))))) (t (list :ok t resp))))) (defun gptel-fetch-page--maybe-decode-body (body charset) "Decode BODY using CHARSET if recognized; otherwise return BODY unchanged." (let ((cs (and charset (ignore-errors (coding-system-from-name charset))))) (if (and cs (coding-system-p cs)) (condition-case _ (decode-coding-string body cs) (error body)) body))) (defun gptel-fetch-page--render (resp req) "Render the response according to request preferences. Returns either (:ok t STRING) or (:error STRING)." (let* ((http-status (plist-get resp :status)) (final-url (plist-get resp :final-url)) (mime (plist-get resp :mime)) (charset (plist-get resp :charset)) (raw-body (plist-get resp :body)) (body (gptel-fetch-page--maybe-decode-body raw-body charset)) (out-format (plist-get req :out-format)) (max-chars (plist-get req :max-chars)) (debug (plist-get req :debug)) (include-titles (plist-get req :include-titles)) (actual-bytes (plist-get resp :actual-bytes))) (pcase mime ((or "text/html" "application/xhtml+xml") (pcase out-format ("html" (let* ((content (gptel-fetch-page--truncate body max-chars)) (out (if debug (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) "\n\n" content) content))) (list :ok t out))) ((or "text" "links") (if (fboundp 'libxml-parse-html-region) (condition-case _ (let* ((dom (gptel-fetch-page--html-to-dom body)) (payload (if (string= out-format "text") (gptel-fetch-page--dom-to-text dom) (gptel-fetch-page--collect-links dom final-url include-titles))) (content (gptel-fetch-page--truncate payload max-chars)) (out (if debug (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) "\n\n" content) content))) (list :ok t out)) (error (list :error (gptel-fetch-page--format-error :summary "HTML parsing failed" :status http-status :url final-url :type mime :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))) (list :error (gptel-fetch-page--format-error :summary "HTML parsing requires Emacs built with libxml2" :status http-status :url final-url :type mime :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))) (_ (list :error (gptel-fetch-page--format-error :summary (format "Invalid format '%s' for HTML content" out-format) :status http-status :url final-url :type mime :suggest "use format=text, format=html, or format=links"))))) ("text/plain" (let* ((content (gptel-fetch-page--truncate body max-chars)) (out (if debug (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) "\n\n" content) content))) (list :ok t out))) (_ (list :error (gptel-fetch-page--format-error :summary (format "Non-text content not supported for parsing (%s)" mime) :status http-status :url final-url :type mime :suggest "use download_url to save the file locally, then parse with a file tool")))))) ;; ---------------------------- ;; Orchestrator ;; ---------------------------- (defun gptel-fetch-page--run (callback url out-format timeout max-bytes max-chars include-titles user-agent accept accept-language debug) "Orchestrate async fetch and rendering for gptel fetch_page." (let ((req (gptel-fetch-page--normalize-args url out-format timeout max-bytes max-chars include-titles user-agent accept accept-language debug))) (unless (gptel-fetch-page--ok-scheme-p (plist-get req :url)) (funcall callback (gptel-fetch-page--format-error :summary "Only http/https URLs are supported" :url (plist-get req :url) :suggest "use an http/https URL")) (cl-return-from gptel-fetch-page--run)) (let* ((url-request-method "GET") (url-request-extra-headers (gptel-fetch-page--build-request-headers req)) (done (list nil)) ;; cons cell used as shared mutable flag (orig-url (plist-get req :url)) timer) (setq timer (gptel-fetch-page--start-timeout req callback done)) (url-retrieve orig-url (lambda (status) (let ((inhibit-read-only t)) (unwind-protect (progn (unless (car done) (setcar done t) (when (timerp timer) (cancel-timer timer)) (if (plist-get status :error) (let ((err (plist-get status :error))) (funcall callback (gptel-fetch-page--format-error :summary (error-message-string err) :url orig-url :suggest "check URL or try again"))) (let* ((resp (gptel-fetch-page--parse-response-buffer)) (limit-check (gptel-fetch-page--enforce-limits resp req))) (if (plist-get limit-check :error) (funcall callback (plist-get limit-check :error)) (let ((rendered (gptel-fetch-page--render resp req))) (funcall callback (or (plist-get rendered :error) (caddr rendered)))))))) (kill-buffer (current-buffer))) (ignore-errors (kill-buffer (current-buffer)))))) nil t nil)))) ;; Public entry for the gptel tool function. (defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug) (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)) (with-eval-after-load 'gptel (gptel-make-tool :name "fetch_page" :function #'gptel-fetch-page--tool :description (concat "Fetch a web page and return rendered text, raw HTML, or links.\n" "- format=text|html|links (default text)\n" "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n" "- Includes stylesheets in links mode; relative URLs resolved with /final URL\n" "- Non-text types error with explicit suggestion to use a downloader.") :category "web" :async t :include t :args (list '(:name "url" :type string :description "The http/https URL to fetch") '(:name "format" :type string :enum ["text" "html" "links"] :optional t :description "Output format: text (rendered), html (raw), or links (absolute URLs)") '(:name "timeout" :type integer :optional t :description "Timeout in seconds (default 30)") '(:name "max_bytes" :type integer :optional t :description "Max response bytes (default 16MiB, up to 32MiB)") '(:name "max_chars" :type integer :optional t :description "Max output chars (default 300k)") '(:name "include_titles" :type boolean :optional t :description "In links mode, include titles as a tab-separated second column") '(:name "user_agent" :type string :optional t :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default") '(:name "accept" :type string :optional t :description "Accept header override") '(:name "accept_language" :type string :optional t :description "Accept-Language header override (e.g., en-US,en;q=0.9)") '(:name "debug" :type boolean :optional t :description "If true, prepend a compact metadata header")))) (provide 'fetch_page) ;;; fetch_page.el ends here