summaryrefslogtreecommitdiff
path: root/gptel-tools
diff options
context:
space:
mode:
Diffstat (limited to 'gptel-tools')
-rw-r--r--gptel-tools/fetch_page.el399
-rw-r--r--gptel-tools/fetch_page.el.disabled475
2 files changed, 475 insertions, 399 deletions
diff --git a/gptel-tools/fetch_page.el b/gptel-tools/fetch_page.el
deleted file mode 100644
index deaae1f9..00000000
--- a/gptel-tools/fetch_page.el
+++ /dev/null
@@ -1,399 +0,0 @@
-;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*-
-
-;; Copyright (C) 2025 cjennings
-;; Author: cjennings and AI assistant
-;; Keywords: convenience, tools, web
-;; Package-Requires: ((emacs "27.1"))
-;; SPDX-License-Identifier: GPL-3.0-or-later
-
-;;; Commentary:
-;;
-;; fetch_page: In-memory web fetch-and-parse tool for gptel tools.
-;;
-;; Features
-;; - Fetch a URL (http/https), follow redirects.
-;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links.
-;; - Size/time guards: max_bytes, max_chars, timeout.
-;; - User-Agent presets; Accept/Accept-Language overrides.
-;; - Predictable, LLM-friendly outputs and explicit error suggestions.
-;;
-;; Notes
-;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML.
-;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests
-;; using a download tool (e.g., download_url) instead.
-;;
-;; Installation
-;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web").
-;;
-;;; Code:
-
-(require 'url)
-(require 'subr-x)
-(eval-when-compile (require 'cl-lib))
-(require 'dom)
-(require 'shr)
-
-;; Internal helpers are prefixed gptel-fetch-page--
-
-(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024))
-(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024))
-(defconst gptel-fetch-page--default-max-chars 300000)
-(defconst gptel-fetch-page--default-timeout 30)
-
-(defconst gptel-fetch-page--ua-presets
- '(("desktop-chrome" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
- ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0")
- ("desktop-safari" . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15")
- ("edge-windows" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0")
- ("mobile-iphone" . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1")
- ("mobile-android" . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36")
- ("emacs-default" . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") )
- "Preset User-Agent strings.")
-
-(defun gptel-fetch-page--coerce-bool (v)
- (cond
- ((eq v :json-false) nil)
- ((stringp v) (not (string-blank-p v)))
- (t (and v t))))
-
-(defun gptel-fetch-page--coerce-int (v &optional default min max)
- (let ((n (cond
- ((numberp v) v)
- ((stringp v) (string-to-number v))
- (t nil))))
- (setq n (or n default))
- (when (and min n (< n min)) (setq n min))
- (when (and max n (> n max)) (setq n max))
- n))
-
-(defun gptel-fetch-page--ua (ua)
- "Return a User-Agent string from UA preset or use UA directly."
- (cond
- ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))
- ((stringp ua)
- (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua))
- (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))))
-
-(defun gptel-fetch-page--ok-scheme-p (url)
- (let ((u (ignore-errors (url-generic-parse-url url))))
- (and u (member (url-type u) '("http" "https")))))
-
-(defun gptel-fetch-page--parse-headers ()
- "Parse headers in current buffer up to `url-http-end-of-headers'."
- (let (hdrs)
- (save-excursion
- (goto-char (point-min))
- (while (re-search-forward "^\([^:]+\):[ \t]*\(.*\)$" url-http-end-of-headers t)
- (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs)))
- (nreverse hdrs)))
-
-(defun gptel-fetch-page--content-type (headers)
- "Return (mime charset) from HEADERS alist."
- (let* ((ct (cdr (assoc "content-type" headers)))
- (mime (and ct (car (split-string ct ";" t "[ \t]*"))))
- (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct)
- (downcase (match-string 1 ct))))))
- (list (or mime "unknown") charset)))
-
-(defun gptel-fetch-page--header-bytes (headers)
- (let ((cl (cdr (assoc "content-length" headers))))
- (and cl (ignore-errors (string-to-number cl)))))
-
-(defun gptel-fetch-page--region-string (beg end)
- (save-excursion
- (goto-char beg)
- (buffer-substring-no-properties beg end)))
-
-(defun gptel-fetch-page--html-to-dom (html)
- (with-temp-buffer
- (insert html)
- (condition-case err
- (if (fboundp 'libxml-parse-html-region)
- (libxml-parse-html-region (point-min) (point-max))
- (signal 'error '(libxml-missing)))
- (error (signal (car err) (cdr err))))))
-
-(defun gptel-fetch-page--dom-to-text (dom)
- (with-temp-buffer
- ;; Use shr to render
- (let ((shr-use-fonts nil)
- (shr-width (min 120 (window-body-width))))
- (shr-insert-document dom))
- (buffer-string)))
-
-(defun gptel-fetch-page--dom-base (dom final-url)
- (or (when-let* ((node (car (dom-by-tag dom 'base)))
- (href (dom-attr node 'href))) href)
- final-url))
-
-(defun gptel-fetch-page--resolve (href base)
- (condition-case _
- (if (and href (not (string-empty-p href)))
- (url-expand-file-name href base)
- nil)
- (error nil)))
-
-(defun gptel-fetch-page--collect-links (dom final-url include-titles)
- (let* ((base (gptel-fetch-page--dom-base dom final-url))
- (seen (make-hash-table :test 'equal))
- (out '()))
- (cl-labels ((push-link (u ttitle)
- (when (and u (not (gethash u seen)))
- (puthash u t seen)
- (push (if include-titles
- (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) ""))
- u)
- out))))
- ;; <a> and <area>
- (dolist (tag '(a area))
- (dolist (n (dom-by-tag dom tag))
- (let ((href (dom-attr n 'href)))
- (push-link (gptel-fetch-page--resolve href base)
- (ignore-errors (string-trim (dom-texts n)))))))
- ;; <link rel="stylesheet"> included by request
- (dolist (n (dom-by-tag dom 'link))
- (let* ((rel (downcase (or (dom-attr n 'rel) "")))
- (is-style (or (string-match-p "stylesheet" rel)
- (string-match-p "\bstyle\b" rel)))
- (href (dom-attr n 'href)))
- (when is-style
- (push-link (gptel-fetch-page--resolve href base)
- (or (dom-attr n 'title) rel))))))
- (mapconcat #'identity (nreverse out) "\n")))
-
-(defun gptel-fetch-page--truncate (s max-chars)
- (if (and max-chars (> (length s) max-chars))
- (concat (substring s 0 max-chars) "\nnote: output truncated at " (number-to-string max-chars) " chars")
- s))
-
-(defun gptel-fetch-page--format-error (&rest kv)
- (let ((summary (plist-get kv :summary))
- (status (or (plist-get kv :status) "unknown"))
- (url (or (plist-get kv :url) "unknown"))
- (type (or (plist-get kv :type) "unknown"))
- (limit (plist-get kv :limit))
- (suggest (plist-get kv :suggest)))
- (string-join
- (delq nil
- (list (format "ERROR: %s" summary)
- (format "details: status=%s url=%s type=%s" status url type)
- (and limit (format "limit: %s" limit))
- (and suggest (format "suggest: %s" suggest))))
- "\n")))
-
-(defun gptel-fetch-page--format-header (status final-url type size)
- (format "[%s] final-url: %s | type: %s | size: %s B"
- status final-url type (or size "unknown")))
-
-(defun gptel-fetch-page--run (callback url format timeout max-bytes max-chars include-titles
- user-agent accept accept-language debug)
- (let* ((format (or format "text"))
- (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil))
- (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes 1 gptel-fetch-page--max-allowed-bytes))
- (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil))
- (include-titles (gptel-fetch-page--coerce-bool include-titles))
- (debug (gptel-fetch-page--coerce-bool debug))
- (ua (gptel-fetch-page--ua user-agent))
- (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"))
- (accept-language (or accept-language "en-US,en;q=0.9")))
- (unless (gptel-fetch-page--ok-scheme-p url)
- (funcall callback (gptel-fetch-page--format-error
- :summary "Only http/https URLs are supported"
- :url url
- :suggest "use an http/https URL") )
- (cl-return-from gptel-fetch-page--run))
-
- (let* ((url-request-method "GET")
- (url-request-extra-headers
- `(("User-Agent" . ,ua)
- ("Accept" . ,accept)
- ("Accept-Language" . ,accept-language)
- ;; Byte range to hint the server to cap size
- ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes)))))))
- (done nil)
- (timer nil)
- (orig-url url))
- (setq timer (run-at-time timeout nil
- (lambda ()
- (unless done
- (setq done t)
- (funcall callback (gptel-fetch-page--format-error
- :summary (format "Request timed out after %ss" timeout)
- :url orig-url
- :suggest "retry with a larger timeout or probe connectivity")))) )
- (url-retrieve
- url
- (lambda (status)
- (let ((inhibit-read-only t))
- (unwind-protect
- (progn
- (when (and (not done)) (setq done t) (when timer (cancel-timer timer)))
- (let ((bail nil))
- (when (plist-get status :error)
- (let ((err (plist-get status :error)))
- (funcall callback (gptel-fetch-page--format-error
- :summary (error-message-string err)
- :url orig-url
- :suggest "check URL or try again")))
- (setq bail t))
- ;; Parse status line & headers
- (goto-char (point-min))
- (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position)))
- (http-status (or (and (boundp 'url-http-response-status) url-http-response-status)
- (and (string-match "HTTP/[0-9.]+ +\([0-9]+\)" status-line)
- (string-to-number (match-string 1 status-line)))))
- (status-msg (or (and (boundp 'url-http-response-status) url-http-response-status)
- http-status))
- (_ (search-forward "\n\n" nil t))
- (headers (gptel-fetch-page--parse-headers))
- (final-url (or (and (boundp 'url-current-object)
- (ignore-errors (url-recreate-url url-current-object))) orig-url))
- (ctype (gptel-fetch-page--content-type headers))
- (mime (car ctype))
- (_charset (cadr ctype))
- (declared-bytes (gptel-fetch-page--header-bytes headers))
- (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max)))
- (actual-bytes (string-bytes body)))
- ;; Enforce size limits
- (when (and (not bail) declared-bytes max-bytes (> declared-bytes max-bytes))
- (funcall callback (gptel-fetch-page--format-error
- :summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes)
- :status http-status
- :url final-url
- :type mime
- :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
- :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes)))
- (setq bail t))
- (when (and (not bail) max-bytes (> actual-bytes max-bytes))
- (funcall callback (gptel-fetch-page--format-error
- :summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes)
- :status http-status
- :url final-url
- :type mime
- :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
- :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes)))
- (setq bail t))
-
- (unless bail
- ;; Route based on content-type & requested format
- (pcase mime
- ((or "text/html" "application/xhtml+xml")
- (pcase format
- ("html"
- (let* ((content (gptel-fetch-page--truncate body max-chars))
- (out (if debug
- (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
- "\n\n" content)
- content)))
- (funcall callback out)))
- ("text"
- (if (fboundp 'libxml-parse-html-region)
- (condition-case err
- (let* ((dom (gptel-fetch-page--html-to-dom body))
- (txt (gptel-fetch-page--truncate (gptel-fetch-page--dom-to-text dom) max-chars))
- (out (if debug
- (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
- "\n\n" txt)
- txt)))
- (funcall callback out))
- (error
- (funcall callback (gptel-fetch-page--format-error
- :summary "HTML parsing failed"
- :status http-status
- :url final-url
- :type mime
- :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))
- (funcall callback (gptel-fetch-page--format-error
- :summary "HTML parsing requires Emacs built with libxml2"
- :status http-status
- :url final-url
- :type mime
- :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
- ("links"
- (if (fboundp 'libxml-parse-html-region)
- (condition-case err
- (let* ((dom (gptel-fetch-page--html-to-dom body))
- (links (gptel-fetch-page--collect-links dom final-url include-titles))
- (content (gptel-fetch-page--truncate links max-chars))
- (out (if debug
- (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
- "\n\n" content)
- content)))
- (funcall callback out))
- (error
- (funcall callback (gptel-fetch-page--format-error
- :summary "HTML parsing failed"
- :status http-status
- :url final-url
- :type mime
- :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))
- (funcall callback (gptel-fetch-page--format-error
- :summary "HTML parsing requires Emacs built with libxml2"
- :status http-status
- :url final-url
- :type mime
- :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))))
- ("text/plain"
- (let* ((content (gptel-fetch-page--truncate body max-chars))
- (out (if debug
- (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
- "\n\n" content)
- content)))
- (funcall callback out)))
- (_
- (funcall callback (gptel-fetch-page--format-error
- :summary (format "Non-text content not supported for parsing (%s)" mime)
- :status http-status
- :url final-url
- :type mime
- :suggest "use download_url to save the file locally, then parse with a file tool")))))))
- (kill-buffer (current-buffer)))))
- nil t nil))))))))
-
-;; Public entry for the gptel tool function.
-(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)
- (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug))
-
-(with-eval-after-load 'gptel
- (gptel-make-tool
- :name "fetch_page"
- :function #'gptel-fetch-page--tool
- :description (concat
- "Fetch a web page and return rendered text, raw HTML, or links.\n"
- "- format=text|html|links (default text)\n"
- "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n"
- "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n"
- "- Non-text types error with explicit suggestion to use a downloader.")
- :category "web"
- :async t
- :include t
- :args (list
- '(:name "url" :type string
- :description "The http/https URL to fetch")
- '(:name "format" :type string
- :enum ["text" "html" "links"]
- :optional t
- :description "Output format: text (rendered), html (raw), or links (absolute URLs)")
- '(:name "timeout" :type integer :optional t
- :description "Timeout in seconds (default 30)")
- '(:name "max_bytes" :type integer :optional t
- :description "Max response bytes (default 16MiB, up to 32MiB)")
- '(:name "max_chars" :type integer :optional t
- :description "Max output chars (default 300k)")
- '(:name "include_titles" :type boolean :optional t
- :description "In links mode, include titles as a tab-separated second column")
- '(:name "user_agent" :type string :optional t
- :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default")
- '(:name "accept" :type string :optional t
- :description "Accept header override")
- '(:name "accept_language" :type string :optional t
- :description "Accept-Language header override (e.g., en-US,en;q=0.9)")
- '(:name "debug" :type boolean :optional t
- :description "If true, prepend a compact metadata header"))))
-
-;; Automatically add to gptel-tools on load
-(add-to-list 'gptel-tools (gptel-get-tool '("web" "fetch_page")))
-
-
-(provide 'fetch_page)
-;;; fetch_page.el ends here
diff --git a/gptel-tools/fetch_page.el.disabled b/gptel-tools/fetch_page.el.disabled
new file mode 100644
index 00000000..664f9be8
--- /dev/null
+++ b/gptel-tools/fetch_page.el.disabled
@@ -0,0 +1,475 @@
+;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2025 cjennings
+;; Author: cjennings and AI assistant
+
+;;; Commentary:
+;;
+;; fetch_page: In-memory web fetch-and-parse tool for gptel tools.
+;;
+;; Features
+;; - Fetch a URL (http/https), follow redirects.
+;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links.
+;; - Size/time guards: max_bytes, max_chars, timeout.
+;; - User-Agent presets; Accept/Accept-Language overrides.
+;; - Predictable, LLM-friendly outputs and explicit error suggestions.
+;;
+;; Notes
+;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML.
+;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests
+;; using a download tool (e.g., download_url) instead.
+;;
+;; Installation
+;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web").
+;;
+;;; Code:
+
+(require 'url)
+(require 'url-http) ;; for url-http-end-of-headers, response status, etc.
+(require 'subr-x)
+(eval-when-compile (require 'cl-lib))
+(require 'dom)
+(require 'shr)
+
+;; Internal helpers are prefixed gptel-fetch-page--
+
+(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024))
+(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024))
+(defconst gptel-fetch-page--default-max-chars 300000)
+(defconst gptel-fetch-page--default-timeout 30)
+
+(defconst gptel-fetch-page--ua-presets
+ '(("desktop-chrome" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+ ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0")
+ ("desktop-safari" . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15")
+ ("edge-windows" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0")
+ ("mobile-iphone" . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1")
+ ("mobile-android" . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36")
+ ("emacs-default" . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") )
+ "Preset User-Agent strings.")
+
+(defun gptel-fetch-page--coerce-bool (v)
+ (cond
+ ((eq v :json-false) nil)
+ ((stringp v) (not (string-blank-p v)))
+ (t (and v t))))
+
+(defun gptel-fetch-page--coerce-int (v &optional default min max)
+ (let ((n (cond
+ ((numberp v) v)
+ ((stringp v) (string-to-number v))
+ (t nil))))
+ (setq n (or n default))
+ (when (and min n (< n min)) (setq n min))
+ (when (and max n (> n max)) (setq n max))
+ n))
+
+(defun gptel-fetch-page--ua (ua)
+ "Return a User-Agent string from UA preset or use UA directly."
+ (cond
+ ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))
+ ((stringp ua)
+ (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua))
+ (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))))
+
+(defun gptel-fetch-page--ok-scheme-p (url)
+ (let ((u (ignore-errors (url-generic-parse-url url))))
+ (and u (member (url-type u) '("http" "https")))))
+
+(defun gptel-fetch-page--parse-headers ()
+ "Parse headers in current buffer up to `url-http-end-of-headers'."
+ (let (hdrs)
+ (save-excursion
+ (goto-char (point-min))
+ (while (re-search-forward "^\\([^:]+\\):[ \t]*\\(.*\\)$" url-http-end-of-headers t)
+ (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs)))
+ (nreverse hdrs)))
+
+(defun gptel-fetch-page--content-type (headers)
+ "Return (mime charset) from HEADERS alist."
+ (let* ((ct (cdr (assoc "content-type" headers)))
+ (mime (and ct (car (split-string ct ";" t "[ \t]*"))))
+ (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct)
+ (downcase (match-string 1 ct))))))
+ (list (or mime "unknown") charset)))
+
+(defun gptel-fetch-page--header-bytes (headers)
+ "Extract and return the Content-Length from HEADERS as a number.
+
+HEADERS is an alist representing HTTP headers. This function looks
+up the 'Content-Length' header, safely converting its value to a number,
+or returning nil if the header is absent or conversion fails."
+ (let ((cl (cdr (assoc "content-length" headers))))
+ (and cl (ignore-errors (string-to-number cl)))))
+
+(defun gptel-fetch-page--region-string (beg end)
+ (save-excursion
+ (goto-char beg)
+ (buffer-substring-no-properties beg end)))
+
+(defun gptel-fetch-page--html-to-dom (html)
+ (with-temp-buffer
+ (insert html)
+ (condition-case err
+ (if (fboundp 'libxml-parse-html-region)
+ (libxml-parse-html-region (point-min) (point-max))
+ (signal 'error '(libxml-missing)))
+ (error (signal (car err) (cdr err))))))
+
+(defun gptel-fetch-page--dom-to-text (dom)
+ (with-temp-buffer
+ ;; Use shr to render; keep output lean and deterministic.
+ (let ((shr-use-fonts nil)
+ (shr-inhibit-images t)
+ (shr-width (min 120 (window-body-width))))
+ (shr-insert-document dom))
+ (buffer-string)))
+
+(defun gptel-fetch-page--dom-base (dom final-url)
+ (or (when-let* ((node (car (dom-by-tag dom 'base)))
+ (href (dom-attr node 'href))) href)
+ final-url))
+
+(defun gptel-fetch-page--resolve (href base)
+ (condition-case _
+ (if (and href (not (string-empty-p href)))
+ (url-expand-file-name href base)
+ nil)
+ (error nil)))
+
+(defun gptel-fetch-page--collect-links (dom final-url include-titles)
+ (let* ((base (gptel-fetch-page--dom-base dom final-url))
+ (seen (make-hash-table :test 'equal))
+ (out '()))
+ (cl-labels ((push-link (u ttitle)
+ (when (and u (not (gethash u seen)))
+ (puthash u t seen)
+ (push (if include-titles
+ (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) ""))
+ u)
+ out))))
+ ;; <a> and <area>
+ (dolist (tag '(a area))
+ (dolist (n (dom-by-tag dom tag))
+ (let ((href (dom-attr n 'href)))
+ (push-link (gptel-fetch-page--resolve href base)
+ (ignore-errors (string-trim (dom-texts n)))))))
+ ;; <link rel="stylesheet"> included by request
+ (dolist (n (dom-by-tag dom 'link))
+ (let* ((rel (downcase (or (dom-attr n 'rel) "")))
+ (is-style (string-match-p "stylesheet" rel))
+ (href (dom-attr n 'href)))
+ (when is-style
+ (push-link (gptel-fetch-page--resolve href base)
+ (or (dom-attr n 'title) rel))))))
+ (mapconcat #'identity (nreverse out) "\n")))
+
+(defun gptel-fetch-page--truncate (s max-chars)
+ (if (and max-chars (> (length s) max-chars))
+ (concat (substring s 0 max-chars)
+ "\nnote: output truncated at " (number-to-string max-chars) " chars")
+ s))
+
+(defun gptel-fetch-page--format-error (&rest kv)
+ (let ((summary (plist-get kv :summary))
+ (status (or (plist-get kv :status) "unknown"))
+ (url (or (plist-get kv :url) "unknown"))
+ (type (or (plist-get kv :type) "unknown"))
+ (limit (plist-get kv :limit))
+ (suggest (plist-get kv :suggest)))
+ (string-join
+ (delq nil
+ (list (format "ERROR: %s" summary)
+ (format "details: status=%s url=%s type=%s" status url type)
+ (and limit (format "limit: %s" limit))
+ (and suggest (format "suggest: %s" suggest))))
+ "\n")))
+
+(defun gptel-fetch-page--format-header (status final-url type size)
+ (format "[%s] final-url: %s | type: %s | size: %s B"
+ status final-url type (or size "unknown")))
+
+;; ----------------------------
+;; Refactored orchestrator bits
+;; ----------------------------
+
+(defun gptel-fetch-page--normalize-args (url out-format timeout max-bytes max-chars include-titles
+ user-agent accept accept-language debug)
+ "Coerce and validate arguments into a request plist."
+ (let* ((out-format (or out-format "text"))
+ (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil))
+ (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes
+ 1 gptel-fetch-page--max-allowed-bytes))
+ (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil))
+ (include-titles (gptel-fetch-page--coerce-bool include-titles))
+ (debug (gptel-fetch-page--coerce-bool debug))
+ (ua (gptel-fetch-page--ua user-agent))
+ (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"))
+ (accept-language (or accept-language "en-US,en;q=0.9")))
+ (list :url url
+ :out-format out-format
+ :timeout timeout
+ :max-bytes max-bytes
+ :max-chars max-chars
+ :include-titles include-titles
+ :debug debug
+ :ua ua
+ :accept accept
+ :accept-language accept-language)))
+
+(defun gptel-fetch-page--build-request-headers (req)
+ "Build url.el headers alist from REQ."
+ (let ((max-bytes (plist-get req :max-bytes)))
+ `(("User-Agent" . ,(plist-get req :ua))
+ ("Accept" . ,(plist-get req :accept))
+ ("Accept-Language" . ,(plist-get req :accept-language))
+ ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes))))))))
+
+(defun gptel-fetch-page--start-timeout (req callback done-cell)
+ "Start a timer based on REQ's :timeout that calls CALLBACK with an error.
+DONE-CELL is a cons cell used as shared mutable state; its car is set non-nil once done."
+ (let ((timeout (plist-get req :timeout))
+ (orig-url (plist-get req :url)))
+ (run-at-time timeout nil
+ (lambda ()
+ (unless (car done-cell)
+ (setcar done-cell t)
+ (funcall callback
+ (gptel-fetch-page--format-error
+ :summary (format "Request timed out after %ss" timeout)
+ :url orig-url
+ :suggest "retry with a larger timeout or probe connectivity")))))))
+
+(defun gptel-fetch-page--parse-response-buffer ()
+ "Parse the current url-retrieve buffer into a response plist."
+ (save-excursion
+ (goto-char (point-min))
+ (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position)))
+ (http-status
+ (or (and (boundp 'url-http-response-status) url-http-response-status)
+ (and (string-match "HTTP/[0-9.]+ +\\([0-9]+\\)" status-line)
+ (string-to-number (match-string 1 status-line)))))
+ (headers (gptel-fetch-page--parse-headers))
+ (final-url (or (and (boundp 'url-current-object)
+ (ignore-errors (url-recreate-url url-current-object)))
+ "unknown"))
+ (ctype (gptel-fetch-page--content-type headers))
+ (mime (car ctype))
+ (charset (cadr ctype))
+ (declared-bytes (gptel-fetch-page--header-bytes headers))
+ (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max)))
+ (actual-bytes (string-bytes body)))
+ (list :status http-status
+ :status-line status-line
+ :headers headers
+ :final-url final-url
+ :mime mime
+ :charset charset
+ :declared-bytes declared-bytes
+ :body body
+ :actual-bytes actual-bytes))))
+
+(defun gptel-fetch-page--enforce-limits (resp req)
+ "Check response sizes and return either (:ok t RESP) or (:error STRING)."
+ (let* ((http-status (plist-get resp :status))
+ (final-url (plist-get resp :final-url))
+ (mime (plist-get resp :mime))
+ (declared (plist-get resp :declared-bytes))
+ (actual (plist-get resp :actual-bytes))
+ (max-bytes (plist-get req :max-bytes))
+ (max-chars (plist-get req :max-chars))
+ (timeout (plist-get req :timeout)))
+ (cond
+ ((and declared max-bytes (> declared max-bytes))
+ (list :error (gptel-fetch-page--format-error
+ :summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes)
+ :status http-status
+ :url final-url
+ :type mime
+ :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
+ :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)"
+ gptel-fetch-page--max-allowed-bytes)))))
+ ((and max-bytes (> actual max-bytes))
+ (list :error (gptel-fetch-page--format-error
+ :summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes)
+ :status http-status
+ :url final-url
+ :type mime
+ :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
+ :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)"
+ gptel-fetch-page--max-allowed-bytes)))))
+ (t (list :ok t resp)))))
+
+(defun gptel-fetch-page--maybe-decode-body (body charset)
+ "Decode BODY using CHARSET if recognized; otherwise return BODY unchanged."
+ (let ((cs (and charset (ignore-errors (coding-system-from-name charset)))))
+ (if (and cs (coding-system-p cs))
+ (condition-case _
+ (decode-coding-string body cs)
+ (error body))
+ body)))
+
+(defun gptel-fetch-page--render (resp req)
+ "Render the response according to request preferences.
+Returns either (:ok t STRING) or (:error STRING)."
+ (let* ((http-status (plist-get resp :status))
+ (final-url (plist-get resp :final-url))
+ (mime (plist-get resp :mime))
+ (charset (plist-get resp :charset))
+ (raw-body (plist-get resp :body))
+ (body (gptel-fetch-page--maybe-decode-body raw-body charset))
+ (out-format (plist-get req :out-format))
+ (max-chars (plist-get req :max-chars))
+ (debug (plist-get req :debug))
+ (include-titles (plist-get req :include-titles))
+ (actual-bytes (plist-get resp :actual-bytes)))
+ (pcase mime
+ ((or "text/html" "application/xhtml+xml")
+ (pcase out-format
+ ("html"
+ (let* ((content (gptel-fetch-page--truncate body max-chars))
+ (out (if debug
+ (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
+ "\n\n" content)
+ content)))
+ (list :ok t out)))
+ ((or "text" "links")
+ (if (fboundp 'libxml-parse-html-region)
+ (condition-case _
+ (let* ((dom (gptel-fetch-page--html-to-dom body))
+ (payload (if (string= out-format "text")
+ (gptel-fetch-page--dom-to-text dom)
+ (gptel-fetch-page--collect-links dom final-url include-titles)))
+ (content (gptel-fetch-page--truncate payload max-chars))
+ (out (if debug
+ (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
+ "\n\n" content)
+ content)))
+ (list :ok t out))
+ (error
+ (list :error (gptel-fetch-page--format-error
+ :summary "HTML parsing failed"
+ :status http-status
+ :url final-url
+ :type mime
+ :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
+ (list :error (gptel-fetch-page--format-error
+ :summary "HTML parsing requires Emacs built with libxml2"
+ :status http-status
+ :url final-url
+ :type mime
+ :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
+ (_
+ (list :error (gptel-fetch-page--format-error
+ :summary (format "Invalid format '%s' for HTML content" out-format)
+ :status http-status
+ :url final-url
+ :type mime
+ :suggest "use format=text, format=html, or format=links")))))
+ ("text/plain"
+ (let* ((content (gptel-fetch-page--truncate body max-chars))
+ (out (if debug
+ (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
+ "\n\n" content)
+ content)))
+ (list :ok t out)))
+ (_
+ (list :error (gptel-fetch-page--format-error
+ :summary (format "Non-text content not supported for parsing (%s)" mime)
+ :status http-status
+ :url final-url
+ :type mime
+ :suggest "use download_url to save the file locally, then parse with a file tool"))))))
+
+;; ----------------------------
+;; Orchestrator
+;; ----------------------------
+
+(defun gptel-fetch-page--run (callback url out-format timeout max-bytes max-chars include-titles
+ user-agent accept accept-language debug)
+ "Orchestrate async fetch and rendering for gptel fetch_page."
+ (let ((req (gptel-fetch-page--normalize-args url out-format timeout max-bytes max-chars
+ include-titles user-agent accept accept-language debug)))
+ (unless (gptel-fetch-page--ok-scheme-p (plist-get req :url))
+ (funcall callback (gptel-fetch-page--format-error
+ :summary "Only http/https URLs are supported"
+ :url (plist-get req :url)
+ :suggest "use an http/https URL"))
+ (cl-return-from gptel-fetch-page--run))
+ (let* ((url-request-method "GET")
+ (url-request-extra-headers (gptel-fetch-page--build-request-headers req))
+ (done (list nil)) ;; cons cell used as shared mutable flag
+ (orig-url (plist-get req :url))
+ timer)
+ (setq timer (gptel-fetch-page--start-timeout req callback done))
+ (url-retrieve
+ orig-url
+ (lambda (status)
+ (let ((inhibit-read-only t))
+ (unwind-protect
+ (progn
+ (unless (car done)
+ (setcar done t)
+ (when (timerp timer) (cancel-timer timer))
+ (if (plist-get status :error)
+ (let ((err (plist-get status :error)))
+ (funcall callback (gptel-fetch-page--format-error
+ :summary (error-message-string err)
+ :url orig-url
+ :suggest "check URL or try again")))
+ (let* ((resp (gptel-fetch-page--parse-response-buffer))
+ (limit-check (gptel-fetch-page--enforce-limits resp req)))
+ (if (plist-get limit-check :error)
+ (funcall callback (plist-get limit-check :error))
+ (let ((rendered (gptel-fetch-page--render resp req)))
+ (funcall callback
+ (or (plist-get rendered :error)
+ (caddr rendered))))))))
+ (kill-buffer (current-buffer)))
+ (ignore-errors (kill-buffer (current-buffer))))))
+ nil t nil))))
+
+
+;; Public entry for the gptel tool function.
+(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)
+ (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug))
+
+(with-eval-after-load 'gptel
+ (gptel-make-tool
+ :name "fetch_page"
+ :function #'gptel-fetch-page--tool
+ :description (concat
+ "Fetch a web page and return rendered text, raw HTML, or links.\n"
+ "- format=text|html|links (default text)\n"
+ "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n"
+ "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n"
+ "- Non-text types error with explicit suggestion to use a downloader.")
+ :category "web"
+ :async t
+ :include t
+ :args (list
+ '(:name "url" :type string
+ :description "The http/https URL to fetch")
+ '(:name "format" :type string
+ :enum ["text" "html" "links"]
+ :optional t
+ :description "Output format: text (rendered), html (raw), or links (absolute URLs)")
+ '(:name "timeout" :type integer :optional t
+ :description "Timeout in seconds (default 30)")
+ '(:name "max_bytes" :type integer :optional t
+ :description "Max response bytes (default 16MiB, up to 32MiB)")
+ '(:name "max_chars" :type integer :optional t
+ :description "Max output chars (default 300k)")
+ '(:name "include_titles" :type boolean :optional t
+ :description "In links mode, include titles as a tab-separated second column")
+ '(:name "user_agent" :type string :optional t
+ :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default")
+ '(:name "accept" :type string :optional t
+ :description "Accept header override")
+ '(:name "accept_language" :type string :optional t
+ :description "Accept-Language header override (e.g., en-US,en;q=0.9)")
+ '(:name "debug" :type boolean :optional t
+ :description "If true, prepend a compact metadata header"))))
+
+(provide 'fetch_page)
+;;; fetch_page.el ends here