diff options
Diffstat (limited to 'gptel-tools')
| -rw-r--r-- | gptel-tools/fetch_page.el | 399 | ||||
| -rw-r--r-- | gptel-tools/fetch_page.el.disabled | 475 |
2 files changed, 475 insertions, 399 deletions
diff --git a/gptel-tools/fetch_page.el b/gptel-tools/fetch_page.el deleted file mode 100644 index deaae1f9..00000000 --- a/gptel-tools/fetch_page.el +++ /dev/null @@ -1,399 +0,0 @@ -;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*- - -;; Copyright (C) 2025 cjennings -;; Author: cjennings and AI assistant -;; Keywords: convenience, tools, web -;; Package-Requires: ((emacs "27.1")) -;; SPDX-License-Identifier: GPL-3.0-or-later - -;;; Commentary: -;; -;; fetch_page: In-memory web fetch-and-parse tool for gptel tools. -;; -;; Features -;; - Fetch a URL (http/https), follow redirects. -;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links. -;; - Size/time guards: max_bytes, max_chars, timeout. -;; - User-Agent presets; Accept/Accept-Language overrides. -;; - Predictable, LLM-friendly outputs and explicit error suggestions. -;; -;; Notes -;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML. -;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests -;; using a download tool (e.g., download_url) instead. -;; -;; Installation -;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web"). -;; -;;; Code: - -(require 'url) -(require 'subr-x) -(eval-when-compile (require 'cl-lib)) -(require 'dom) -(require 'shr) - -;; Internal helpers are prefixed gptel-fetch-page-- - -(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024)) -(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024)) -(defconst gptel-fetch-page--default-max-chars 300000) -(defconst gptel-fetch-page--default-timeout 30) - -(defconst gptel-fetch-page--ua-presets - '(("desktop-chrome" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") - ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0") - ("desktop-safari" . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15") - ("edge-windows" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0") - ("mobile-iphone" . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1") - ("mobile-android" . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36") - ("emacs-default" . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") ) - "Preset User-Agent strings.") - -(defun gptel-fetch-page--coerce-bool (v) - (cond - ((eq v :json-false) nil) - ((stringp v) (not (string-blank-p v))) - (t (and v t)))) - -(defun gptel-fetch-page--coerce-int (v &optional default min max) - (let ((n (cond - ((numberp v) v) - ((stringp v) (string-to-number v)) - (t nil)))) - (setq n (or n default)) - (when (and min n (< n min)) (setq n min)) - (when (and max n (> n max)) (setq n max)) - n)) - -(defun gptel-fetch-page--ua (ua) - "Return a User-Agent string from UA preset or use UA directly." - (cond - ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))) - ((stringp ua) - (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua)) - (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))))) - -(defun gptel-fetch-page--ok-scheme-p (url) - (let ((u (ignore-errors (url-generic-parse-url url)))) - (and u (member (url-type u) '("http" "https"))))) - -(defun gptel-fetch-page--parse-headers () - "Parse headers in current buffer up to `url-http-end-of-headers'." - (let (hdrs) - (save-excursion - (goto-char (point-min)) - (while (re-search-forward "^\([^:]+\):[ \t]*\(.*\)$" url-http-end-of-headers t) - (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs))) - (nreverse hdrs))) - -(defun gptel-fetch-page--content-type (headers) - "Return (mime charset) from HEADERS alist." - (let* ((ct (cdr (assoc "content-type" headers))) - (mime (and ct (car (split-string ct ";" t "[ \t]*")))) - (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct) - (downcase (match-string 1 ct)))))) - (list (or mime "unknown") charset))) - -(defun gptel-fetch-page--header-bytes (headers) - (let ((cl (cdr (assoc "content-length" headers)))) - (and cl (ignore-errors (string-to-number cl))))) - -(defun gptel-fetch-page--region-string (beg end) - (save-excursion - (goto-char beg) - (buffer-substring-no-properties beg end))) - -(defun gptel-fetch-page--html-to-dom (html) - (with-temp-buffer - (insert html) - (condition-case err - (if (fboundp 'libxml-parse-html-region) - (libxml-parse-html-region (point-min) (point-max)) - (signal 'error '(libxml-missing))) - (error (signal (car err) (cdr err)))))) - -(defun gptel-fetch-page--dom-to-text (dom) - (with-temp-buffer - ;; Use shr to render - (let ((shr-use-fonts nil) - (shr-width (min 120 (window-body-width)))) - (shr-insert-document dom)) - (buffer-string))) - -(defun gptel-fetch-page--dom-base (dom final-url) - (or (when-let* ((node (car (dom-by-tag dom 'base))) - (href (dom-attr node 'href))) href) - final-url)) - -(defun gptel-fetch-page--resolve (href base) - (condition-case _ - (if (and href (not (string-empty-p href))) - (url-expand-file-name href base) - nil) - (error nil))) - -(defun gptel-fetch-page--collect-links (dom final-url include-titles) - (let* ((base (gptel-fetch-page--dom-base dom final-url)) - (seen (make-hash-table :test 'equal)) - (out '())) - (cl-labels ((push-link (u ttitle) - (when (and u (not (gethash u seen))) - (puthash u t seen) - (push (if include-titles - (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) "")) - u) - out)))) - ;; <a> and <area> - (dolist (tag '(a area)) - (dolist (n (dom-by-tag dom tag)) - (let ((href (dom-attr n 'href))) - (push-link (gptel-fetch-page--resolve href base) - (ignore-errors (string-trim (dom-texts n))))))) - ;; <link rel="stylesheet"> included by request - (dolist (n (dom-by-tag dom 'link)) - (let* ((rel (downcase (or (dom-attr n 'rel) ""))) - (is-style (or (string-match-p "stylesheet" rel) - (string-match-p "\bstyle\b" rel))) - (href (dom-attr n 'href))) - (when is-style - (push-link (gptel-fetch-page--resolve href base) - (or (dom-attr n 'title) rel)))))) - (mapconcat #'identity (nreverse out) "\n"))) - -(defun gptel-fetch-page--truncate (s max-chars) - (if (and max-chars (> (length s) max-chars)) - (concat (substring s 0 max-chars) "\nnote: output truncated at " (number-to-string max-chars) " chars") - s)) - -(defun gptel-fetch-page--format-error (&rest kv) - (let ((summary (plist-get kv :summary)) - (status (or (plist-get kv :status) "unknown")) - (url (or (plist-get kv :url) "unknown")) - (type (or (plist-get kv :type) "unknown")) - (limit (plist-get kv :limit)) - (suggest (plist-get kv :suggest))) - (string-join - (delq nil - (list (format "ERROR: %s" summary) - (format "details: status=%s url=%s type=%s" status url type) - (and limit (format "limit: %s" limit)) - (and suggest (format "suggest: %s" suggest)))) - "\n"))) - -(defun gptel-fetch-page--format-header (status final-url type size) - (format "[%s] final-url: %s | type: %s | size: %s B" - status final-url type (or size "unknown"))) - -(defun gptel-fetch-page--run (callback url format timeout max-bytes max-chars include-titles - user-agent accept accept-language debug) - (let* ((format (or format "text")) - (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil)) - (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes 1 gptel-fetch-page--max-allowed-bytes)) - (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil)) - (include-titles (gptel-fetch-page--coerce-bool include-titles)) - (debug (gptel-fetch-page--coerce-bool debug)) - (ua (gptel-fetch-page--ua user-agent)) - (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")) - (accept-language (or accept-language "en-US,en;q=0.9"))) - (unless (gptel-fetch-page--ok-scheme-p url) - (funcall callback (gptel-fetch-page--format-error - :summary "Only http/https URLs are supported" - :url url - :suggest "use an http/https URL") ) - (cl-return-from gptel-fetch-page--run)) - - (let* ((url-request-method "GET") - (url-request-extra-headers - `(("User-Agent" . ,ua) - ("Accept" . ,accept) - ("Accept-Language" . ,accept-language) - ;; Byte range to hint the server to cap size - ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes))))))) - (done nil) - (timer nil) - (orig-url url)) - (setq timer (run-at-time timeout nil - (lambda () - (unless done - (setq done t) - (funcall callback (gptel-fetch-page--format-error - :summary (format "Request timed out after %ss" timeout) - :url orig-url - :suggest "retry with a larger timeout or probe connectivity")))) ) - (url-retrieve - url - (lambda (status) - (let ((inhibit-read-only t)) - (unwind-protect - (progn - (when (and (not done)) (setq done t) (when timer (cancel-timer timer))) - (let ((bail nil)) - (when (plist-get status :error) - (let ((err (plist-get status :error))) - (funcall callback (gptel-fetch-page--format-error - :summary (error-message-string err) - :url orig-url - :suggest "check URL or try again"))) - (setq bail t)) - ;; Parse status line & headers - (goto-char (point-min)) - (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position))) - (http-status (or (and (boundp 'url-http-response-status) url-http-response-status) - (and (string-match "HTTP/[0-9.]+ +\([0-9]+\)" status-line) - (string-to-number (match-string 1 status-line))))) - (status-msg (or (and (boundp 'url-http-response-status) url-http-response-status) - http-status)) - (_ (search-forward "\n\n" nil t)) - (headers (gptel-fetch-page--parse-headers)) - (final-url (or (and (boundp 'url-current-object) - (ignore-errors (url-recreate-url url-current-object))) orig-url)) - (ctype (gptel-fetch-page--content-type headers)) - (mime (car ctype)) - (_charset (cadr ctype)) - (declared-bytes (gptel-fetch-page--header-bytes headers)) - (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max))) - (actual-bytes (string-bytes body))) - ;; Enforce size limits - (when (and (not bail) declared-bytes max-bytes (> declared-bytes max-bytes)) - (funcall callback (gptel-fetch-page--format-error - :summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes) - :status http-status - :url final-url - :type mime - :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) - :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes))) - (setq bail t)) - (when (and (not bail) max-bytes (> actual-bytes max-bytes)) - (funcall callback (gptel-fetch-page--format-error - :summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes) - :status http-status - :url final-url - :type mime - :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) - :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes))) - (setq bail t)) - - (unless bail - ;; Route based on content-type & requested format - (pcase mime - ((or "text/html" "application/xhtml+xml") - (pcase format - ("html" - (let* ((content (gptel-fetch-page--truncate body max-chars)) - (out (if debug - (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes) - "\n\n" content) - content))) - (funcall callback out))) - ("text" - (if (fboundp 'libxml-parse-html-region) - (condition-case err - (let* ((dom (gptel-fetch-page--html-to-dom body)) - (txt (gptel-fetch-page--truncate (gptel-fetch-page--dom-to-text dom) max-chars)) - (out (if debug - (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes) - "\n\n" txt) - txt))) - (funcall callback out)) - (error - (funcall callback (gptel-fetch-page--format-error - :summary "HTML parsing failed" - :status http-status - :url final-url - :type mime - :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))) - (funcall callback (gptel-fetch-page--format-error - :summary "HTML parsing requires Emacs built with libxml2" - :status http-status - :url final-url - :type mime - :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))) - ("links" - (if (fboundp 'libxml-parse-html-region) - (condition-case err - (let* ((dom (gptel-fetch-page--html-to-dom body)) - (links (gptel-fetch-page--collect-links dom final-url include-titles)) - (content (gptel-fetch-page--truncate links max-chars)) - (out (if debug - (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes) - "\n\n" content) - content))) - (funcall callback out)) - (error - (funcall callback (gptel-fetch-page--format-error - :summary "HTML parsing failed" - :status http-status - :url final-url - :type mime - :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))) - (funcall callback (gptel-fetch-page--format-error - :summary "HTML parsing requires Emacs built with libxml2" - :status http-status - :url final-url - :type mime - :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))))) - ("text/plain" - (let* ((content (gptel-fetch-page--truncate body max-chars)) - (out (if debug - (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes) - "\n\n" content) - content))) - (funcall callback out))) - (_ - (funcall callback (gptel-fetch-page--format-error - :summary (format "Non-text content not supported for parsing (%s)" mime) - :status http-status - :url final-url - :type mime - :suggest "use download_url to save the file locally, then parse with a file tool"))))))) - (kill-buffer (current-buffer))))) - nil t nil)))))))) - -;; Public entry for the gptel tool function. -(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug) - (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)) - -(with-eval-after-load 'gptel - (gptel-make-tool - :name "fetch_page" - :function #'gptel-fetch-page--tool - :description (concat - "Fetch a web page and return rendered text, raw HTML, or links.\n" - "- format=text|html|links (default text)\n" - "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n" - "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n" - "- Non-text types error with explicit suggestion to use a downloader.") - :category "web" - :async t - :include t - :args (list - '(:name "url" :type string - :description "The http/https URL to fetch") - '(:name "format" :type string - :enum ["text" "html" "links"] - :optional t - :description "Output format: text (rendered), html (raw), or links (absolute URLs)") - '(:name "timeout" :type integer :optional t - :description "Timeout in seconds (default 30)") - '(:name "max_bytes" :type integer :optional t - :description "Max response bytes (default 16MiB, up to 32MiB)") - '(:name "max_chars" :type integer :optional t - :description "Max output chars (default 300k)") - '(:name "include_titles" :type boolean :optional t - :description "In links mode, include titles as a tab-separated second column") - '(:name "user_agent" :type string :optional t - :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default") - '(:name "accept" :type string :optional t - :description "Accept header override") - '(:name "accept_language" :type string :optional t - :description "Accept-Language header override (e.g., en-US,en;q=0.9)") - '(:name "debug" :type boolean :optional t - :description "If true, prepend a compact metadata header")))) - -;; Automatically add to gptel-tools on load -(add-to-list 'gptel-tools (gptel-get-tool '("web" "fetch_page"))) - - -(provide 'fetch_page) -;;; fetch_page.el ends here diff --git a/gptel-tools/fetch_page.el.disabled b/gptel-tools/fetch_page.el.disabled new file mode 100644 index 00000000..664f9be8 --- /dev/null +++ b/gptel-tools/fetch_page.el.disabled @@ -0,0 +1,475 @@ +;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*- + +;; Copyright (C) 2025 cjennings +;; Author: cjennings and AI assistant + +;;; Commentary: +;; +;; fetch_page: In-memory web fetch-and-parse tool for gptel tools. +;; +;; Features +;; - Fetch a URL (http/https), follow redirects. +;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links. +;; - Size/time guards: max_bytes, max_chars, timeout. +;; - User-Agent presets; Accept/Accept-Language overrides. +;; - Predictable, LLM-friendly outputs and explicit error suggestions. +;; +;; Notes +;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML. +;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests +;; using a download tool (e.g., download_url) instead. +;; +;; Installation +;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web"). +;; +;;; Code: + +(require 'url) +(require 'url-http) ;; for url-http-end-of-headers, response status, etc. +(require 'subr-x) +(eval-when-compile (require 'cl-lib)) +(require 'dom) +(require 'shr) + +;; Internal helpers are prefixed gptel-fetch-page-- + +(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024)) +(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024)) +(defconst gptel-fetch-page--default-max-chars 300000) +(defconst gptel-fetch-page--default-timeout 30) + +(defconst gptel-fetch-page--ua-presets + '(("desktop-chrome" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0") + ("desktop-safari" . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15") + ("edge-windows" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0") + ("mobile-iphone" . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1") + ("mobile-android" . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36") + ("emacs-default" . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") ) + "Preset User-Agent strings.") + +(defun gptel-fetch-page--coerce-bool (v) + (cond + ((eq v :json-false) nil) + ((stringp v) (not (string-blank-p v))) + (t (and v t)))) + +(defun gptel-fetch-page--coerce-int (v &optional default min max) + (let ((n (cond + ((numberp v) v) + ((stringp v) (string-to-number v)) + (t nil)))) + (setq n (or n default)) + (when (and min n (< n min)) (setq n min)) + (when (and max n (> n max)) (setq n max)) + n)) + +(defun gptel-fetch-page--ua (ua) + "Return a User-Agent string from UA preset or use UA directly." + (cond + ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))) + ((stringp ua) + (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua)) + (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets))))) + +(defun gptel-fetch-page--ok-scheme-p (url) + (let ((u (ignore-errors (url-generic-parse-url url)))) + (and u (member (url-type u) '("http" "https"))))) + +(defun gptel-fetch-page--parse-headers () + "Parse headers in current buffer up to `url-http-end-of-headers'." + (let (hdrs) + (save-excursion + (goto-char (point-min)) + (while (re-search-forward "^\\([^:]+\\):[ \t]*\\(.*\\)$" url-http-end-of-headers t) + (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs))) + (nreverse hdrs))) + +(defun gptel-fetch-page--content-type (headers) + "Return (mime charset) from HEADERS alist." + (let* ((ct (cdr (assoc "content-type" headers))) + (mime (and ct (car (split-string ct ";" t "[ \t]*")))) + (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct) + (downcase (match-string 1 ct)))))) + (list (or mime "unknown") charset))) + +(defun gptel-fetch-page--header-bytes (headers) + "Extract and return the Content-Length from HEADERS as a number. + +HEADERS is an alist representing HTTP headers. This function looks +up the 'Content-Length' header, safely converting its value to a number, +or returning nil if the header is absent or conversion fails." + (let ((cl (cdr (assoc "content-length" headers)))) + (and cl (ignore-errors (string-to-number cl))))) + +(defun gptel-fetch-page--region-string (beg end) + (save-excursion + (goto-char beg) + (buffer-substring-no-properties beg end))) + +(defun gptel-fetch-page--html-to-dom (html) + (with-temp-buffer + (insert html) + (condition-case err + (if (fboundp 'libxml-parse-html-region) + (libxml-parse-html-region (point-min) (point-max)) + (signal 'error '(libxml-missing))) + (error (signal (car err) (cdr err)))))) + +(defun gptel-fetch-page--dom-to-text (dom) + (with-temp-buffer + ;; Use shr to render; keep output lean and deterministic. + (let ((shr-use-fonts nil) + (shr-inhibit-images t) + (shr-width (min 120 (window-body-width)))) + (shr-insert-document dom)) + (buffer-string))) + +(defun gptel-fetch-page--dom-base (dom final-url) + (or (when-let* ((node (car (dom-by-tag dom 'base))) + (href (dom-attr node 'href))) href) + final-url)) + +(defun gptel-fetch-page--resolve (href base) + (condition-case _ + (if (and href (not (string-empty-p href))) + (url-expand-file-name href base) + nil) + (error nil))) + +(defun gptel-fetch-page--collect-links (dom final-url include-titles) + (let* ((base (gptel-fetch-page--dom-base dom final-url)) + (seen (make-hash-table :test 'equal)) + (out '())) + (cl-labels ((push-link (u ttitle) + (when (and u (not (gethash u seen))) + (puthash u t seen) + (push (if include-titles + (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) "")) + u) + out)))) + ;; <a> and <area> + (dolist (tag '(a area)) + (dolist (n (dom-by-tag dom tag)) + (let ((href (dom-attr n 'href))) + (push-link (gptel-fetch-page--resolve href base) + (ignore-errors (string-trim (dom-texts n))))))) + ;; <link rel="stylesheet"> included by request + (dolist (n (dom-by-tag dom 'link)) + (let* ((rel (downcase (or (dom-attr n 'rel) ""))) + (is-style (string-match-p "stylesheet" rel)) + (href (dom-attr n 'href))) + (when is-style + (push-link (gptel-fetch-page--resolve href base) + (or (dom-attr n 'title) rel)))))) + (mapconcat #'identity (nreverse out) "\n"))) + +(defun gptel-fetch-page--truncate (s max-chars) + (if (and max-chars (> (length s) max-chars)) + (concat (substring s 0 max-chars) + "\nnote: output truncated at " (number-to-string max-chars) " chars") + s)) + +(defun gptel-fetch-page--format-error (&rest kv) + (let ((summary (plist-get kv :summary)) + (status (or (plist-get kv :status) "unknown")) + (url (or (plist-get kv :url) "unknown")) + (type (or (plist-get kv :type) "unknown")) + (limit (plist-get kv :limit)) + (suggest (plist-get kv :suggest))) + (string-join + (delq nil + (list (format "ERROR: %s" summary) + (format "details: status=%s url=%s type=%s" status url type) + (and limit (format "limit: %s" limit)) + (and suggest (format "suggest: %s" suggest)))) + "\n"))) + +(defun gptel-fetch-page--format-header (status final-url type size) + (format "[%s] final-url: %s | type: %s | size: %s B" + status final-url type (or size "unknown"))) + +;; ---------------------------- +;; Refactored orchestrator bits +;; ---------------------------- + +(defun gptel-fetch-page--normalize-args (url out-format timeout max-bytes max-chars include-titles + user-agent accept accept-language debug) + "Coerce and validate arguments into a request plist." + (let* ((out-format (or out-format "text")) + (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil)) + (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes + 1 gptel-fetch-page--max-allowed-bytes)) + (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil)) + (include-titles (gptel-fetch-page--coerce-bool include-titles)) + (debug (gptel-fetch-page--coerce-bool debug)) + (ua (gptel-fetch-page--ua user-agent)) + (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")) + (accept-language (or accept-language "en-US,en;q=0.9"))) + (list :url url + :out-format out-format + :timeout timeout + :max-bytes max-bytes + :max-chars max-chars + :include-titles include-titles + :debug debug + :ua ua + :accept accept + :accept-language accept-language))) + +(defun gptel-fetch-page--build-request-headers (req) + "Build url.el headers alist from REQ." + (let ((max-bytes (plist-get req :max-bytes))) + `(("User-Agent" . ,(plist-get req :ua)) + ("Accept" . ,(plist-get req :accept)) + ("Accept-Language" . ,(plist-get req :accept-language)) + ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes)))))))) + +(defun gptel-fetch-page--start-timeout (req callback done-cell) + "Start a timer based on REQ's :timeout that calls CALLBACK with an error. +DONE-CELL is a cons cell used as shared mutable state; its car is set non-nil once done." + (let ((timeout (plist-get req :timeout)) + (orig-url (plist-get req :url))) + (run-at-time timeout nil + (lambda () + (unless (car done-cell) + (setcar done-cell t) + (funcall callback + (gptel-fetch-page--format-error + :summary (format "Request timed out after %ss" timeout) + :url orig-url + :suggest "retry with a larger timeout or probe connectivity"))))))) + +(defun gptel-fetch-page--parse-response-buffer () + "Parse the current url-retrieve buffer into a response plist." + (save-excursion + (goto-char (point-min)) + (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position))) + (http-status + (or (and (boundp 'url-http-response-status) url-http-response-status) + (and (string-match "HTTP/[0-9.]+ +\\([0-9]+\\)" status-line) + (string-to-number (match-string 1 status-line))))) + (headers (gptel-fetch-page--parse-headers)) + (final-url (or (and (boundp 'url-current-object) + (ignore-errors (url-recreate-url url-current-object))) + "unknown")) + (ctype (gptel-fetch-page--content-type headers)) + (mime (car ctype)) + (charset (cadr ctype)) + (declared-bytes (gptel-fetch-page--header-bytes headers)) + (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max))) + (actual-bytes (string-bytes body))) + (list :status http-status + :status-line status-line + :headers headers + :final-url final-url + :mime mime + :charset charset + :declared-bytes declared-bytes + :body body + :actual-bytes actual-bytes)))) + +(defun gptel-fetch-page--enforce-limits (resp req) + "Check response sizes and return either (:ok t RESP) or (:error STRING)." + (let* ((http-status (plist-get resp :status)) + (final-url (plist-get resp :final-url)) + (mime (plist-get resp :mime)) + (declared (plist-get resp :declared-bytes)) + (actual (plist-get resp :actual-bytes)) + (max-bytes (plist-get req :max-bytes)) + (max-chars (plist-get req :max-chars)) + (timeout (plist-get req :timeout))) + (cond + ((and declared max-bytes (> declared max-bytes)) + (list :error (gptel-fetch-page--format-error + :summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes) + :status http-status + :url final-url + :type mime + :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) + :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" + gptel-fetch-page--max-allowed-bytes))))) + ((and max-bytes (> actual max-bytes)) + (list :error (gptel-fetch-page--format-error + :summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes) + :status http-status + :url final-url + :type mime + :limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout) + :suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" + gptel-fetch-page--max-allowed-bytes))))) + (t (list :ok t resp))))) + +(defun gptel-fetch-page--maybe-decode-body (body charset) + "Decode BODY using CHARSET if recognized; otherwise return BODY unchanged." + (let ((cs (and charset (ignore-errors (coding-system-from-name charset))))) + (if (and cs (coding-system-p cs)) + (condition-case _ + (decode-coding-string body cs) + (error body)) + body))) + +(defun gptel-fetch-page--render (resp req) + "Render the response according to request preferences. +Returns either (:ok t STRING) or (:error STRING)." + (let* ((http-status (plist-get resp :status)) + (final-url (plist-get resp :final-url)) + (mime (plist-get resp :mime)) + (charset (plist-get resp :charset)) + (raw-body (plist-get resp :body)) + (body (gptel-fetch-page--maybe-decode-body raw-body charset)) + (out-format (plist-get req :out-format)) + (max-chars (plist-get req :max-chars)) + (debug (plist-get req :debug)) + (include-titles (plist-get req :include-titles)) + (actual-bytes (plist-get resp :actual-bytes))) + (pcase mime + ((or "text/html" "application/xhtml+xml") + (pcase out-format + ("html" + (let* ((content (gptel-fetch-page--truncate body max-chars)) + (out (if debug + (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) + "\n\n" content) + content))) + (list :ok t out))) + ((or "text" "links") + (if (fboundp 'libxml-parse-html-region) + (condition-case _ + (let* ((dom (gptel-fetch-page--html-to-dom body)) + (payload (if (string= out-format "text") + (gptel-fetch-page--dom-to-text dom) + (gptel-fetch-page--collect-links dom final-url include-titles))) + (content (gptel-fetch-page--truncate payload max-chars)) + (out (if debug + (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) + "\n\n" content) + content))) + (list :ok t out)) + (error + (list :error (gptel-fetch-page--format-error + :summary "HTML parsing failed" + :status http-status + :url final-url + :type mime + :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))) + (list :error (gptel-fetch-page--format-error + :summary "HTML parsing requires Emacs built with libxml2" + :status http-status + :url final-url + :type mime + :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))) + (_ + (list :error (gptel-fetch-page--format-error + :summary (format "Invalid format '%s' for HTML content" out-format) + :status http-status + :url final-url + :type mime + :suggest "use format=text, format=html, or format=links"))))) + ("text/plain" + (let* ((content (gptel-fetch-page--truncate body max-chars)) + (out (if debug + (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes) + "\n\n" content) + content))) + (list :ok t out))) + (_ + (list :error (gptel-fetch-page--format-error + :summary (format "Non-text content not supported for parsing (%s)" mime) + :status http-status + :url final-url + :type mime + :suggest "use download_url to save the file locally, then parse with a file tool")))))) + +;; ---------------------------- +;; Orchestrator +;; ---------------------------- + +(defun gptel-fetch-page--run (callback url out-format timeout max-bytes max-chars include-titles + user-agent accept accept-language debug) + "Orchestrate async fetch and rendering for gptel fetch_page." + (let ((req (gptel-fetch-page--normalize-args url out-format timeout max-bytes max-chars + include-titles user-agent accept accept-language debug))) + (unless (gptel-fetch-page--ok-scheme-p (plist-get req :url)) + (funcall callback (gptel-fetch-page--format-error + :summary "Only http/https URLs are supported" + :url (plist-get req :url) + :suggest "use an http/https URL")) + (cl-return-from gptel-fetch-page--run)) + (let* ((url-request-method "GET") + (url-request-extra-headers (gptel-fetch-page--build-request-headers req)) + (done (list nil)) ;; cons cell used as shared mutable flag + (orig-url (plist-get req :url)) + timer) + (setq timer (gptel-fetch-page--start-timeout req callback done)) + (url-retrieve + orig-url + (lambda (status) + (let ((inhibit-read-only t)) + (unwind-protect + (progn + (unless (car done) + (setcar done t) + (when (timerp timer) (cancel-timer timer)) + (if (plist-get status :error) + (let ((err (plist-get status :error))) + (funcall callback (gptel-fetch-page--format-error + :summary (error-message-string err) + :url orig-url + :suggest "check URL or try again"))) + (let* ((resp (gptel-fetch-page--parse-response-buffer)) + (limit-check (gptel-fetch-page--enforce-limits resp req))) + (if (plist-get limit-check :error) + (funcall callback (plist-get limit-check :error)) + (let ((rendered (gptel-fetch-page--render resp req))) + (funcall callback + (or (plist-get rendered :error) + (caddr rendered)))))))) + (kill-buffer (current-buffer))) + (ignore-errors (kill-buffer (current-buffer)))))) + nil t nil)))) + + +;; Public entry for the gptel tool function. +(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug) + (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)) + +(with-eval-after-load 'gptel + (gptel-make-tool + :name "fetch_page" + :function #'gptel-fetch-page--tool + :description (concat + "Fetch a web page and return rendered text, raw HTML, or links.\n" + "- format=text|html|links (default text)\n" + "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n" + "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n" + "- Non-text types error with explicit suggestion to use a downloader.") + :category "web" + :async t + :include t + :args (list + '(:name "url" :type string + :description "The http/https URL to fetch") + '(:name "format" :type string + :enum ["text" "html" "links"] + :optional t + :description "Output format: text (rendered), html (raw), or links (absolute URLs)") + '(:name "timeout" :type integer :optional t + :description "Timeout in seconds (default 30)") + '(:name "max_bytes" :type integer :optional t + :description "Max response bytes (default 16MiB, up to 32MiB)") + '(:name "max_chars" :type integer :optional t + :description "Max output chars (default 300k)") + '(:name "include_titles" :type boolean :optional t + :description "In links mode, include titles as a tab-separated second column") + '(:name "user_agent" :type string :optional t + :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default") + '(:name "accept" :type string :optional t + :description "Accept header override") + '(:name "accept_language" :type string :optional t + :description "Accept-Language header override (e.g., en-US,en;q=0.9)") + '(:name "debug" :type boolean :optional t + :description "If true, prepend a compact metadata header")))) + +(provide 'fetch_page) +;;; fetch_page.el ends here |
