summaryrefslogtreecommitdiff
path: root/gptel-tools/fetch_page.el
blob: deaae1f941feccefdc867d460114cd32627287d7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*-

;; Copyright (C) 2025  cjennings
;; Author: cjennings and AI assistant
;; Keywords: convenience, tools, web
;; Package-Requires: ((emacs "27.1"))
;; SPDX-License-Identifier: GPL-3.0-or-later

;;; Commentary:
;;
;; fetch_page: In-memory web fetch-and-parse tool for gptel tools.
;;
;; Features
;; - Fetch a URL (http/https), follow redirects.
;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links.
;; - Size/time guards: max_bytes, max_chars, timeout.
;; - User-Agent presets; Accept/Accept-Language overrides.
;; - Predictable, LLM-friendly outputs and explicit error suggestions.
;;
;; Notes
;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML.
;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests
;;   using a download tool (e.g., download_url) instead.
;;
;; Installation
;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web").
;;
;;; Code:

(require 'url)
(require 'subr-x)
(eval-when-compile (require 'cl-lib))
(require 'dom)
(require 'shr)

;; Internal helpers are prefixed gptel-fetch-page--

(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024))
(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024))
(defconst gptel-fetch-page--default-max-chars 300000)
(defconst gptel-fetch-page--default-timeout 30)

(defconst gptel-fetch-page--ua-presets
  '(("desktop-chrome"  . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    ("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0")
    ("desktop-safari"  . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15")
    ("edge-windows"    . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0")
    ("mobile-iphone"   . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1")
    ("mobile-android"  . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36")
    ("emacs-default"   . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") )
  "Preset User-Agent strings.")

(defun gptel-fetch-page--coerce-bool (v)
  (cond
   ((eq v :json-false) nil)
   ((stringp v) (not (string-blank-p v)))
   (t (and v t))))

(defun gptel-fetch-page--coerce-int (v &optional default min max)
  (let ((n (cond
            ((numberp v) v)
            ((stringp v) (string-to-number v))
            (t nil))))
    (setq n (or n default))
    (when (and min n (< n min)) (setq n min))
    (when (and max n (> n max)) (setq n max))
    n))

(defun gptel-fetch-page--ua (ua)
  "Return a User-Agent string from UA preset or use UA directly."
  (cond
   ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))
   ((stringp ua)
    (or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua))
   (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))))

(defun gptel-fetch-page--ok-scheme-p (url)
  (let ((u (ignore-errors (url-generic-parse-url url))))
    (and u (member (url-type u) '("http" "https")))))

(defun gptel-fetch-page--parse-headers ()
  "Parse headers in current buffer up to `url-http-end-of-headers'."
  (let (hdrs)
    (save-excursion
      (goto-char (point-min))
      (while (re-search-forward "^\([^:]+\):[ \t]*\(.*\)$" url-http-end-of-headers t)
        (push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs)))
    (nreverse hdrs)))

(defun gptel-fetch-page--content-type (headers)
  "Return (mime charset) from HEADERS alist."
  (let* ((ct (cdr (assoc "content-type" headers)))
         (mime (and ct (car (split-string ct ";" t "[ \t]*"))))
         (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct)
                            (downcase (match-string 1 ct))))))
    (list (or mime "unknown") charset)))

(defun gptel-fetch-page--header-bytes (headers)
  (let ((cl (cdr (assoc "content-length" headers))))
    (and cl (ignore-errors (string-to-number cl)))))

(defun gptel-fetch-page--region-string (beg end)
  (save-excursion
    (goto-char beg)
    (buffer-substring-no-properties beg end)))

(defun gptel-fetch-page--html-to-dom (html)
  (with-temp-buffer
    (insert html)
    (condition-case err
        (if (fboundp 'libxml-parse-html-region)
            (libxml-parse-html-region (point-min) (point-max))
          (signal 'error '(libxml-missing)))
      (error (signal (car err) (cdr err))))))

(defun gptel-fetch-page--dom-to-text (dom)
  (with-temp-buffer
    ;; Use shr to render
    (let ((shr-use-fonts nil)
          (shr-width (min 120 (window-body-width))))
      (shr-insert-document dom))
    (buffer-string)))

(defun gptel-fetch-page--dom-base (dom final-url)
  (or (when-let* ((node (car (dom-by-tag dom 'base)))
                  (href (dom-attr node 'href))) href)
      final-url))

(defun gptel-fetch-page--resolve (href base)
  (condition-case _
      (if (and href (not (string-empty-p href)))
          (url-expand-file-name href base)
        nil)
    (error nil)))

(defun gptel-fetch-page--collect-links (dom final-url include-titles)
  (let* ((base (gptel-fetch-page--dom-base dom final-url))
         (seen (make-hash-table :test 'equal))
         (out '()))
    (cl-labels ((push-link (u ttitle)
				  (when (and u (not (gethash u seen)))
					(puthash u t seen)
					(push (if include-titles
							  (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) ""))
							u)
						  out))))
      ;; <a> and <area>
      (dolist (tag '(a area))
        (dolist (n (dom-by-tag dom tag))
          (let ((href (dom-attr n 'href)))
            (push-link (gptel-fetch-page--resolve href base)
                       (ignore-errors (string-trim (dom-texts n)))))))
      ;; <link rel="stylesheet"> included by request
      (dolist (n (dom-by-tag dom 'link))
        (let* ((rel (downcase (or (dom-attr n 'rel) "")))
               (is-style (or (string-match-p "stylesheet" rel)
                             (string-match-p "\bstyle\b" rel)))
               (href (dom-attr n 'href)))
          (when is-style
            (push-link (gptel-fetch-page--resolve href base)
					   (or (dom-attr n 'title) rel))))))
	(mapconcat #'identity (nreverse out) "\n")))

(defun gptel-fetch-page--truncate (s max-chars)
  (if (and max-chars (> (length s) max-chars))
	  (concat (substring s 0 max-chars) "\nnote: output truncated at " (number-to-string max-chars) " chars")
	s))

(defun gptel-fetch-page--format-error (&rest kv)
  (let ((summary (plist-get kv :summary))
		(status  (or (plist-get kv :status) "unknown"))
		(url     (or (plist-get kv :url) "unknown"))
		(type    (or (plist-get kv :type) "unknown"))
		(limit   (plist-get kv :limit))
		(suggest (plist-get kv :suggest)))
	(string-join
	 (delq nil
		   (list (format "ERROR: %s" summary)
				 (format "details: status=%s url=%s type=%s" status url type)
				 (and limit (format "limit: %s" limit))
				 (and suggest (format "suggest: %s" suggest))))
	 "\n")))

(defun gptel-fetch-page--format-header (status final-url type size)
  (format "[%s] final-url: %s | type: %s | size: %s B"
		  status final-url type (or size "unknown")))

(defun gptel-fetch-page--run (callback url format timeout max-bytes max-chars include-titles
									   user-agent accept accept-language debug)
  (let* ((format (or format "text"))
		 (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil))
		 (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes 1 gptel-fetch-page--max-allowed-bytes))
		 (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil))
		 (include-titles (gptel-fetch-page--coerce-bool include-titles))
		 (debug (gptel-fetch-page--coerce-bool debug))
		 (ua (gptel-fetch-page--ua user-agent))
		 (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"))
		 (accept-language (or accept-language "en-US,en;q=0.9")))
	(unless (gptel-fetch-page--ok-scheme-p url)
	  (funcall callback (gptel-fetch-page--format-error
						 :summary "Only http/https URLs are supported"
						 :url url
						 :suggest "use an http/https URL") )
	  (cl-return-from gptel-fetch-page--run))

	(let* ((url-request-method "GET")
		   (url-request-extra-headers
			`(("User-Agent" . ,ua)
			  ("Accept" . ,accept)
			  ("Accept-Language" . ,accept-language)
			  ;; Byte range to hint the server to cap size
			  ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes)))))))
		   (done nil)
		   (timer nil)
		   (orig-url url))
	  (setq timer (run-at-time timeout nil
							   (lambda ()
								 (unless done
								   (setq done t)
								   (funcall callback (gptel-fetch-page--format-error
													  :summary (format "Request timed out after %ss" timeout)
													  :url orig-url
													  :suggest "retry with a larger timeout or probe connectivity")))) )
			(url-retrieve
			 url
			 (lambda (status)
			   (let ((inhibit-read-only t))
				 (unwind-protect
					 (progn
					   (when (and (not done)) (setq done t) (when timer (cancel-timer timer)))
					   (let ((bail nil))
						 (when (plist-get status :error)
						   (let ((err (plist-get status :error)))
							 (funcall callback (gptel-fetch-page--format-error
												:summary (error-message-string err)
												:url orig-url
												:suggest "check URL or try again")))
						   (setq bail t))
						 ;; Parse status line & headers
						 (goto-char (point-min))
						 (let* ((status-line (buffer-substring (line-beginning-position) (line-end-position)))
								(http-status (or (and (boundp 'url-http-response-status) url-http-response-status)
												 (and (string-match "HTTP/[0-9.]+ +\([0-9]+\)" status-line)
													  (string-to-number (match-string 1 status-line)))))
								(status-msg (or (and (boundp 'url-http-response-status) url-http-response-status)
												http-status))
								(_ (search-forward "\n\n" nil t))
								(headers (gptel-fetch-page--parse-headers))
								(final-url (or (and (boundp 'url-current-object)
													(ignore-errors (url-recreate-url url-current-object))) orig-url))
								(ctype (gptel-fetch-page--content-type headers))
								(mime (car ctype))
								(_charset (cadr ctype))
								(declared-bytes (gptel-fetch-page--header-bytes headers))
								(body (gptel-fetch-page--region-string url-http-end-of-headers (point-max)))
								(actual-bytes (string-bytes body)))
						   ;; Enforce size limits
						   (when (and (not bail) declared-bytes max-bytes (> declared-bytes max-bytes))
							 (funcall callback (gptel-fetch-page--format-error
												:summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes)
												:status http-status
												:url final-url
												:type mime
												:limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
												:suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes)))
							 (setq bail t))
						   (when (and (not bail) max-bytes (> actual-bytes max-bytes))
							 (funcall callback (gptel-fetch-page--format-error
												:summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes)
												:status http-status
												:url final-url
												:type mime
												:limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
												:suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)" gptel-fetch-page--max-allowed-bytes)))
							 (setq bail t))

						   (unless bail
							 ;; Route based on content-type & requested format
							 (pcase mime
							   ((or "text/html" "application/xhtml+xml")
								(pcase format
								  ("html"
								   (let* ((content (gptel-fetch-page--truncate body max-chars))
										  (out (if debug
												   (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
														   "\n\n" content)
												 content)))
									 (funcall callback out)))
								  ("text"
								   (if (fboundp 'libxml-parse-html-region)
									   (condition-case err
										   (let* ((dom (gptel-fetch-page--html-to-dom body))
												  (txt (gptel-fetch-page--truncate (gptel-fetch-page--dom-to-text dom) max-chars))
												  (out (if debug
														   (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
																   "\n\n" txt)
														 txt)))
											 (funcall callback out))
										 (error
										  (funcall callback (gptel-fetch-page--format-error
															 :summary "HTML parsing failed"
															 :status http-status
															 :url final-url
															 :type mime
															 :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))
										 (funcall callback (gptel-fetch-page--format-error
															:summary "HTML parsing requires Emacs built with libxml2"
															:status http-status
															:url final-url
															:type mime
															:suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
								   ("links"
									(if (fboundp 'libxml-parse-html-region)
										(condition-case err
											(let* ((dom (gptel-fetch-page--html-to-dom body))
												   (links (gptel-fetch-page--collect-links dom final-url include-titles))
												   (content (gptel-fetch-page--truncate links max-chars))
												   (out (if debug
															(concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
																	"\n\n" content)
														  content)))
											  (funcall callback out))
										  (error
										   (funcall callback (gptel-fetch-page--format-error
															  :summary "HTML parsing failed"
															  :status http-status
															  :url final-url
															  :type mime
															  :suggest "return html format (raw), use download_url, or install Emacs with libxml2")))
										  (funcall callback (gptel-fetch-page--format-error
															 :summary "HTML parsing requires Emacs built with libxml2"
															 :status http-status
															 :url final-url
															 :type mime
															 :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))))
								  ("text/plain"
								   (let* ((content (gptel-fetch-page--truncate body max-chars))
										  (out (if debug
												   (concat (gptel-fetch-page--format-header status-msg final-url mime actual-bytes)
														   "\n\n" content)
												 content)))
									 (funcall callback out)))
								  (_
								   (funcall callback (gptel-fetch-page--format-error
													  :summary (format "Non-text content not supported for parsing (%s)" mime)
													  :status http-status
													  :url final-url
													  :type mime
													  :suggest "use download_url to save the file locally, then parse with a file tool")))))))
						   (kill-buffer (current-buffer)))))
				   nil t nil))))))))

;; Public entry for the gptel tool function.
(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)
  (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug))

(with-eval-after-load 'gptel
  (gptel-make-tool
   :name "fetch_page"
   :function #'gptel-fetch-page--tool
   :description (concat
				 "Fetch a web page and return rendered text, raw HTML, or links.\n"
				 "- format=text|html|links (default text)\n"
				 "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n"
				 "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n"
				 "- Non-text types error with explicit suggestion to use a downloader.")
   :category "web"
   :async t
   :include t
   :args (list
		  '(:name "url" :type string
				  :description "The http/https URL to fetch")
		  '(:name "format" :type string
				  :enum ["text" "html" "links"]
				  :optional t
				  :description "Output format: text (rendered), html (raw), or links (absolute URLs)")
		  '(:name "timeout" :type integer :optional t
				  :description "Timeout in seconds (default 30)")
		  '(:name "max_bytes" :type integer :optional t
				  :description "Max response bytes (default 16MiB, up to 32MiB)")
		  '(:name "max_chars" :type integer :optional t
				  :description "Max output chars (default 300k)")
		  '(:name "include_titles" :type boolean :optional t
				  :description "In links mode, include titles as a tab-separated second column")
		  '(:name "user_agent" :type string :optional t
				  :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default")
		  '(:name "accept" :type string :optional t
				  :description "Accept header override")
		  '(:name "accept_language" :type string :optional t
				  :description "Accept-Language header override (e.g., en-US,en;q=0.9)")
		  '(:name "debug" :type boolean :optional t
				  :description "If true, prepend a compact metadata header"))))

;; Automatically add to gptel-tools on load
(add-to-list 'gptel-tools (gptel-get-tool '("web" "fetch_page")))


(provide 'fetch_page)
;;; fetch_page.el ends here