| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
 | ;;; fetch_page.el --- Web fetch tool for gptel (HTML/text/links) -*- lexical-binding: t; -*-
;; Copyright (C) 2025  cjennings
;; Author: cjennings and AI assistant
;;; Commentary:
;;
;; fetch_page: In-memory web fetch-and-parse tool for gptel tools.
;;
;; Features
;; - Fetch a URL (http/https), follow redirects.
;; - Return one of: rendered text (DOM+shr), raw HTML, or extracted links.
;; - Size/time guards: max_bytes, max_chars, timeout.
;; - User-Agent presets; Accept/Accept-Language overrides.
;; - Predictable, LLM-friendly outputs and explicit error suggestions.
;;
;; Notes
;; - This tool uses url.el (asynchronous url-retrieve) and libxml/shr for HTML.
;; - For non-text content (e.g., PDFs), it returns an explicit error and suggests
;;   using a download tool (e.g., download_url) instead.
;;
;; Installation
;; - Load after gptel and registers a gptel tool named "fetch_page" (category "web").
;;
;;; Code:
(require 'url)
(require 'url-http) ;; for url-http-end-of-headers, response status, etc.
(require 'subr-x)
(eval-when-compile (require 'cl-lib))
(require 'dom)
(require 'shr)
;; Internal helpers are prefixed gptel-fetch-page--
(defconst gptel-fetch-page--default-max-bytes (* 16 1024 1024))
(defconst gptel-fetch-page--max-allowed-bytes (* 32 1024 1024))
(defconst gptel-fetch-page--default-max-chars 300000)
(defconst gptel-fetch-page--default-timeout 30)
(defconst gptel-fetch-page--ua-presets
  '(("desktop-chrome"  . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	("desktop-firefox" . "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0")
	("desktop-safari"  . "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15")
	("edge-windows"    . "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0")
	("mobile-iphone"   . "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1")
	("mobile-android"  . "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36")
	("emacs-default"   . "Emacs gptel/fetch_page (+https://github.com/karthink/gptel)") )
  "Preset User-Agent strings.")
(defun gptel-fetch-page--coerce-bool (v)
  (cond
   ((eq v :json-false) nil)
   ((stringp v) (not (string-blank-p v)))
   (t (and v t))))
(defun gptel-fetch-page--coerce-int (v &optional default min max)
  (let ((n (cond
			((numberp v) v)
			((stringp v) (string-to-number v))
			(t nil))))
	(setq n (or n default))
	(when (and min n (< n min)) (setq n min))
	(when (and max n (> n max)) (setq n max))
	n))
(defun gptel-fetch-page--ua (ua)
  "Return a User-Agent string from UA preset or use UA directly."
  (cond
   ((null ua) (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))
   ((stringp ua)
	(or (cdr (assoc ua gptel-fetch-page--ua-presets)) ua))
   (t (cdr (assoc "desktop-chrome" gptel-fetch-page--ua-presets)))))
(defun gptel-fetch-page--ok-scheme-p (url)
  (let ((u (ignore-errors (url-generic-parse-url url))))
	(and u (member (url-type u) '("http" "https")))))
(defun gptel-fetch-page--parse-headers ()
  "Parse headers in current buffer up to `url-http-end-of-headers'."
  (let (hdrs)
	(save-excursion
	  (goto-char (point-min))
	  (while (re-search-forward "^\\([^:]+\\):[ \t]*\\(.*\\)$" url-http-end-of-headers t)
		(push (cons (downcase (match-string 1)) (string-trim (match-string 2))) hdrs)))
	(nreverse hdrs)))
(defun gptel-fetch-page--content-type (headers)
  "Return (mime charset) from HEADERS alist."
  (let* ((ct (cdr (assoc "content-type" headers)))
		 (mime (and ct (car (split-string ct ";" t "[ \t]*"))))
		 (charset (and ct (when (string-match "charset=\\([^;]+\\)" ct)
							(downcase (match-string 1 ct))))))
	(list (or mime "unknown") charset)))
(defun gptel-fetch-page--header-bytes (headers)
  "Extract and return the Content-Length from HEADERS as a number.
HEADERS is an alist representing HTTP headers. This function looks
up the 'Content-Length' header, safely converting its value to a number,
or returning nil if the header is absent or conversion fails."
  (let ((cl (cdr (assoc "content-length" headers))))
	(and cl (ignore-errors (string-to-number cl)))))
(defun gptel-fetch-page--region-string (beg end)
  (save-excursion
	(goto-char beg)
	(buffer-substring-no-properties beg end)))
(defun gptel-fetch-page--html-to-dom (html)
  (with-temp-buffer
	(insert html)
	(condition-case err
		(if (fboundp 'libxml-parse-html-region)
			(libxml-parse-html-region (point-min) (point-max))
		  (signal 'error '(libxml-missing)))
	  (error (signal (car err) (cdr err))))))
(defun gptel-fetch-page--dom-to-text (dom)
  (with-temp-buffer
	;; Use shr to render; keep output lean and deterministic.
	(let ((shr-use-fonts nil)
		  (shr-inhibit-images t)
		  (shr-width (min 120 (window-body-width))))
	  (shr-insert-document dom))
	(buffer-string)))
(defun gptel-fetch-page--dom-base (dom final-url)
  (or (when-let* ((node (car (dom-by-tag dom 'base)))
				  (href (dom-attr node 'href))) href)
	  final-url))
(defun gptel-fetch-page--resolve (href base)
  (condition-case _
	  (if (and href (not (string-empty-p href)))
		  (url-expand-file-name href base)
		nil)
	(error nil)))
(defun gptel-fetch-page--collect-links (dom final-url include-titles)
  (let* ((base (gptel-fetch-page--dom-base dom final-url))
		 (seen (make-hash-table :test 'equal))
		 (out '()))
	(cl-labels ((push-link (u ttitle)
				  (when (and u (not (gethash u seen)))
					(puthash u t seen)
					(push (if include-titles
							  (format "%s\t%s" u (or (and ttitle (string-trim ttitle)) ""))
							u)
						  out))))
	  ;; <a> and <area>
	  (dolist (tag '(a area))
		(dolist (n (dom-by-tag dom tag))
		  (let ((href (dom-attr n 'href)))
			(push-link (gptel-fetch-page--resolve href base)
					   (ignore-errors (string-trim (dom-texts n)))))))
	  ;; <link rel="stylesheet"> included by request
	  (dolist (n (dom-by-tag dom 'link))
		(let* ((rel (downcase (or (dom-attr n 'rel) "")))
			   (is-style (string-match-p "stylesheet" rel))
			   (href (dom-attr n 'href)))
		  (when is-style
			(push-link (gptel-fetch-page--resolve href base)
					   (or (dom-attr n 'title) rel))))))
	(mapconcat #'identity (nreverse out) "\n")))
(defun gptel-fetch-page--truncate (s max-chars)
  (if (and max-chars (> (length s) max-chars))
	  (concat (substring s 0 max-chars)
			  "\nnote: output truncated at " (number-to-string max-chars) " chars")
	s))
(defun gptel-fetch-page--format-error (&rest kv)
  (let ((summary (plist-get kv :summary))
		(status  (or (plist-get kv :status) "unknown"))
		(url     (or (plist-get kv :url) "unknown"))
		(type    (or (plist-get kv :type) "unknown"))
		(limit   (plist-get kv :limit))
		(suggest (plist-get kv :suggest)))
	(string-join
	 (delq nil
		   (list (format "ERROR: %s" summary)
				 (format "details: status=%s url=%s type=%s" status url type)
				 (and limit (format "limit: %s" limit))
				 (and suggest (format "suggest: %s" suggest))))
	 "\n")))
(defun gptel-fetch-page--format-header (status final-url type size)
  (format "[%s] final-url: %s | type: %s | size: %s B"
		  status final-url type (or size "unknown")))
;; ----------------------------
;; Refactored orchestrator bits
;; ----------------------------
(defun gptel-fetch-page--normalize-args (url out-format timeout max-bytes max-chars include-titles
											 user-agent accept accept-language debug)
  "Coerce and validate arguments into a request plist."
  (let* ((out-format (or out-format "text"))
		 (timeout (gptel-fetch-page--coerce-int timeout gptel-fetch-page--default-timeout 1 nil))
		 (max-bytes (gptel-fetch-page--coerce-int max-bytes gptel-fetch-page--default-max-bytes
												  1 gptel-fetch-page--max-allowed-bytes))
		 (max-chars (gptel-fetch-page--coerce-int max-chars gptel-fetch-page--default-max-chars 1000 nil))
		 (include-titles (gptel-fetch-page--coerce-bool include-titles))
		 (debug (gptel-fetch-page--coerce-bool debug))
		 (ua (gptel-fetch-page--ua user-agent))
		 (accept (or accept "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"))
		 (accept-language (or accept-language "en-US,en;q=0.9")))
	(list :url url
		  :out-format out-format
		  :timeout timeout
		  :max-bytes max-bytes
		  :max-chars max-chars
		  :include-titles include-titles
		  :debug debug
		  :ua ua
		  :accept accept
		  :accept-language accept-language)))
(defun gptel-fetch-page--build-request-headers (req)
  "Build url.el headers alist from REQ."
  (let ((max-bytes (plist-get req :max-bytes)))
	`(("User-Agent" . ,(plist-get req :ua))
	  ("Accept" . ,(plist-get req :accept))
	  ("Accept-Language" . ,(plist-get req :accept-language))
	  ,@(and max-bytes (list (cons "Range" (format "bytes=0-%d" (1- max-bytes))))))))
(defun gptel-fetch-page--start-timeout (req callback done-cell)
  "Start a timer based on REQ's :timeout that calls CALLBACK with an error.
DONE-CELL is a cons cell used as shared mutable state; its car is set non-nil once done."
  (let ((timeout (plist-get req :timeout))
		(orig-url (plist-get req :url)))
	(run-at-time timeout nil
				 (lambda ()
				   (unless (car done-cell)
					 (setcar done-cell t)
					 (funcall callback
							  (gptel-fetch-page--format-error
							   :summary (format "Request timed out after %ss" timeout)
							   :url orig-url
							   :suggest "retry with a larger timeout or probe connectivity")))))))
(defun gptel-fetch-page--parse-response-buffer ()
  "Parse the current url-retrieve buffer into a response plist."
  (save-excursion
	(goto-char (point-min))
	(let* ((status-line (buffer-substring (line-beginning-position) (line-end-position)))
		   (http-status
			(or (and (boundp 'url-http-response-status) url-http-response-status)
				(and (string-match "HTTP/[0-9.]+ +\\([0-9]+\\)" status-line)
					 (string-to-number (match-string 1 status-line)))))
		   (headers (gptel-fetch-page--parse-headers))
		   (final-url (or (and (boundp 'url-current-object)
							   (ignore-errors (url-recreate-url url-current-object)))
						  "unknown"))
		   (ctype (gptel-fetch-page--content-type headers))
		   (mime (car ctype))
		   (charset (cadr ctype))
		   (declared-bytes (gptel-fetch-page--header-bytes headers))
		   (body (gptel-fetch-page--region-string url-http-end-of-headers (point-max)))
		   (actual-bytes (string-bytes body)))
	  (list :status http-status
			:status-line status-line
			:headers headers
			:final-url final-url
			:mime mime
			:charset charset
			:declared-bytes declared-bytes
			:body body
			:actual-bytes actual-bytes))))
(defun gptel-fetch-page--enforce-limits (resp req)
  "Check response sizes and return either (:ok t RESP) or (:error STRING)."
  (let* ((http-status (plist-get resp :status))
		 (final-url (plist-get resp :final-url))
		 (mime (plist-get resp :mime))
		 (declared (plist-get resp :declared-bytes))
		 (actual (plist-get resp :actual-bytes))
		 (max-bytes (plist-get req :max-bytes))
		 (max-chars (plist-get req :max-chars))
		 (timeout (plist-get req :timeout)))
	(cond
	 ((and declared max-bytes (> declared max-bytes))
	  (list :error (gptel-fetch-page--format-error
					:summary (format "Response exceeds max_bytes (%d bytes) for in-memory fetch" max-bytes)
					:status http-status
					:url final-url
					:type mime
					:limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
					:suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)"
									 gptel-fetch-page--max-allowed-bytes)))))
	 ((and max-bytes (> actual max-bytes))
	  (list :error (gptel-fetch-page--format-error
					:summary (format "Response exceeds max_bytes (%d bytes) during retrieval" max-bytes)
					:status http-status
					:url final-url
					:type mime
					:limit (format "max_bytes=%d max_chars=%d timeout=%ss" max-bytes max-chars timeout)
					:suggest (format "call download_url to save the file, or retry fetch_page with a larger max_bytes (up to %d)"
									 gptel-fetch-page--max-allowed-bytes)))))
	 (t (list :ok t resp)))))
(defun gptel-fetch-page--maybe-decode-body (body charset)
  "Decode BODY using CHARSET if recognized; otherwise return BODY unchanged."
  (let ((cs (and charset (ignore-errors (coding-system-from-name charset)))))
	(if (and cs (coding-system-p cs))
		(condition-case _
			(decode-coding-string body cs)
		  (error body))
	  body)))
(defun gptel-fetch-page--render (resp req)
  "Render the response according to request preferences.
Returns either (:ok t STRING) or (:error STRING)."
  (let* ((http-status (plist-get resp :status))
		 (final-url (plist-get resp :final-url))
		 (mime (plist-get resp :mime))
		 (charset (plist-get resp :charset))
		 (raw-body (plist-get resp :body))
		 (body (gptel-fetch-page--maybe-decode-body raw-body charset))
		 (out-format (plist-get req :out-format))
		 (max-chars (plist-get req :max-chars))
		 (debug (plist-get req :debug))
		 (include-titles (plist-get req :include-titles))
		 (actual-bytes (plist-get resp :actual-bytes)))
	(pcase mime
	  ((or "text/html" "application/xhtml+xml")
	   (pcase out-format
		 ("html"
		  (let* ((content (gptel-fetch-page--truncate body max-chars))
				 (out (if debug
						  (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
								  "\n\n" content)
						content)))
			(list :ok t out)))
		 ((or "text" "links")
		  (if (fboundp 'libxml-parse-html-region)
			  (condition-case _
				  (let* ((dom (gptel-fetch-page--html-to-dom body))
						 (payload (if (string= out-format "text")
									  (gptel-fetch-page--dom-to-text dom)
									(gptel-fetch-page--collect-links dom final-url include-titles)))
						 (content (gptel-fetch-page--truncate payload max-chars))
						 (out (if debug
								  (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
										  "\n\n" content)
								content)))
					(list :ok t out))
				(error
				 (list :error (gptel-fetch-page--format-error
							   :summary "HTML parsing failed"
							   :status http-status
							   :url final-url
							   :type mime
							   :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
			(list :error (gptel-fetch-page--format-error
						  :summary "HTML parsing requires Emacs built with libxml2"
						  :status http-status
						  :url final-url
						  :type mime
						  :suggest "return html format (raw), use download_url, or install Emacs with libxml2"))))
		 (_
		  (list :error (gptel-fetch-page--format-error
						:summary (format "Invalid format '%s' for HTML content" out-format)
						:status http-status
						:url final-url
						:type mime
						:suggest "use format=text, format=html, or format=links")))))
	  ("text/plain"
	   (let* ((content (gptel-fetch-page--truncate body max-chars))
			  (out (if debug
					   (concat (gptel-fetch-page--format-header http-status final-url mime actual-bytes)
							   "\n\n" content)
					 content)))
		 (list :ok t out)))
	  (_
	   (list :error (gptel-fetch-page--format-error
					 :summary (format "Non-text content not supported for parsing (%s)" mime)
					 :status http-status
					 :url final-url
					 :type mime
					 :suggest "use download_url to save the file locally, then parse with a file tool"))))))
;; ----------------------------
;; Orchestrator
;; ----------------------------
(defun gptel-fetch-page--run (callback url out-format timeout max-bytes max-chars include-titles
									   user-agent accept accept-language debug)
  "Orchestrate async fetch and rendering for gptel fetch_page."
  (let ((req (gptel-fetch-page--normalize-args url out-format timeout max-bytes max-chars
											   include-titles user-agent accept accept-language debug)))
	(unless (gptel-fetch-page--ok-scheme-p (plist-get req :url))
	  (funcall callback (gptel-fetch-page--format-error
						 :summary "Only http/https URLs are supported"
						 :url (plist-get req :url)
						 :suggest "use an http/https URL"))
	  (cl-return-from gptel-fetch-page--run))
	(let* ((url-request-method "GET")
		   (url-request-extra-headers (gptel-fetch-page--build-request-headers req))
		   (done (list nil)) ;; cons cell used as shared mutable flag
		   (orig-url (plist-get req :url))
		   timer)
	  (setq timer (gptel-fetch-page--start-timeout req callback done))
	  (url-retrieve
	   orig-url
	   (lambda (status)
		 (let ((inhibit-read-only t))
		   (unwind-protect
			   (progn
				 (unless (car done)
				   (setcar done t)
				   (when (timerp timer) (cancel-timer timer))
				   (if (plist-get status :error)
					   (let ((err (plist-get status :error)))
						 (funcall callback (gptel-fetch-page--format-error
											:summary (error-message-string err)
											:url orig-url
											:suggest "check URL or try again")))
					 (let* ((resp (gptel-fetch-page--parse-response-buffer))
							(limit-check (gptel-fetch-page--enforce-limits resp req)))
					   (if (plist-get limit-check :error)
						   (funcall callback (plist-get limit-check :error))
						 (let ((rendered (gptel-fetch-page--render resp req)))
						   (funcall callback
									(or (plist-get rendered :error)
										(caddr rendered))))))))
				 (kill-buffer (current-buffer)))
			 (ignore-errors (kill-buffer (current-buffer))))))
	   nil t nil))))
;; Public entry for the gptel tool function.
(defun gptel-fetch-page--tool (callback url &optional format timeout max_bytes max_chars include_titles user_agent accept accept_language debug)
  (gptel-fetch-page--run callback url format timeout max_bytes max_chars include_titles user_agent accept accept_language debug))
(with-eval-after-load 'gptel
  (gptel-make-tool
   :name "fetch_page"
   :function #'gptel-fetch-page--tool
   :description (concat
				 "Fetch a web page and return rendered text, raw HTML, or links.\n"
				 "- format=text|html|links (default text)\n"
				 "- Guards: timeout (30s), max_bytes (default 16MiB, up to 32MiB), max_chars (300k)\n"
				 "- Includes stylesheets in links mode; relative URLs resolved with <base>/final URL\n"
				 "- Non-text types error with explicit suggestion to use a downloader.")
   :category "web"
   :async t
   :include t
   :args (list
		  '(:name "url" :type string
				  :description "The http/https URL to fetch")
		  '(:name "format" :type string
				  :enum ["text" "html" "links"]
				  :optional t
				  :description "Output format: text (rendered), html (raw), or links (absolute URLs)")
		  '(:name "timeout" :type integer :optional t
				  :description "Timeout in seconds (default 30)")
		  '(:name "max_bytes" :type integer :optional t
				  :description "Max response bytes (default 16MiB, up to 32MiB)")
		  '(:name "max_chars" :type integer :optional t
				  :description "Max output chars (default 300k)")
		  '(:name "include_titles" :type boolean :optional t
				  :description "In links mode, include titles as a tab-separated second column")
		  '(:name "user_agent" :type string :optional t
				  :description "User-Agent preset or full string. Presets: desktop-chrome, desktop-firefox, desktop-safari, edge-windows, mobile-iphone, mobile-android, emacs-default")
		  '(:name "accept" :type string :optional t
				  :description "Accept header override")
		  '(:name "accept_language" :type string :optional t
				  :description "Accept-Language header override (e.g., en-US,en;q=0.9)")
		  '(:name "debug" :type boolean :optional t
				  :description "If true, prepend a compact metadata header"))))
(provide 'fetch_page)
;;; fetch_page.el ends here
 |