lychee_lib/types/base_info.rs
//! Parses and resolves [`RawUri`] into fully-qualified [`Uri`] by
//! applying base URL and root dir mappings.
3
4use reqwest::Url;
5use serde::Deserialize;
6use std::borrow::Cow;
7use std::path::{Path, PathBuf};
8
9use crate::ErrorKind;
10use crate::Uri;
11use crate::utils;
12use crate::utils::url::{ReqwestUrlExt, is_root_relative_link};
13use url::PathSegmentsMut;
14
15/// Information used for resolving relative URLs within a particular
16/// input source. There should be a 1:1 correspondence between each
17/// `BaseInfo` and its originating `InputSource`. The main entry
18/// point for constructing is [`BaseInfo::from_source_url`].
19///
20/// Once constructed, [`BaseInfo::parse_url_text`] can be used to
21/// parse and resolve a (possibly relative) URL obtained from within
22/// the associated `InputSource`.
23///
24/// A `BaseInfo` may be built from input sources which cannot resolve
25/// relative links---for instance, stdin. It may also be built from input
26/// sources which can resolve *locally*-relative links, but not *root*-relative
27/// links.
28#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Default)]
29#[serde(try_from = "String")]
30pub enum BaseInfo {
31 /// No base information is available. This is for sources with no base
32 /// information, such as [`ResolvedInputSource::Stdin`], and for URLs which
33 /// *cannot be a base*, such as `data:` and `tel:`. [`BaseInfo::None`]
34 /// can resolve no relative links; only fully-qualified links will be
35 /// parsed successfully.
36 #[default]
37 None,
38
39 /// A base which cannot resolve root-relative links. This is for
40 /// `file:` URLs where the root directory is not known. As such, you can
41 /// traverse relative to the current URL (by traversing the filesystem),
42 /// but you cannot jump to the "root".
43 NoRoot(Url),
44
45 /// A full base made up of `origin` and `path`. This can resolve
46 /// all kinds of relative links.
47 ///
48 /// All non-`file:` URLs which *can be a base* fall into this case. For these,
49 /// `origin` and `path` are obtained by dividing the source URL into its
50 /// origin and path. When joined, `${origin}/${path}` should be equivalent
51 /// to the source's original URL.
52 ///
53 /// This also represents `file:` URLs with a known root. The `origin` field
54 /// records the `file:` URL which will be used to resolve root-relative links.
55 /// The `path` field is the subpath to a particular input source within the
56 /// root. This is retained to resolve locally-relative links.
57 Full(Url, String),
58}
59
60impl BaseInfo {
61 /// Constructs [`BaseInfo::None`].
62 #[must_use]
63 pub const fn none() -> Self {
64 Self::None
65 }
66
67 /// Constructs [`BaseInfo::Full`] with the given fields.
68 #[must_use]
69 pub const fn full(origin: Url, path: String) -> Self {
70 Self::Full(origin, path)
71 }
72
73 /// Constructs a [`BaseInfo`], with the variant being determined by the given URL.
74 ///
75 /// - A [`Url::cannot_be_a_base`] URL will yield [`BaseInfo::None`].
76 /// - A `file:` URL will yield [`BaseInfo::NoRoot`].
77 /// - For other URLs, a [`BaseInfo::Full`] will be constructed from the URL's
78 /// origin and path.
79 ///
80 /// Compared to [`BaseInfo::from_base_url`], this function is more lenient in
81 /// what it accepts because this function should return *a* result for all
82 /// input source URLs.
83 #[must_use]
84 pub fn from_source_url(url: &Url) -> Self {
85 if url.scheme() == "file" {
86 Self::NoRoot(url.clone())
87 } else {
88 match Self::split_url_origin_and_path(url) {
89 Some((origin, path)) => Self::full(origin, path),
90 None => Self::none(),
91 }
92 }
93 }
94
95 /// Split URL into its origin and path, if possible. Will fail and return
96 /// `None` for URLs which *cannot be a base*.
97 fn split_url_origin_and_path(url: &Url) -> Option<(Url, String)> {
98 let origin = url.join("/").ok()?;
99 let subpath = origin.make_relative(url)?;
100 Some((origin, subpath))
101 }
102
103 /// Constructs a [`BaseInfo`] from the given URL, requiring that the given path be acceptable as a
104 /// base URL. That is, it cannot be a special scheme like `data:`.
105 ///
106 /// # Errors
107 ///
108 /// Errors if the given URL cannot be a base.
109 pub fn from_base_url(url: &Url) -> Result<BaseInfo, ErrorKind> {
110 if url.cannot_be_a_base() {
111 return Err(ErrorKind::InvalidBase(
112 url.to_string(),
113 "The given URL cannot be used as a base URL".to_string(),
114 ));
115 }
116
117 Ok(Self::from_source_url(url))
118 }
119
120 /// Constructs a [`BaseInfo`] from the given filesystem path, requiring that
121 /// the given path be absolute. Assumes that the given path represents a directory.
122 ///
123 /// This constructs a [`BaseInfo::Full`] where root-relative links will go to
124 /// the given path.
125 ///
126 /// # Errors
127 ///
128 /// Errors if the given path is not an absolute path.
129 pub fn from_path(path: &Path) -> Result<BaseInfo, ErrorKind> {
130 let Ok(url) = Url::from_directory_path(path) else {
131 return Err(ErrorKind::InvalidBase(
132 path.to_string_lossy().to_string(),
133 "Base must either be a full URL (with scheme) or an absolute local path"
134 .to_string(),
135 ));
136 };
137
138 Self::from_base_url(&url).map(|x| x.use_fs_path_as_origin().into_owned())
139 }
140
141 /// If this is a [`BaseInfo::NoRoot`], promote it to a [`BaseInfo::Full`]
142 /// by using the filesystem root as the "origin" for root-relative links.
143 /// Root-relative links will go to the filesystem root.
144 ///
145 /// Generally, this function should be avoided in favour of a more explicit
146 /// user-provided root directory. The filesystem root is rarely a good place
147 /// to look for files.
148 ///
149 /// Makes no change to other [`BaseInfo`] variants.
150 ///
151 /// # Panics
152 ///
153 /// If unable to split a [`BaseInfo::NoRoot`] into origin and path.
154 #[must_use]
155 pub fn use_fs_root_as_origin(&self) -> Cow<'_, Self> {
156 let Self::NoRoot(url) = self else {
157 return Cow::Borrowed(self);
158 };
159
160 let (fs_root, subpath) = Self::split_url_origin_and_path(url)
161 .expect("splitting up a NoRoot file:// URL should work");
162
163 Cow::Owned(Self::full(fs_root, subpath))
164 }
165
166 /// If this is a [`BaseInfo::NoRoot`], promote it to a [`BaseInfo::Full`]
167 /// by using the entire filesystem path as the "origin" for root-relative links.
168 /// Root-relative links will go to the URL that was previously within `NoRoot`.
169 ///
170 /// Generally, this function should be avoided in favour of a more explicit
171 /// user-provided root directory.
172 ///
173 /// Makes no change to other [`BaseInfo`] variants.
174 #[must_use]
175 pub fn use_fs_path_as_origin(&self) -> Cow<'_, Self> {
176 let Self::NoRoot(url) = self else {
177 return Cow::Borrowed(self);
178 };
179
180 Cow::Owned(Self::full(url.clone(), String::new()))
181 }
182
183 /// Returns the URL for the current [`BaseInfo`], joining the origin and path
184 /// if needed.
185 #[must_use]
186 pub fn url(&self) -> Option<Url> {
187 match self {
188 Self::None => None,
189 Self::NoRoot(url) => Some(url.clone()),
190 Self::Full(url, path) => url.join(path).ok(),
191 }
192 }
193
194 /// Returns the filesystem path for the current [`BaseInfo`] if the underlying
195 /// URL is a `file:` URL.
196 #[must_use]
197 pub fn to_file_path(&self) -> Option<PathBuf> {
198 self.url()
199 .filter(|url| url.scheme() == "file")
200 .and_then(|x| x.to_file_path().ok())
201 }
202
203 /// Returns the scheme of the underlying URL.
204 #[must_use]
205 pub fn scheme(&self) -> Option<&str> {
206 match self {
207 Self::None => None,
208 Self::NoRoot(url) | Self::Full(url, _) => Some(url.scheme()),
209 }
210 }
211
212 /// Returns whether this value is [`BaseInfo::None`].
213 #[must_use]
214 pub const fn is_none(&self) -> bool {
215 matches!(self, Self::None)
216 }
217
218 /// Returns whether this [`BaseInfo`] variant supports resolving root-relative links.
219 ///
220 /// If true, implies [`BaseInfo::supports_locally_relative`].
221 #[must_use]
222 pub const fn supports_root_relative(&self) -> bool {
223 matches!(self, Self::Full(_, _))
224 }
225
226 /// Returns whether this [`BaseInfo`] variant supports resolving locally-relative links.
227 #[must_use]
228 pub const fn supports_locally_relative(&self) -> bool {
229 !self.is_none()
230 }
231
232 /// Returns the [`BaseInfo`] which has _more information_
233 /// between `self` and the given `fallback`.
234 ///
235 /// [`BaseInfo::Full`] is preferred over [`BaseInfo::NoRoot`]
236 /// which is preferred over [`BaseInfo::None`]. If both `self`
237 /// and `fallback` are the same variant, then `self` will be preferred.
238 #[must_use]
239 #[allow(clippy::match_same_arms)]
240 pub const fn or_fallback<'a>(&'a self, fallback: &'a Self) -> &'a Self {
241 match (self, fallback) {
242 (x @ Self::Full(_, _), _) => x,
243 (_, x @ Self::Full(_, _)) => x,
244 (x @ Self::NoRoot(_), _) => x,
245 (_, x @ Self::NoRoot(_)) => x,
246 (x @ Self::None, Self::None) => x,
247 }
248 }
249
250 /// Parses the given URL text into a fully-qualified URL, including
251 /// resolving relative links if supported by the current [`BaseInfo`].
252 ///
253 /// To resolve relative links, this uses [`Url::join`] and [`ReqwestUrlExt::join_rooted`]
254 /// for [`BaseInfo::NoRoot`] and [`BaseInfo::Full`], respectively.
255 ///
256 /// # Errors
257 ///
258 /// Returns an error if the text is an invalid URL, or if the text is a
259 /// relative link and this [`BaseInfo`] variant cannot resolve
260 /// the relative link.
261 pub fn parse_url_text(&self, text: &str) -> Result<Url, ErrorKind> {
262 let mut url = match Uri::try_from(text) {
263 Ok(Uri { url }) => Ok(url),
264 Err(e @ ErrorKind::ParseUrl(_, _)) => match self {
265 _ if !self.supports_root_relative() && is_root_relative_link(text) => {
266 Err(ErrorKind::RootRelativeLinkWithoutRoot(text.to_string()))
267 }
268 Self::NoRoot(base) => base
269 .join(text)
270 .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())),
271 Self::Full(origin, subpath) => origin
272 .join_rooted(&[subpath, text])
273 .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())),
274 Self::None => Err(e),
275 },
276 Err(e) => Err(e),
277 }?;
278
279 // BACKWARDS COMPAT: delete trailing slash for file urls
280 if url.scheme() == "file" {
281 let _ = url
282 .path_segments_mut()
283 .as_mut()
284 .map(PathSegmentsMut::pop_if_empty);
285 }
286
287 Ok(url)
288 }
289
290 /// Parses the given URL text into a fully-qualified URL, including
291 /// resolving relative links if supported by the current [`BaseInfo`]
292 /// and applying the given root-dir if necessary.
293 ///
294 /// The root-dir is applied if the current `BaseInfo` is [`BaseInfo::None`]
295 /// or has a `file:` URL and if the given text is a root-relative link.
296 /// In these cases, the given `root_dir` will take effect instead of the
297 /// original `BaseInfo`.
298 ///
299 /// # Errors
300 ///
301 /// Propagates errors from [`BaseInfo::parse_url_text`].
302 pub fn parse_url_text_with_root_dir(
303 &self,
304 text: &str,
305 root_dir: Option<&Url>,
306 ) -> Result<Url, ErrorKind> {
307 // HACK: if root-dir is specified, apply it by fudging around with
308 // file:// URLs. eventually, someone up the stack should construct
309 // the BaseInfo::Full for root-dir and this function should be deleted.
310
311 // NOTE: also apply root-dir for BaseInfo::None :)
312 let fake_base_info = match (self.scheme(), root_dir) {
313 (Some("file") | None, Some(root_dir)) if is_root_relative_link(text) => {
314 Cow::Owned(Self::full(root_dir.clone(), String::new()))
315 }
316 _ => Cow::Borrowed(self),
317 };
318
319 fake_base_info.parse_url_text(text)
320 }
321}
322
323impl TryFrom<&str> for BaseInfo {
324 type Error = ErrorKind;
325
326 /// Attempts to parse a base from the given string which may be
327 /// a URL or a filesystem path. In both cases, the string must
328 /// represent a valid base (i.e., not resulting in [`BaseInfo::None`]).
329 /// Otherwise, an error will be returned.
330 ///
331 /// Note that this makes a distinction between filesystem paths as paths
332 /// and filesystem paths as URLs. When specified as a path, they will
333 /// become [`BaseInfo::Full`] but when specified as a URL, they will
334 /// become [`BaseInfo::NoRoot`].
335 ///
336 /// Additionally, the empty string is accepted and will be parsed to
337 /// [`BaseInfo::None`].
338 fn try_from(value: &str) -> Result<Self, ErrorKind> {
339 if value.is_empty() {
340 return Ok(BaseInfo::none());
341 }
342 match utils::url::parse_url_or_path(value) {
343 Ok(url) => BaseInfo::from_base_url(&url),
344 Err(path) => BaseInfo::from_path(&PathBuf::from(path)),
345 }
346 }
347}
348
349impl TryFrom<String> for BaseInfo {
350 type Error = ErrorKind;
351 fn try_from(value: String) -> Result<Self, ErrorKind> {
352 BaseInfo::try_from(value.as_ref())
353 }
354}
355
#[cfg(test)]
mod tests {

    use super::BaseInfo;
    use reqwest::Url;

    /// Parses a URL literal, panicking if it is invalid.
    fn parse(text: &str) -> Url {
        Url::parse(text).unwrap()
    }

    #[test]
    fn test_base_info_construction() {
        // An https URL is split into origin and subpath.
        let from_https = BaseInfo::try_from("https://a.com/b/?q#x").unwrap();
        assert_eq!(
            from_https,
            BaseInfo::Full(parse("https://a.com"), "b/?q#x".to_string())
        );

        // A file URL has no known root.
        let from_file_url = BaseInfo::try_from("file:///file-path").unwrap();
        assert_eq!(from_file_url, BaseInfo::NoRoot(parse("file:///file-path")));

        // A plain path is a directory which acts as its own root.
        let from_fs_path = BaseInfo::try_from("/file-path").unwrap();
        assert_eq!(
            from_fs_path,
            BaseInfo::Full(parse("file:///file-path/"), String::new())
        );

        // .url() of base-info should return the original URL
        for url_str in ["https://a.com/b/?q#x", "file:///a.com/b/?q#x"] {
            let base = BaseInfo::try_from(url_str).unwrap();
            assert_eq!(base.url(), Some(parse(url_str)));
        }
    }

    #[test]
    fn test_base_info_with_http_base() {
        let base = BaseInfo::try_from("https://a.com/c/u/").unwrap();
        let root_dir = parse("file:///root/");

        // not many tests here because it's covered by join_rooted tests
        let cases = [
            // shouldn't trigger the root URL
            ("/a", "https://a.com/a"),
            ("..", "https://a.com/c/"),
        ];

        for (text, expected) in cases {
            assert_eq!(
                base.parse_url_text_with_root_dir(text, Some(&root_dir)),
                Ok(parse(expected))
            );
        }
    }

    #[test]
    fn test_base_info_parse_with_root_dir() {
        let base = BaseInfo::try_from("/file-path").unwrap();
        let root_dir = parse("file:///root/");

        let cases = [
            // first, links which shouldn't trigger the root URL
            ("a", "file:///file-path/a"),
            ("./a", "file:///file-path/a"),
            ("///scheme-relative", "file:///scheme-relative"),
            ("https://a.com/b?q", "https://a.com/b?q"),
            // NOTE: trailing slash is dropped by parse_url_text
            ("file:///a/", "file:///a"),
            // basic root dir use
            ("/a", "file:///root/a"),
            // root-dir can be traversed out of
            ("/../../", "file:///"),
        ];

        for (text, expected) in cases {
            assert_eq!(
                base.parse_url_text_with_root_dir(text, Some(&root_dir)),
                Ok(parse(expected))
            );
        }
    }
}