lychee_lib/types/
base_info.rs

//! Parses and resolves [`RawUri`] into a fully-qualified [`Uri`] by
//! applying base URL and root dir mappings.
3
4use reqwest::Url;
5use serde::Deserialize;
6use std::borrow::Cow;
7use std::path::{Path, PathBuf};
8
9use crate::ErrorKind;
10use crate::Uri;
11use crate::utils;
12use crate::utils::url::{ReqwestUrlExt, is_root_relative_link};
13use url::PathSegmentsMut;
14
15/// Information used for resolving relative URLs within a particular
16/// input source. There should be a 1:1 correspondence between each
17/// `BaseInfo` and its originating `InputSource`. The main entry
18/// point for constructing is [`BaseInfo::from_source_url`].
19///
20/// Once constructed, [`BaseInfo::parse_url_text`] can be used to
21/// parse and resolve a (possibly relative) URL obtained from within
22/// the associated `InputSource`.
23///
24/// A `BaseInfo` may be built from input sources which cannot resolve
25/// relative links---for instance, stdin. It may also be built from input
26/// sources which can resolve *locally*-relative links, but not *root*-relative
27/// links.
28#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Default)]
29#[serde(try_from = "String")]
30pub enum BaseInfo {
31    /// No base information is available. This is for sources with no base
32    /// information, such as [`ResolvedInputSource::Stdin`], and for URLs which
33    /// *cannot be a base*, such as `data:` and `tel:`. [`BaseInfo::None`]
34    /// can resolve no relative links; only fully-qualified links will be
35    /// parsed successfully.
36    #[default]
37    None,
38
39    /// A base which cannot resolve root-relative links. This is for
40    /// `file:` URLs where the root directory is not known. As such, you can
41    /// traverse relative to the current URL (by traversing the filesystem),
42    /// but you cannot jump to the "root".
43    NoRoot(Url),
44
45    /// A full base made up of `origin` and `path`. This can resolve
46    /// all kinds of relative links.
47    ///
48    /// All non-`file:` URLs which *can be a base* fall into this case. For these,
49    /// `origin` and `path` are obtained by dividing the source URL into its
50    /// origin and path. When joined, `${origin}/${path}` should be equivalent
51    /// to the source's original URL.
52    ///
53    /// This also represents `file:` URLs with a known root. The `origin` field
54    /// records the `file:` URL which will be used to resolve root-relative links.
55    /// The `path` field is the subpath to a particular input source within the
56    /// root. This is retained to resolve locally-relative links.
57    Full(Url, String),
58}
59
60impl BaseInfo {
61    /// Constructs [`BaseInfo::None`].
62    #[must_use]
63    pub const fn none() -> Self {
64        Self::None
65    }
66
67    /// Constructs [`BaseInfo::Full`] with the given fields.
68    #[must_use]
69    pub const fn full(origin: Url, path: String) -> Self {
70        Self::Full(origin, path)
71    }
72
73    /// Constructs a [`BaseInfo`], with the variant being determined by the given URL.
74    ///
75    /// - A [`Url::cannot_be_a_base`] URL will yield [`BaseInfo::None`].
76    /// - A `file:` URL will yield [`BaseInfo::NoRoot`].
77    /// - For other URLs, a [`BaseInfo::Full`] will be constructed from the URL's
78    ///   origin and path.
79    ///
80    /// Compared to [`BaseInfo::from_base_url`], this function is more lenient in
81    /// what it accepts because this function should return *a* result for all
82    /// input source URLs.
83    #[must_use]
84    pub fn from_source_url(url: &Url) -> Self {
85        if url.scheme() == "file" {
86            Self::NoRoot(url.clone())
87        } else {
88            match Self::split_url_origin_and_path(url) {
89                Some((origin, path)) => Self::full(origin, path),
90                None => Self::none(),
91            }
92        }
93    }
94
95    /// Split URL into its origin and path, if possible. Will fail and return
96    /// `None` for URLs which *cannot be a base*.
97    fn split_url_origin_and_path(url: &Url) -> Option<(Url, String)> {
98        let origin = url.join("/").ok()?;
99        let subpath = origin.make_relative(url)?;
100        Some((origin, subpath))
101    }
102
103    /// Constructs a [`BaseInfo`] from the given URL, requiring that the given path be acceptable as a
104    /// base URL. That is, it cannot be a special scheme like `data:`.
105    ///
106    /// # Errors
107    ///
108    /// Errors if the given URL cannot be a base.
109    pub fn from_base_url(url: &Url) -> Result<BaseInfo, ErrorKind> {
110        if url.cannot_be_a_base() {
111            return Err(ErrorKind::InvalidBase(
112                url.to_string(),
113                "The given URL cannot be used as a base URL".to_string(),
114            ));
115        }
116
117        Ok(Self::from_source_url(url))
118    }
119
120    /// Constructs a [`BaseInfo`] from the given filesystem path, requiring that
121    /// the given path be absolute. Assumes that the given path represents a directory.
122    ///
123    /// This constructs a [`BaseInfo::Full`] where root-relative links will go to
124    /// the given path.
125    ///
126    /// # Errors
127    ///
128    /// Errors if the given path is not an absolute path.
129    pub fn from_path(path: &Path) -> Result<BaseInfo, ErrorKind> {
130        let Ok(url) = Url::from_directory_path(path) else {
131            return Err(ErrorKind::InvalidBase(
132                path.to_string_lossy().to_string(),
133                "Base must either be a full URL (with scheme) or an absolute local path"
134                    .to_string(),
135            ));
136        };
137
138        Self::from_base_url(&url).map(|x| x.use_fs_path_as_origin().into_owned())
139    }
140
141    /// If this is a [`BaseInfo::NoRoot`], promote it to a [`BaseInfo::Full`]
142    /// by using the filesystem root as the "origin" for root-relative links.
143    /// Root-relative links will go to the filesystem root.
144    ///
145    /// Generally, this function should be avoided in favour of a more explicit
146    /// user-provided root directory. The filesystem root is rarely a good place
147    /// to look for files.
148    ///
149    /// Makes no change to other [`BaseInfo`] variants.
150    ///
151    /// # Panics
152    ///
153    /// If unable to split a [`BaseInfo::NoRoot`] into origin and path.
154    #[must_use]
155    pub fn use_fs_root_as_origin(&self) -> Cow<'_, Self> {
156        let Self::NoRoot(url) = self else {
157            return Cow::Borrowed(self);
158        };
159
160        let (fs_root, subpath) = Self::split_url_origin_and_path(url)
161            .expect("splitting up a NoRoot file:// URL should work");
162
163        Cow::Owned(Self::full(fs_root, subpath))
164    }
165
166    /// If this is a [`BaseInfo::NoRoot`], promote it to a [`BaseInfo::Full`]
167    /// by using the entire filesystem path as the "origin" for root-relative links.
168    /// Root-relative links will go to the URL that was previously within `NoRoot`.
169    ///
170    /// Generally, this function should be avoided in favour of a more explicit
171    /// user-provided root directory.
172    ///
173    /// Makes no change to other [`BaseInfo`] variants.
174    #[must_use]
175    pub fn use_fs_path_as_origin(&self) -> Cow<'_, Self> {
176        let Self::NoRoot(url) = self else {
177            return Cow::Borrowed(self);
178        };
179
180        Cow::Owned(Self::full(url.clone(), String::new()))
181    }
182
183    /// Returns the URL for the current [`BaseInfo`], joining the origin and path
184    /// if needed.
185    #[must_use]
186    pub fn url(&self) -> Option<Url> {
187        match self {
188            Self::None => None,
189            Self::NoRoot(url) => Some(url.clone()),
190            Self::Full(url, path) => url.join(path).ok(),
191        }
192    }
193
194    /// Returns the filesystem path for the current [`BaseInfo`] if the underlying
195    /// URL is a `file:` URL.
196    #[must_use]
197    pub fn to_file_path(&self) -> Option<PathBuf> {
198        self.url()
199            .filter(|url| url.scheme() == "file")
200            .and_then(|x| x.to_file_path().ok())
201    }
202
203    /// Returns the scheme of the underlying URL.
204    #[must_use]
205    pub fn scheme(&self) -> Option<&str> {
206        match self {
207            Self::None => None,
208            Self::NoRoot(url) | Self::Full(url, _) => Some(url.scheme()),
209        }
210    }
211
212    /// Returns whether this value is [`BaseInfo::None`].
213    #[must_use]
214    pub const fn is_none(&self) -> bool {
215        matches!(self, Self::None)
216    }
217
218    /// Returns whether this [`BaseInfo`] variant supports resolving root-relative links.
219    ///
220    /// If true, implies [`BaseInfo::supports_locally_relative`].
221    #[must_use]
222    pub const fn supports_root_relative(&self) -> bool {
223        matches!(self, Self::Full(_, _))
224    }
225
226    /// Returns whether this [`BaseInfo`] variant supports resolving locally-relative links.
227    #[must_use]
228    pub const fn supports_locally_relative(&self) -> bool {
229        !self.is_none()
230    }
231
232    /// Returns the [`BaseInfo`] which has _more information_
233    /// between `self` and the given `fallback`.
234    ///
235    /// [`BaseInfo::Full`] is preferred over [`BaseInfo::NoRoot`]
236    /// which is preferred over [`BaseInfo::None`]. If both `self`
237    /// and `fallback` are the same variant, then `self` will be preferred.
238    #[must_use]
239    #[allow(clippy::match_same_arms)]
240    pub const fn or_fallback<'a>(&'a self, fallback: &'a Self) -> &'a Self {
241        match (self, fallback) {
242            (x @ Self::Full(_, _), _) => x,
243            (_, x @ Self::Full(_, _)) => x,
244            (x @ Self::NoRoot(_), _) => x,
245            (_, x @ Self::NoRoot(_)) => x,
246            (x @ Self::None, Self::None) => x,
247        }
248    }
249
250    /// Parses the given URL text into a fully-qualified URL, including
251    /// resolving relative links if supported by the current [`BaseInfo`].
252    ///
253    /// To resolve relative links, this uses [`Url::join`] and [`ReqwestUrlExt::join_rooted`]
254    /// for [`BaseInfo::NoRoot`] and [`BaseInfo::Full`], respectively.
255    ///
256    /// # Errors
257    ///
258    /// Returns an error if the text is an invalid URL, or if the text is a
259    /// relative link and this [`BaseInfo`] variant cannot resolve
260    /// the relative link.
261    pub fn parse_url_text(&self, text: &str) -> Result<Url, ErrorKind> {
262        let mut url = match Uri::try_from(text) {
263            Ok(Uri { url }) => Ok(url),
264            Err(e @ ErrorKind::ParseUrl(_, _)) => match self {
265                _ if !self.supports_root_relative() && is_root_relative_link(text) => {
266                    Err(ErrorKind::RootRelativeLinkWithoutRoot(text.to_string()))
267                }
268                Self::NoRoot(base) => base
269                    .join(text)
270                    .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())),
271                Self::Full(origin, subpath) => origin
272                    .join_rooted(&[subpath, text])
273                    .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())),
274                Self::None => Err(e),
275            },
276            Err(e) => Err(e),
277        }?;
278
279        // BACKWARDS COMPAT: delete trailing slash for file urls
280        if url.scheme() == "file" {
281            let _ = url
282                .path_segments_mut()
283                .as_mut()
284                .map(PathSegmentsMut::pop_if_empty);
285        }
286
287        Ok(url)
288    }
289
290    /// Parses the given URL text into a fully-qualified URL, including
291    /// resolving relative links if supported by the current [`BaseInfo`]
292    /// and applying the given root-dir if necessary.
293    ///
294    /// The root-dir is applied if the current `BaseInfo` is [`BaseInfo::None`]
295    /// or has a `file:` URL and if the given text is a root-relative link.
296    /// In these cases, the given `root_dir` will take effect instead of the
297    /// original `BaseInfo`.
298    ///
299    /// # Errors
300    ///
301    /// Propagates errors from [`BaseInfo::parse_url_text`].
302    pub fn parse_url_text_with_root_dir(
303        &self,
304        text: &str,
305        root_dir: Option<&Url>,
306    ) -> Result<Url, ErrorKind> {
307        // HACK: if root-dir is specified, apply it by fudging around with
308        // file:// URLs. eventually, someone up the stack should construct
309        // the BaseInfo::Full for root-dir and this function should be deleted.
310
311        // NOTE: also apply root-dir for BaseInfo::None :)
312        let fake_base_info = match (self.scheme(), root_dir) {
313            (Some("file") | None, Some(root_dir)) if is_root_relative_link(text) => {
314                Cow::Owned(Self::full(root_dir.clone(), String::new()))
315            }
316            _ => Cow::Borrowed(self),
317        };
318
319        fake_base_info.parse_url_text(text)
320    }
321}
322
323impl TryFrom<&str> for BaseInfo {
324    type Error = ErrorKind;
325
326    /// Attempts to parse a base from the given string which may be
327    /// a URL or a filesystem path. In both cases, the string must
328    /// represent a valid base (i.e., not resulting in [`BaseInfo::None`]).
329    /// Otherwise, an error will be returned.
330    ///
331    /// Note that this makes a distinction between filesystem paths as paths
332    /// and filesystem paths as URLs. When specified as a path, they will
333    /// become [`BaseInfo::Full`] but when specified as a URL, they will
334    /// become [`BaseInfo::NoRoot`].
335    ///
336    /// Additionally, the empty string is accepted and will be parsed to
337    /// [`BaseInfo::None`].
338    fn try_from(value: &str) -> Result<Self, ErrorKind> {
339        if value.is_empty() {
340            return Ok(BaseInfo::none());
341        }
342        match utils::url::parse_url_or_path(value) {
343            Ok(url) => BaseInfo::from_base_url(&url),
344            Err(path) => BaseInfo::from_path(&PathBuf::from(path)),
345        }
346    }
347}
348
349impl TryFrom<String> for BaseInfo {
350    type Error = ErrorKind;
351    fn try_from(value: String) -> Result<Self, ErrorKind> {
352        BaseInfo::try_from(value.as_ref())
353    }
354}
355
#[cfg(test)]
mod tests {

    use super::BaseInfo;
    use reqwest::Url;

    /// Parses a URL literal which is known to be valid.
    fn url(text: &str) -> Url {
        Url::parse(text).unwrap()
    }

    #[test]
    fn test_base_info_construction() {
        // a http(s) URL is split into origin and path
        let from_http_url = BaseInfo::try_from("https://a.com/b/?q#x").unwrap();
        assert_eq!(
            from_http_url,
            BaseInfo::Full(url("https://a.com"), "b/?q#x".to_string())
        );

        // a file: URL has no known root
        let from_file_url = BaseInfo::try_from("file:///file-path").unwrap();
        assert_eq!(from_file_url, BaseInfo::NoRoot(url("file:///file-path")));

        // a filesystem path becomes its own root
        let from_fs_path = BaseInfo::try_from("/file-path").unwrap();
        assert_eq!(
            from_fs_path,
            BaseInfo::Full(url("file:///file-path/"), String::new())
        );

        // .url() of base-info should return the original URL
        for url_str in ["https://a.com/b/?q#x", "file:///a.com/b/?q#x"] {
            let base = BaseInfo::try_from(url_str).unwrap();
            assert_eq!(base.url(), Some(url(url_str)));
        }
    }

    #[test]
    fn test_base_info_with_http_base() {
        let base = BaseInfo::try_from("https://a.com/c/u/").unwrap();
        let root_dir = url("file:///root/");

        // (link text, expected result); not many cases here because it's
        // covered by join_rooted tests
        let cases = [
            // shouldn't trigger the root URL
            ("/a", "https://a.com/a"),
            ("..", "https://a.com/c/"),
        ];

        for (text, expected) in cases {
            assert_eq!(
                base.parse_url_text_with_root_dir(text, Some(&root_dir)),
                Ok(url(expected)),
                "link text: {text:?}"
            );
        }
    }

    #[test]
    fn test_base_info_parse_with_root_dir() {
        let base = BaseInfo::try_from("/file-path").unwrap();
        let root_dir = url("file:///root/");

        // (link text, expected result)
        let cases = [
            // first, links which shouldn't trigger the root URL
            ("a", "file:///file-path/a"),
            ("./a", "file:///file-path/a"),
            ("///scheme-relative", "file:///scheme-relative"),
            ("https://a.com/b?q", "https://a.com/b?q"),
            // NOTE: trailing slash is dropped by parse_url_text
            ("file:///a/", "file:///a"),
            // basic root dir use
            ("/a", "file:///root/a"),
            // root-dir can be traversed out of
            ("/../../", "file:///"),
        ];

        for (text, expected) in cases {
            assert_eq!(
                base.parse_url_text_with_root_dir(text, Some(&root_dir)),
                Ok(url(expected)),
                "link text: {text:?}"
            );
        }
    }
}