lychee_lib/utils/
url.rs

1use std::borrow::Cow;
2use std::sync::LazyLock;
3
4use linkify::LinkFinder;
5use url::{ParseError, Url};
6
7/// Returns whether the text represents a root-relative link. These look like
8/// `/this` and are resolved relative to a base URL's origin. This can also be called
9/// "domain-relative URL" (by [MDN]) and "path-absolute-URL string" (by [WHATWG]).
10/// From [MDN]:
11///
12/// > Domain-relative URL: `/en-US/docs/Learn_web_development` — the protocol and
13/// > the domain name are both missing. The browser will use the same protocol
14/// > and the same domain name as the one used to load the document hosting that URL.
15///
16/// [MDN]: https://developer.mozilla.org/en-US/docs/Learn_web_development/Howto/Web_mechanics/What_is_a_URL#absolute_urls_vs._relative_urls
17/// [WHATWG]: https://url.spec.whatwg.org/#path-absolute-url-string
18pub(crate) fn is_root_relative_link(text: &str) -> bool {
19    !is_scheme_relative_link(text) && text.trim_ascii_start().starts_with('/')
20}
21
/// Checks whether `text` is a scheme-relative link, i.e. one that starts with
/// `//` (such as `//example.com/subpath`). Leading ASCII whitespace is ignored.
///
/// [MDN] describes it as:
///
/// > Scheme-relative URL: `//developer.mozilla.org/en-US/docs/Learn_web_development` —
/// > only the protocol is missing. The browser will use the same protocol as the one
/// > used to load the document hosting that URL.
///
/// [MDN]: https://developer.mozilla.org/en-US/docs/Learn_web_development/Howto/Web_mechanics/What_is_a_URL#absolute_urls_vs._relative_urls
pub(crate) fn is_scheme_relative_link(text: &str) -> bool {
    // Both slashes are ASCII, so inspecting the raw bytes is equivalent to a
    // string-prefix comparison.
    matches!(text.trim_ascii_start().as_bytes(), [b'/', b'/', ..])
}
33
34pub(crate) trait ReqwestUrlExt {
35    /// Joins the given subpaths, using the current URL as the base URL.
36    ///
37    /// Conceptually, `url.join_rooted(&[path])` is very similar to
38    /// `url.join(path)` (using [`Url::join`]). However, they differ when
39    /// the base URL is a `file:` URL.
40    ///
41    /// When used with a `file:` base URL, [`ReqwestUrlExt::join_rooted`]
42    /// will treat root-relative links as locally-relative links, relative
43    /// to the `file:` base URL.
44    ///
45    /// Other relative links and links with non-`file:` bases are joined
46    /// normally, matching the behaviour of [`Url::join`].
47    fn join_rooted(&self, subpaths: &[&str]) -> Result<Url, ParseError>;
48}
49
50impl ReqwestUrlExt for Url {
51    fn join_rooted(&self, subpaths: &[&str]) -> Result<Url, ParseError> {
52        let mut url = Cow::Borrowed(self);
53
54        for subpath in subpaths {
55            if url.scheme() == "file" && is_root_relative_link(subpath) {
56                let locally_relative = format!(".{}", subpath.trim_ascii_start());
57                url = Cow::Owned(self.join(&locally_relative)?);
58            } else {
59                url = Cow::Owned(url.join(subpath)?);
60            }
61        }
62
63        Ok(url.into_owned())
64    }
65}
66
67/// Attempts to parse a string which might represent a URL or a filesystem path.
68/// Returns [`Ok`] if it is unambiguously a valid URL, otherwise returns [`Err`]
69/// with the original input.
70///
71/// On Windows, we take care to make sure absolute paths---which could also be
72/// parsed as URLs---are not parsed as URLs.
73///
74/// # Errors
75///
76/// Returns an [`Err`] if the given text is not a valid URL, or if the given text
77/// *could* be interpreted as a filesystem path. The string is returned within
78/// the error to allow for easier subsequent processing.
79pub(crate) fn parse_url_or_path(input: &str) -> Result<Url, &str> {
80    match Url::parse(input) {
81        Ok(url) if url.scheme().len() == 1 => Err(input),
82        Ok(url) => Ok(url),
83        _ => Err(input),
84    }
85}
86
87static LINK_FINDER: LazyLock<LinkFinder> = LazyLock::new(LinkFinder::new);
88
89// Use `LinkFinder` to offload the raw link searching in plaintext
90pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link<'_>> {
91    LINK_FINDER.links(input)
92}
93
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Parses `base`, applies `subpaths` through `join_rooted`, and checks
    /// that the resulting URL renders as `expected`.
    ///
    /// The inputs are printed first so a failing case is easy to identify in
    /// the captured test output.
    fn assert_join_rooted(base: &str, subpaths: &[&str], expected: &str) {
        println!("base={base}, subpaths={subpaths:?}, expected={expected}");

        let joined = Url::parse(base).unwrap().join_rooted(subpaths).unwrap();
        assert_eq!(joined.to_string(), expected);
    }

    #[rstest]
    // normal HTTP traversal and parsing absolute links
    #[case::http1("https://a.com/b", &["x/", "d"], "https://a.com/x/d")]
    #[case::http2("https://a.com/b/", &["x/", "d"], "https://a.com/b/x/d")]
    #[case::http3("https://a.com/b/", &["https://new.com", "d"], "https://new.com/d")]
    // parsing absolute file://
    #[case::file_abs1("https://a.com/b/", &["file:///a", "d"], "file:///d")]
    #[case::file_abs2("https://a.com/b/", &["file:///a/", "d"], "file:///a/d")]
    #[case::file_abs3("https://a.com/b/", &["file:///a/b/", "../.."], "file:///")]
    // file traversal
    #[case::file_rel1("file:///a/b/", &["/x/y"], "file:///a/b/x/y")]
    #[case::file_rel2("file:///a/b/", &["a/"], "file:///a/b/a/")]
    #[case::file_rel3("file:///a/b/", &["a/", "../.."], "file:///a/")]
    #[case::file_rel4("file:///a/b/", &["a/", "/"], "file:///a/b/")]
    #[case::file_rel5("file:///a/b/", &["/.."], "file:///a/")]
    #[case::file_rel6("file:///a/b/", &["/../../"], "file:///")]
    #[case::file_rel7("file:///a/b/", &[""], "file:///a/b/")]
    #[case::file_rel8("file:///a/b/", &["."], "file:///a/b/")]
    // HTTP relative links
    #[case::http_rel1("https://a.com/x", &[""], "https://a.com/x")]
    #[case::http_rel2("https://a.com/x", &["../../.."], "https://a.com/")]
    #[case::http_rel3("https://a.com/x", &["?q", "#x"], "https://a.com/x?q#x")]
    #[case::http_rel4("https://a.com/x", &[".", "?a"], "https://a.com/?a")]
    #[case::http_rel5("https://a.com/x", &["/"], "https://a.com/")]
    #[case::http_rel6("https://a.com/x?q#anchor", &[""], "https://a.com/x?q")]
    #[case::http_rel7("https://a.com/x#anchor", &["?x"], "https://a.com/x?x")]
    // scheme relative link - can traverse outside of root
    #[case::scheme_rel1("file:///root/", &["///new-root"], "file:///new-root")]
    #[case::scheme_rel2("file:///root/", &["//a.com/boop"], "file://a.com/boop")]
    #[case::scheme_rel3("https://root/", &["//a.com/boop"], "https://a.com/boop")]
    fn test_join_rooted(#[case] base: &str, #[case] subpaths: &[&str], #[case] expected: &str) {
        assert_join_rooted(base, subpaths, expected);
    }

    #[rstest]
    // file URLs without trailing / are kinda weird.
    #[case::file_rel1("file:///a/b/c", &["/../../x"], "file:///x")]
    #[case::file_rel2("file:///a/b/c", &["/"], "file:///a/b/")]
    #[case::file_rel3("file:///a/b/c", &[".?qq"], "file:///a/b/?qq")]
    #[case::file_rel4("file:///a/b/c", &["#x"], "file:///a/b/c#x")]
    #[case::file_rel5("file:///a/b/c", &["./"], "file:///a/b/")]
    #[case::file_rel6("file:///a/b/c", &["c"], "file:///a/b/c")]
    // joining with d
    #[case::file_rel_d1("file:///a/b/c", &["d", "/../../x"], "file:///x")]
    #[case::file_rel_d2("file:///a/b/c", &["d", "/"], "file:///a/b/")]
    #[case::file_rel_d3("file:///a/b/c", &["d", "."], "file:///a/b/")]
    #[case::file_rel_d4("file:///a/b/c", &["d", "./"], "file:///a/b/")]
    // joining with d/
    #[case::file_rel_d_slash1("file:///a/b/c", &["d/", "/"], "file:///a/b/")]
    #[case::file_rel_d_slash2("file:///a/b/c", &["d/", "."], "file:///a/b/d/")]
    #[case::file_rel_d_slash3("file:///a/b/c", &["d/", "./"], "file:///a/b/d/")]
    fn test_join_rooted_with_trailing_filename(
        #[case] base: &str,
        #[case] subpaths: &[&str],
        #[case] expected: &str,
    ) {
        assert_join_rooted(base, subpaths, expected);
    }

    #[rstest]
    // definitely URLs
    #[case::ok1("tel:1", Ok("tel:1"))]
    #[case::ok2("file:///a", Ok("file:///a"))]
    #[case::ok3("http://a.com", Ok("http://a.com/"))]
    // path-looking things
    #[case::err1("", Err(""))]
    #[case::err2(".", Err("."))]
    #[case::err3("C:", Err("C:"))]
    #[case::err4("/unix", Err("/unix"))]
    #[case::err5("C:/a", Err("C:/a"))]
    #[case::err6(r"C:\a\b", Err(r"C:\a\b"))]
    #[case::err7("**/*.md", Err("**/*.md"))]
    #[case::err8("something", Err("something"))]
    fn test_parse_url_or_path(#[case] input: &str, #[case] expected: Result<&str, &str>) {
        let parsed = parse_url_or_path(input);
        // Compare on the string form so the expectations can be plain literals.
        let actual = parsed.as_ref().map(Url::as_str);
        assert_eq!(actual, expected.as_deref());
    }
}