lychee_lib/utils/
request.rs

1use reqwest::Url;
2use std::collections::HashSet;
3use std::path::Path;
4
5use crate::{
6    BaseInfo, BasicAuthCredentials, LycheeResult, Request, RequestError, Uri,
7    basic_auth::BasicAuthExtractor,
8    types::{ResolvedInputSource, uri::raw::RawUri},
9};
10
11/// Extract basic auth credentials for a given URL.
12pub(crate) fn extract_credentials(
13    extractor: Option<&BasicAuthExtractor>,
14    uri: &Uri,
15) -> Option<BasicAuthCredentials> {
16    extractor.as_ref().and_then(|ext| ext.matches(uri))
17}
18
19/// Create a request from a raw URI.
20fn create_request(
21    raw_uri: &RawUri,
22    source: &ResolvedInputSource,
23    root_dir: Option<&Path>,
24    base: &BaseInfo,
25    extractor: Option<&BasicAuthExtractor>,
26) -> LycheeResult<Request> {
27    let uri = try_parse_into_uri(raw_uri, root_dir, base)?;
28    let source = source.clone();
29    let element = raw_uri.element.clone();
30    let attribute = raw_uri.attribute.clone();
31    let credentials = extract_credentials(extractor, &uri);
32
33    Ok(Request::new(uri, source, element, attribute, credentials))
34}
35
36/// Try to parse the raw URI into a `Uri`.
37///
38/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
39/// If the base URL is not available, create a URI from the file path.
40///
41/// # Errors
42///
43/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
44///   to create a valid URI.
45/// - If a URI cannot be created from the file path.
46/// - If the source is not a file path (i.e. the URI type is not supported).
47fn try_parse_into_uri(
48    raw_uri: &RawUri,
49    root_dir: Option<&Path>,
50    base: &BaseInfo,
51) -> LycheeResult<Uri> {
52    // TODO: this conversion should be hoisted up the call stack
53    let root_dir = root_dir.and_then(|x| Url::from_directory_path(x).ok());
54    Ok(base
55        .parse_url_text_with_root_dir(&raw_uri.text, root_dir.as_ref())?
56        .into())
57}
58
59/// Create requests out of the collected URLs.
60/// Returns a vector of valid URLs and errors. Valid URLs are deduplicated,
61/// request errors are not deduplicated.
62///
63/// If a URLs is ignored (because of the current settings),
64/// it will not be added to the results.
65pub(crate) fn create(
66    uris: Vec<RawUri>,
67    source: &ResolvedInputSource,
68    root_dir: Option<&Path>,
69    fallback_base: &BaseInfo,
70    extractor: Option<&BasicAuthExtractor>,
71) -> Vec<Result<Request, RequestError>> {
72    let source_base = match source.to_url() {
73        Ok(None) => BaseInfo::none(),
74        Ok(Some(url)) => BaseInfo::from_source_url(&url),
75        Err(e) => {
76            // TODO: GetInputContent is not quite the right error.
77            return vec![Err(RequestError::GetInputContent(source.clone().into(), e))];
78        }
79    };
80
81    // TODO: use_fs_root_as_origin is for backwards compat, so `--base-url file:///a`
82    // can resolve a link of `/b` to `file:///b` (in the absence of root-dir).
83    // maybe change if base-url semantics are changed in future.
84    let fallback_base = fallback_base.use_fs_root_as_origin();
85    let base = source_base.or_fallback(&fallback_base);
86
87    let mut requests = HashSet::<Request>::new();
88    let mut errors = Vec::<RequestError>::new();
89
90    for raw_uri in uris {
91        let result = create_request(&raw_uri, source, root_dir, base, extractor);
92        match result {
93            Ok(request) => {
94                requests.insert(request);
95            }
96            Err(e) => errors.push(RequestError::CreateRequestItem(
97                raw_uri.clone(),
98                source.clone(),
99                e,
100            )),
101        }
102    }
103
104    (requests.into_iter().map(Result::Ok))
105        .chain(errors.into_iter().map(Result::Err))
106        .collect()
107}
108
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::num::NonZeroUsize;
    use std::path::PathBuf;

    use crate::Request;
    use crate::types::uri::raw::{RawUri, RawUriSpan};

    use super::*;

    /// Base URL shared by the tests that resolve links against a remote base.
    const REMOTE_BASE: &str = "https://example.com/path/page.html";

    /// Root directory shared by the tests that pass a `root_dir`.
    const ROOT_DIR: &str = "/tmp/lychee";

    /// Create requests from the given raw URIs and returns requests that were
    /// constructed successfully, silently ignoring link parsing errors.
    ///
    /// This reduces the `Result` handling which is needed in test cases. Test
    /// cases can still detect the unexpected appearance of errors by the
    /// length being different.
    fn create_ok_only(
        uris: Vec<RawUri>,
        source: &ResolvedInputSource,
        root_dir: Option<&Path>,
        base: &BaseInfo,
        extractor: Option<&BasicAuthExtractor>,
    ) -> Vec<Request> {
        create(uris, source, root_dir, base, extractor)
            .into_iter()
            .filter_map(Result::ok)
            .collect()
    }

    /// Build a `RawUri` with the given text and no element/attribute info.
    fn raw_uri(text: &'static str) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: None,
            attribute: None,
            span: RawUriSpan {
                line: NonZeroUsize::MAX,
                column: None,
            },
        }
    }

    /// The remote base used by the base-URL test cases.
    fn remote_base() -> BaseInfo {
        BaseInfo::try_from(REMOTE_BASE).unwrap()
    }

    /// A local base rooted at `/tmp/lychee`.
    fn local_base() -> BaseInfo {
        BaseInfo::from_path(&PathBuf::from("/tmp/lychee")).unwrap()
    }

    /// An empty in-memory string source.
    fn string_source() -> ResolvedInputSource {
        ResolvedInputSource::String(Cow::Borrowed(""))
    }

    /// A file source located at `/some/page.html`.
    fn page_source() -> ResolvedInputSource {
        ResolvedInputSource::FsPath(PathBuf::from("/some/page.html"))
    }

    /// Run `create` for a single link and assert that exactly one request is
    /// produced and that it points at `expected`.
    ///
    /// `#[track_caller]` makes assertion failures point at the calling test.
    #[track_caller]
    fn assert_resolves_to(
        link: &'static str,
        source: &ResolvedInputSource,
        root_dir: Option<&Path>,
        base: &BaseInfo,
        expected: &str,
    ) {
        let requests = create_ok_only(vec![raw_uri(link)], source, root_dir, base, None);

        assert_eq!(requests.len(), 1);
        assert_eq!(requests[0].uri.url.as_str(), expected);
    }

    #[test]
    fn test_relative_url_resolution() {
        assert_resolves_to(
            "relative.html",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution() {
        assert_resolves_to(
            "https://another.com/page",
            &string_source(),
            None,
            &remote_base(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution() {
        assert_resolves_to(
            "/root-relative",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution() {
        assert_resolves_to(
            "../parent",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution() {
        assert_resolves_to(
            "#fragment",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/path/page.html#fragment",
        );
    }

    #[test]
    fn test_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "relative.html",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///some/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir() {
        assert_resolves_to(
            "https://another.com/page",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "/root-relative",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///tmp/lychee/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir() {
        assert_resolves_to(
            "../parent",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir() {
        assert_resolves_to(
            "#fragment",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///some/page.html#fragment",
        );
    }

    #[test]
    fn test_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "relative.html",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "https://another.com/page",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "/root-relative",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "../parent",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "#fragment",
            &page_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/path/page.html#fragment",
        );
    }

    #[test]
    fn test_no_base_url_resolution() {
        assert_resolves_to(
            "https://example.com/page",
            &string_source(),
            None,
            &BaseInfo::none(),
            "https://example.com/page",
        );
    }

    #[test]
    fn test_create_request_from_relative_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("page.html"));

        let actual =
            create_request(&raw_uri("file.html"), &input_source, None, &base, None).unwrap();

        let expected = Request::new(
            Uri {
                url: Url::from_file_path("/tmp/lychee/file.html").unwrap(),
            },
            input_source,
            None,
            None,
            None,
        );
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_create_request_from_relative_file_path_errors() {
        // relative links unsupported from stdin
        let from_stdin = create_request(
            &raw_uri("file.html"),
            &ResolvedInputSource::Stdin,
            None,
            &BaseInfo::none(),
            None,
        );
        assert!(from_stdin.is_err());

        // error because no root-dir and no base-url
        let without_root = create_request(
            &raw_uri("/file.html"),
            &ResolvedInputSource::FsPath(PathBuf::from("page.html")),
            None,
            &BaseInfo::none(),
            None,
        );
        assert!(without_root.is_err());
    }

    #[test]
    fn test_create_request_from_absolute_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html"));

        // Use an absolute path that's outside the base directory
        let actual = create_request(
            &raw_uri("/usr/local/share/doc/example.html"),
            &input_source,
            None,
            &base,
            None,
        )
        .unwrap();

        let expected = Request::new(
            Uri {
                url: Url::from_file_path("/tmp/lychee/usr/local/share/doc/example.html").unwrap(),
            },
            input_source,
            None,
            None,
            None,
        );
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_parse_relative_path_into_uri() {
        let base = local_base();

        let link = raw_uri("relative.html");
        let uri = try_parse_into_uri(&link, None, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
    }

    #[test]
    fn test_parse_absolute_path_into_uri() {
        let base = local_base();

        // A leading slash makes this an absolute (root-relative) path, which
        // is what this test is named for. With a local base and no root-dir
        // it resolves beneath the base directory, the same behavior pinned by
        // `test_create_request_from_absolute_file_path`.
        let link = raw_uri("/absolute.html");
        let uri = try_parse_into_uri(&link, None, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
    }
}