// lychee_lib/collector.rs

1use crate::ErrorKind;
2use crate::Preprocessor;
3use crate::filter::PathExcludes;
4
5use crate::types::resolver::UrlContentResolver;
6use crate::{
7    BaseInfo, Input, LycheeResult, Request, RequestError, basic_auth::BasicAuthExtractor,
8    extract::Extractor, types::FileExtensions, types::uri::raw::RawUri, utils::request,
9};
10use futures::TryStreamExt;
11use futures::{
12    StreamExt,
13    stream::{self, Stream},
14};
15use http::HeaderMap;
16use par_stream::ParStreamExt;
17use reqwest::Client;
18use std::collections::HashSet;
19use std::path::{Path, PathBuf};
20
/// Collector keeps the state of link collection
/// It drives the link extraction from inputs
///
/// A collector is created with [`Collector::new`] (or [`Default`]), tuned
/// through its builder-style methods and finally consumed by
/// [`Collector::collect_links`] or
/// [`Collector::collect_links_from_file_types`].
// Each flag below is an independent on/off option, hence the allowed lint.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct Collector {
    /// Matches found URIs to basic auth credentials, if configured
    basic_auth_extractor: Option<BasicAuthExtractor>,
    /// Skip missing input files instead of reporting an error
    skip_missing_inputs: bool,
    /// Skip files that are ignored
    skip_ignored: bool,
    /// Skip files that are hidden
    skip_hidden: bool,
    /// Whether links in verbatim sections (like Markdown code blocks) are collected
    include_verbatim: bool,
    /// Whether wikilinks in Markdown files are collected
    include_wikilinks: bool,
    /// Parse HTML with `html5ever` instead of `html5gum`
    use_html5ever: bool,
    /// Optional root directory as resolved by [`Collector::new`]
    root_dir: Option<PathBuf>,
    /// Base that relative URLs get prefixed with
    base: BaseInfo,
    /// Paths to exclude while reading inputs
    excluded_paths: PathExcludes,
    /// Headers to use when resolving input URLs
    headers: HeaderMap,
    /// HTTP client used for input URLs
    client: Client,
    /// Optional file preprocessor handed to the inputs
    preprocessor: Option<Preprocessor>,
}
40
41impl Default for Collector {
42    /// # Panics
43    ///
44    /// We call [`Collector::new()`] which can panic in certain scenarios.
45    ///
46    /// Use `Collector::new()` instead if you need to handle
47    /// [`ClientBuilder`](crate::ClientBuilder) errors gracefully.
48    fn default() -> Self {
49        Collector {
50            basic_auth_extractor: None,
51            skip_missing_inputs: false,
52            include_verbatim: false,
53            include_wikilinks: false,
54            use_html5ever: false,
55            skip_hidden: true,
56            skip_ignored: true,
57            root_dir: None,
58            base: BaseInfo::none(),
59            headers: HeaderMap::new(),
60            client: Client::new(),
61            excluded_paths: PathExcludes::empty(),
62            preprocessor: None,
63        }
64    }
65}
66
67impl Collector {
68    /// Create a new collector with an empty cache
69    ///
70    /// # Errors
71    ///
72    /// Returns an `Err` if the `root_dir` is not a valid path
73    /// or if the reqwest `Client` fails to build
74    pub fn new(root_dir: Option<PathBuf>, base: BaseInfo) -> LycheeResult<Self> {
75        // HACK: if root-dir and base-url are given together and the base is a full file path,
76        // then join the root dir onto the base to match old behaviour.........
77        let (root_dir, base) = match (root_dir, base) {
78            (Some(root_dir), BaseInfo::Full(url, path))
79                if url.scheme() == "file" && path.is_empty() =>
80            {
81                let root_dir = root_dir
82                    .strip_prefix("/")
83                    .map(Path::to_path_buf)
84                    .unwrap_or(root_dir)
85                    .join("");
86
87                match url.to_file_path() {
88                    Ok(base_path) => (Some(base_path.join(root_dir)), BaseInfo::full(url, path)),
89                    Err(()) => (Some(root_dir), BaseInfo::full(url, path)),
90                }
91            }
92            (Some(root_dir), base) => {
93                let root_dir_exists = root_dir.read_dir().map(|_| ());
94                let root_dir = root_dir_exists
95                    .and_then(|()| std::path::absolute(&root_dir))
96                    .map_err(|e| ErrorKind::InvalidRootDir(root_dir, e))?;
97                (Some(root_dir), base)
98            }
99            (None, base) => (None, base),
100        };
101        Ok(Collector {
102            basic_auth_extractor: None,
103            skip_missing_inputs: false,
104            include_verbatim: false,
105            include_wikilinks: false,
106            use_html5ever: false,
107            skip_hidden: true,
108            skip_ignored: true,
109            preprocessor: None,
110            headers: HeaderMap::new(),
111            client: Client::builder()
112                .build()
113                .map_err(ErrorKind::BuildRequestClient)?,
114            excluded_paths: PathExcludes::empty(),
115            root_dir,
116            base,
117        })
118    }
119
120    /// Skip missing input files (default is to error if they don't exist)
121    #[must_use]
122    pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
123        self.skip_missing_inputs = yes;
124        self
125    }
126
127    /// Skip files that are hidden
128    #[must_use]
129    pub const fn skip_hidden(mut self, yes: bool) -> Self {
130        self.skip_hidden = yes;
131        self
132    }
133
134    /// Skip files that are ignored
135    #[must_use]
136    pub const fn skip_ignored(mut self, yes: bool) -> Self {
137        self.skip_ignored = yes;
138        self
139    }
140
141    /// Set headers to use when resolving input URLs
142    #[must_use]
143    pub fn headers(mut self, headers: HeaderMap) -> Self {
144        self.headers = headers;
145        self
146    }
147
148    /// Set client to use for checking input URLs
149    #[must_use]
150    pub fn client(mut self, client: Client) -> Self {
151        self.client = client;
152        self
153    }
154
155    /// Use `html5ever` to parse HTML instead of `html5gum`.
156    #[must_use]
157    pub const fn use_html5ever(mut self, yes: bool) -> Self {
158        self.use_html5ever = yes;
159        self
160    }
161
162    /// Skip over links in verbatim sections (like Markdown code blocks)
163    #[must_use]
164    pub const fn include_verbatim(mut self, yes: bool) -> Self {
165        self.include_verbatim = yes;
166        self
167    }
168
169    /// Check WikiLinks in Markdown files
170    #[allow(clippy::doc_markdown)]
171    #[must_use]
172    pub const fn include_wikilinks(mut self, yes: bool) -> Self {
173        self.include_wikilinks = yes;
174        self
175    }
176
177    /// Configure a file [`Preprocessor`]
178    #[must_use]
179    pub fn preprocessor(mut self, preprocessor: Option<Preprocessor>) -> Self {
180        self.preprocessor = preprocessor;
181        self
182    }
183
184    /// Pass a [`BasicAuthExtractor`] which is capable to match found
185    /// URIs to basic auth credentials. These credentials get passed to the
186    /// request in question.
187    #[must_use]
188    #[allow(clippy::missing_const_for_fn)]
189    pub fn basic_auth_extractor(mut self, extractor: BasicAuthExtractor) -> Self {
190        self.basic_auth_extractor = Some(extractor);
191        self
192    }
193
194    /// Configure which paths to exclude
195    #[must_use]
196    pub fn excluded_paths(mut self, excluded_paths: PathExcludes) -> Self {
197        self.excluded_paths = excluded_paths;
198        self
199    }
200
201    /// Convenience method to fetch all unique links from inputs
202    /// with the default extensions.
203    pub fn collect_links(
204        self,
205        inputs: HashSet<Input>,
206    ) -> impl Stream<Item = Result<Request, RequestError>> {
207        self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions())
208    }
209
210    /// Fetch all unique links from inputs
211    /// All relative URLs get prefixed with `base` (if given).
212    /// (This can be a directory or a base URL)
213    ///
214    /// # Errors
215    ///
216    /// Will return `Err` if links cannot be extracted from an input
217    pub fn collect_links_from_file_types(
218        self,
219        inputs: HashSet<Input>,
220        extensions: FileExtensions,
221    ) -> impl Stream<Item = Result<Request, RequestError>> {
222        let skip_missing_inputs = self.skip_missing_inputs;
223        let skip_hidden = self.skip_hidden;
224        let skip_ignored = self.skip_ignored;
225        let global_base = self.base;
226        let excluded_paths = self.excluded_paths;
227
228        let resolver = UrlContentResolver {
229            basic_auth_extractor: self.basic_auth_extractor.clone(),
230            headers: self.headers.clone(),
231            client: self.client,
232        };
233
234        let extractor = Extractor::new(
235            self.use_html5ever,
236            self.include_verbatim,
237            self.include_wikilinks,
238        );
239
240        stream::iter(inputs)
241            .par_then_unordered(None, move |input| {
242                let extensions = extensions.clone();
243                let resolver = resolver.clone();
244                let excluded_paths = excluded_paths.clone();
245                let preprocessor = self.preprocessor.clone();
246
247                async move {
248                    input.get_contents(
249                        skip_missing_inputs,
250                        skip_hidden,
251                        skip_ignored,
252                        extensions,
253                        resolver,
254                        excluded_paths,
255                        preprocessor,
256                    )
257                }
258            })
259            .flatten()
260            .par_then_unordered(None, move |content| {
261                let global_base = global_base.clone();
262                let root_dir = self.root_dir.clone();
263                let basic_auth_extractor = self.basic_auth_extractor.clone();
264                async move {
265                    let content = content?;
266                    let uris: Vec<RawUri> = extractor.extract(&content);
267                    let requests = request::create(
268                        uris,
269                        &content.source,
270                        root_dir.as_deref(),
271                        &global_base,
272                        basic_auth_extractor.as_ref(),
273                    );
274                    Result::Ok(stream::iter(requests))
275                }
276            })
277            .try_flatten()
278    }
279}
280
281#[cfg(test)]
282mod tests {
283    use std::borrow::Cow;
284    use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};
285    use test_utils::{fixtures_path, load_fixture, mail, mock_server, website};
286
287    use http::StatusCode;
288    use reqwest::Url;
289
290    use super::*;
291    use crate::{
292        LycheeResult, Uri,
293        filter::PathExcludes,
294        types::{FileType, Input, InputSource},
295    };
296
297    // Helper function to run the collector on the given inputs
298    async fn collect(
299        inputs: HashSet<Input>,
300        root_dir: Option<PathBuf>,
301        base: BaseInfo,
302    ) -> LycheeResult<HashSet<Uri>> {
303        let responses = Collector::new(root_dir, base)?.collect_links(inputs);
304        Ok(responses.map(|r| r.unwrap().uri).collect().await)
305    }
306
307    /// Helper function for collecting verbatim links
308    ///
309    /// A verbatim link is a link that is not parsed by the HTML parser.
310    /// For example, a link in a code block or a script tag.
311    async fn collect_verbatim(
312        inputs: HashSet<Input>,
313        root_dir: Option<PathBuf>,
314        base: BaseInfo,
315        extensions: FileExtensions,
316    ) -> LycheeResult<HashSet<Uri>> {
317        let responses = Collector::new(root_dir, base)?
318            .include_verbatim(true)
319            .collect_links_from_file_types(inputs, extensions);
320        Ok(responses.map(|r| r.unwrap().uri).collect().await)
321    }
322
    // Link targets planted in the inputs of `test_collect_links`; each
    // constant is named after the kind of input it is placed in.
    // Used directly as a string input
    const TEST_STRING: &str = "http://test-string.com";
    // Served as the response body of the mock server (remote URL input)
    const TEST_URL: &str = "https://test-url.org";
    // Written to the plain file input
    const TEST_FILE: &str = "https://test-file.io";
    // Written to the first file matched by the glob input
    const TEST_GLOB_1: &str = "https://test-glob-1.io";
    // Written to the second file matched by the glob input (a mail address)
    const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
328
329    #[tokio::test]
330    async fn test_file_without_extension_is_plaintext() -> LycheeResult<()> {
331        let temp_dir = tempfile::tempdir().unwrap();
332        // Treat as plaintext file (no extension)
333        let file_path = temp_dir.path().join("README");
334        let _file = File::create(&file_path).unwrap();
335        let input = Input::new(&file_path.as_path().display().to_string(), None, true)?;
336        let contents: Vec<_> = input
337            .get_contents(
338                true,
339                true,
340                true,
341                FileType::default_extensions(),
342                UrlContentResolver::default(),
343                PathExcludes::empty(),
344                None,
345            )
346            .collect::<Vec<_>>()
347            .await;
348
349        assert_eq!(contents.len(), 1);
350        assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext);
351        Ok(())
352    }
353
354    #[tokio::test]
355    async fn test_url_without_extension_is_html() -> LycheeResult<()> {
356        let input = Input::new("https://example.com/", None, true)?;
357        let contents: Vec<_> = input
358            .get_contents(
359                true,
360                true,
361                true,
362                FileType::default_extensions(),
363                UrlContentResolver::default(),
364                PathExcludes::empty(),
365                None,
366            )
367            .collect::<Vec<_>>()
368            .await;
369
370        assert_eq!(contents.len(), 1);
371        assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html);
372        Ok(())
373    }
374
375    #[tokio::test]
376    async fn test_collect_links() -> LycheeResult<()> {
377        let temp_dir = tempfile::tempdir().unwrap();
378        let temp_dir_path = temp_dir.path();
379
380        let file_path = temp_dir_path.join("f");
381        let file_glob_1_path = temp_dir_path.join("glob-1");
382        let file_glob_2_path = temp_dir_path.join("glob-2");
383
384        let mut file = File::create(&file_path).unwrap();
385        let mut file_glob_1 = File::create(file_glob_1_path).unwrap();
386        let mut file_glob_2 = File::create(file_glob_2_path).unwrap();
387
388        writeln!(file, "{TEST_FILE}").unwrap();
389        writeln!(file_glob_1, "{TEST_GLOB_1}").unwrap();
390        writeln!(file_glob_2, "{TEST_GLOB_2_MAIL}").unwrap();
391
392        let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
393
394        let inputs = HashSet::from_iter([
395            Input::from_input_source(InputSource::String(Cow::Borrowed(TEST_STRING))),
396            Input::from_input_source(InputSource::RemoteUrl(Box::new(
397                Url::parse(&mock_server.uri())
398                    .map_err(|e| (mock_server.uri(), e))
399                    .unwrap(),
400            ))),
401            Input::from_input_source(InputSource::FsPath(file_path)),
402            Input::from_input_source(InputSource::FsGlob {
403                pattern: glob::Pattern::new(&temp_dir_path.join("glob*").to_string_lossy())?,
404                ignore_case: true,
405            }),
406        ]);
407
408        let links = collect_verbatim(
409            inputs,
410            None,
411            BaseInfo::none(),
412            FileType::default_extensions(),
413        )
414        .await
415        .ok()
416        .unwrap();
417
418        let expected_links = HashSet::from_iter([
419            website!(TEST_STRING),
420            website!(TEST_URL),
421            website!(TEST_FILE),
422            website!(TEST_GLOB_1),
423            mail!(TEST_GLOB_2_MAIL),
424        ]);
425
426        assert_eq!(links, expected_links);
427
428        Ok(())
429    }
430
431    #[tokio::test]
432    async fn test_collect_markdown_links() {
433        let base = BaseInfo::try_from("https://github.com/hello-rust/lychee/").unwrap();
434        let input = Input {
435            source: InputSource::String(Cow::Borrowed(
436                "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)",
437            )),
438            file_type_hint: Some(FileType::Markdown),
439        };
440        let inputs = HashSet::from_iter([input]);
441
442        let links = collect(inputs, None, base).await.ok().unwrap();
443
444        let expected_links = HashSet::from_iter([
445            website!("https://endler.dev"),
446            website!("https://github.com/hello-rust/lychee/relative_link"),
447        ]);
448
449        assert_eq!(links, expected_links);
450    }
451
452    #[tokio::test]
453    async fn test_collect_html_links() {
454        let base = BaseInfo::try_from("https://github.com/lycheeverse/").unwrap();
455        let input = Input {
456            source: InputSource::String(Cow::Borrowed(
457                r#"<html>
458                <div class="row">
459                    <a href="https://github.com/lycheeverse/lychee/">
460                    <a href="blob/master/README.md">README</a>
461                </div>
462            </html>"#,
463            )),
464            file_type_hint: Some(FileType::Html),
465        };
466        let inputs = HashSet::from_iter([input]);
467
468        let links = collect(inputs, None, base).await.ok().unwrap();
469
470        let expected_links = HashSet::from_iter([
471            website!("https://github.com/lycheeverse/lychee/"),
472            website!("https://github.com/lycheeverse/blob/master/README.md"),
473        ]);
474
475        assert_eq!(links, expected_links);
476    }
477
478    #[tokio::test]
479    async fn test_collect_html_srcset() {
480        let base = BaseInfo::try_from("https://example.com/").unwrap();
481        let input = Input {
482            source: InputSource::String(Cow::Borrowed(
483                r#"
484            <img
485                src="/static/image.png"
486                srcset="
487                /static/image300.png  300w,
488                /static/image600.png  600w,
489                "
490            />
491          "#,
492            )),
493            file_type_hint: Some(FileType::Html),
494        };
495        let inputs = HashSet::from_iter([input]);
496
497        let links = collect(inputs, None, base).await.ok().unwrap();
498
499        let expected_links = HashSet::from_iter([
500            website!("https://example.com/static/image.png"),
501            website!("https://example.com/static/image300.png"),
502            website!("https://example.com/static/image600.png"),
503        ]);
504
505        assert_eq!(links, expected_links);
506    }
507
508    #[tokio::test]
509    async fn test_markdown_internal_url() {
510        let base = BaseInfo::try_from("https://localhost.com/").unwrap();
511
512        let input = Input {
513            source: InputSource::String(Cow::Borrowed(
514                "This is [an internal url](@/internal.md)
515        This is [an internal url](@/internal.markdown)
516        This is [an internal url](@/internal.markdown#example)
517        This is [an internal url](@/internal.md#example)",
518            )),
519            file_type_hint: Some(FileType::Markdown),
520        };
521        let inputs = HashSet::from_iter([input]);
522
523        let links = collect(inputs, None, base).await.ok().unwrap();
524
525        let expected = HashSet::from_iter([
526            website!("https://localhost.com/@/internal.md"),
527            website!("https://localhost.com/@/internal.markdown"),
528            website!("https://localhost.com/@/internal.md#example"),
529            website!("https://localhost.com/@/internal.markdown#example"),
530        ]);
531
532        assert_eq!(links, expected);
533    }
534
535    #[tokio::test]
536    async fn test_extract_html5_not_valid_xml_relative_links() {
537        let base = BaseInfo::try_from("https://example.com").unwrap();
538        let input = load_fixture!("TEST_HTML5.html");
539
540        let input = Input {
541            source: InputSource::String(Cow::Owned(input)),
542            file_type_hint: Some(FileType::Html),
543        };
544        let inputs = HashSet::from_iter([input]);
545
546        let links = collect(inputs, None, base).await.ok().unwrap();
547
548        let expected_links = HashSet::from_iter([
549            // the body links wouldn't be present if the file was parsed strictly as XML
550            website!("https://example.com/body/a"),
551            website!("https://example.com/body/div_empty_a"),
552            website!("https://example.com/css/style_full_url.css"),
553            website!("https://example.com/css/style_relative_url.css"),
554            website!("https://example.com/head/home"),
555            website!("https://example.com/images/icon.png"),
556        ]);
557
558        assert_eq!(links, expected_links);
559    }
560
561    #[tokio::test]
562    async fn test_relative_url_with_base_extracted_from_input() {
563        let contents = r#"<html>
564            <div class="row">
565                <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
566                <a href="/about">About</a>
567            </div>
568        </html>"#;
569        let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));
570
571        let server_uri = Url::parse(&mock_server.uri()).unwrap();
572
573        let input = Input::from_input_source(InputSource::RemoteUrl(Box::new(server_uri.clone())));
574
575        let inputs = HashSet::from_iter([input]);
576
577        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
578
579        let expected_urls = HashSet::from_iter([
580            website!("https://github.com/lycheeverse/lychee/"),
581            website!(&format!("{server_uri}about")),
582        ]);
583
584        assert_eq!(links, expected_urls);
585    }
586
587    #[tokio::test]
588    async fn test_email_with_query_params() {
589        let input = Input::from_input_source(InputSource::String(Cow::Borrowed(
590            "This is a mailto:user@example.com?subject=Hello link",
591        )));
592
593        let inputs = HashSet::from_iter([input]);
594
595        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
596
597        let expected_links = HashSet::from_iter([mail!("user@example.com")]);
598
599        assert_eq!(links, expected_links);
600    }
601
602    #[tokio::test]
603    async fn test_multiple_remote_urls() {
604        let mock_server_1 = mock_server!(
605            StatusCode::OK,
606            set_body_string(r#"<a href="relative.html">Link</a>"#)
607        );
608        let mock_server_2 = mock_server!(
609            StatusCode::OK,
610            set_body_string(r#"<a href="relative.html">Link</a>"#)
611        );
612
613        let inputs = HashSet::from_iter([
614            Input {
615                source: InputSource::RemoteUrl(Box::new(
616                    Url::parse(&format!(
617                        "{}/foo/index.html",
618                        mock_server_1.uri().trim_end_matches('/')
619                    ))
620                    .unwrap(),
621                )),
622                file_type_hint: Some(FileType::Html),
623            },
624            Input {
625                source: InputSource::RemoteUrl(Box::new(
626                    Url::parse(&format!(
627                        "{}/bar/index.html",
628                        mock_server_2.uri().trim_end_matches('/')
629                    ))
630                    .unwrap(),
631                )),
632                file_type_hint: Some(FileType::Html),
633            },
634        ]);
635
636        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
637
638        let expected_links = HashSet::from_iter([
639            website!(&format!(
640                "{}/foo/relative.html",
641                mock_server_1.uri().trim_end_matches('/')
642            )),
643            website!(&format!(
644                "{}/bar/relative.html",
645                mock_server_2.uri().trim_end_matches('/')
646            )),
647        ]);
648
649        assert_eq!(links, expected_links);
650    }
651
652    #[tokio::test]
653    async fn test_file_path_with_base() {
654        let base = BaseInfo::try_from("/path/to/root").unwrap();
655
656        let input = Input {
657            source: InputSource::String(Cow::Borrowed(
658                r#"
659                <a href="index.html">Index</a>
660                <a href="about.html">About</a>
661                <a href="../up.html">About</a>
662                <a href="/another.html">Another</a>
663            "#,
664            )),
665            file_type_hint: Some(FileType::Html),
666        };
667
668        let inputs = HashSet::from_iter([input]);
669
670        let links = collect(inputs, None, base).await.ok().unwrap();
671        let links_str: HashSet<_> = links.iter().map(|x| x.url.as_str()).collect();
672
673        let expected_links: HashSet<_> = HashSet::from_iter([
674            ("file:///path/to/root/index.html"),
675            ("file:///path/to/root/about.html"),
676            ("file:///path/to/up.html"),
677            ("file:///path/to/root/another.html"),
678        ]);
679
680        assert_eq!(links_str, expected_links);
681    }
682}