1use crate::ErrorKind;
2use crate::Preprocessor;
3use crate::filter::PathExcludes;
4
5use crate::types::resolver::UrlContentResolver;
6use crate::{
7 BaseInfo, Input, LycheeResult, Request, RequestError, basic_auth::BasicAuthExtractor,
8 extract::Extractor, types::FileExtensions, types::uri::raw::RawUri, utils::request,
9};
10use futures::TryStreamExt;
11use futures::{
12 StreamExt,
13 stream::{self, Stream},
14};
15use http::HeaderMap;
16use par_stream::ParStreamExt;
17use reqwest::Client;
18use std::collections::HashSet;
19use std::path::{Path, PathBuf};
20
21#[allow(clippy::struct_excessive_bools)]
24#[derive(Debug, Clone)]
25pub struct Collector {
26 basic_auth_extractor: Option<BasicAuthExtractor>,
27 skip_missing_inputs: bool,
28 skip_ignored: bool,
29 skip_hidden: bool,
30 include_verbatim: bool,
31 include_wikilinks: bool,
32 use_html5ever: bool,
33 root_dir: Option<PathBuf>,
34 base: BaseInfo,
35 excluded_paths: PathExcludes,
36 headers: HeaderMap,
37 client: Client,
38 preprocessor: Option<Preprocessor>,
39}
40
41impl Default for Collector {
42 fn default() -> Self {
49 Collector {
50 basic_auth_extractor: None,
51 skip_missing_inputs: false,
52 include_verbatim: false,
53 include_wikilinks: false,
54 use_html5ever: false,
55 skip_hidden: true,
56 skip_ignored: true,
57 root_dir: None,
58 base: BaseInfo::none(),
59 headers: HeaderMap::new(),
60 client: Client::new(),
61 excluded_paths: PathExcludes::empty(),
62 preprocessor: None,
63 }
64 }
65}
66
67impl Collector {
68 pub fn new(root_dir: Option<PathBuf>, base: BaseInfo) -> LycheeResult<Self> {
75 let (root_dir, base) = match (root_dir, base) {
78 (Some(root_dir), BaseInfo::Full(url, path))
79 if url.scheme() == "file" && path.is_empty() =>
80 {
81 let root_dir = root_dir
82 .strip_prefix("/")
83 .map(Path::to_path_buf)
84 .unwrap_or(root_dir)
85 .join("");
86
87 match url.to_file_path() {
88 Ok(base_path) => (Some(base_path.join(root_dir)), BaseInfo::full(url, path)),
89 Err(()) => (Some(root_dir), BaseInfo::full(url, path)),
90 }
91 }
92 (Some(root_dir), base) => {
93 let root_dir_exists = root_dir.read_dir().map(|_| ());
94 let root_dir = root_dir_exists
95 .and_then(|()| std::path::absolute(&root_dir))
96 .map_err(|e| ErrorKind::InvalidRootDir(root_dir, e))?;
97 (Some(root_dir), base)
98 }
99 (None, base) => (None, base),
100 };
101 Ok(Collector {
102 basic_auth_extractor: None,
103 skip_missing_inputs: false,
104 include_verbatim: false,
105 include_wikilinks: false,
106 use_html5ever: false,
107 skip_hidden: true,
108 skip_ignored: true,
109 preprocessor: None,
110 headers: HeaderMap::new(),
111 client: Client::builder()
112 .build()
113 .map_err(ErrorKind::BuildRequestClient)?,
114 excluded_paths: PathExcludes::empty(),
115 root_dir,
116 base,
117 })
118 }
119
120 #[must_use]
122 pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
123 self.skip_missing_inputs = yes;
124 self
125 }
126
127 #[must_use]
129 pub const fn skip_hidden(mut self, yes: bool) -> Self {
130 self.skip_hidden = yes;
131 self
132 }
133
134 #[must_use]
136 pub const fn skip_ignored(mut self, yes: bool) -> Self {
137 self.skip_ignored = yes;
138 self
139 }
140
141 #[must_use]
143 pub fn headers(mut self, headers: HeaderMap) -> Self {
144 self.headers = headers;
145 self
146 }
147
148 #[must_use]
150 pub fn client(mut self, client: Client) -> Self {
151 self.client = client;
152 self
153 }
154
155 #[must_use]
157 pub const fn use_html5ever(mut self, yes: bool) -> Self {
158 self.use_html5ever = yes;
159 self
160 }
161
162 #[must_use]
164 pub const fn include_verbatim(mut self, yes: bool) -> Self {
165 self.include_verbatim = yes;
166 self
167 }
168
169 #[allow(clippy::doc_markdown)]
171 #[must_use]
172 pub const fn include_wikilinks(mut self, yes: bool) -> Self {
173 self.include_wikilinks = yes;
174 self
175 }
176
177 #[must_use]
179 pub fn preprocessor(mut self, preprocessor: Option<Preprocessor>) -> Self {
180 self.preprocessor = preprocessor;
181 self
182 }
183
184 #[must_use]
188 #[allow(clippy::missing_const_for_fn)]
189 pub fn basic_auth_extractor(mut self, extractor: BasicAuthExtractor) -> Self {
190 self.basic_auth_extractor = Some(extractor);
191 self
192 }
193
194 #[must_use]
196 pub fn excluded_paths(mut self, excluded_paths: PathExcludes) -> Self {
197 self.excluded_paths = excluded_paths;
198 self
199 }
200
201 pub fn collect_links(
204 self,
205 inputs: HashSet<Input>,
206 ) -> impl Stream<Item = Result<Request, RequestError>> {
207 self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions())
208 }
209
210 pub fn collect_links_from_file_types(
218 self,
219 inputs: HashSet<Input>,
220 extensions: FileExtensions,
221 ) -> impl Stream<Item = Result<Request, RequestError>> {
222 let skip_missing_inputs = self.skip_missing_inputs;
223 let skip_hidden = self.skip_hidden;
224 let skip_ignored = self.skip_ignored;
225 let global_base = self.base;
226 let excluded_paths = self.excluded_paths;
227
228 let resolver = UrlContentResolver {
229 basic_auth_extractor: self.basic_auth_extractor.clone(),
230 headers: self.headers.clone(),
231 client: self.client,
232 };
233
234 let extractor = Extractor::new(
235 self.use_html5ever,
236 self.include_verbatim,
237 self.include_wikilinks,
238 );
239
240 stream::iter(inputs)
241 .par_then_unordered(None, move |input| {
242 let extensions = extensions.clone();
243 let resolver = resolver.clone();
244 let excluded_paths = excluded_paths.clone();
245 let preprocessor = self.preprocessor.clone();
246
247 async move {
248 input.get_contents(
249 skip_missing_inputs,
250 skip_hidden,
251 skip_ignored,
252 extensions,
253 resolver,
254 excluded_paths,
255 preprocessor,
256 )
257 }
258 })
259 .flatten()
260 .par_then_unordered(None, move |content| {
261 let global_base = global_base.clone();
262 let root_dir = self.root_dir.clone();
263 let basic_auth_extractor = self.basic_auth_extractor.clone();
264 async move {
265 let content = content?;
266 let uris: Vec<RawUri> = extractor.extract(&content);
267 let requests = request::create(
268 uris,
269 &content.source,
270 root_dir.as_deref(),
271 &global_base,
272 basic_auth_extractor.as_ref(),
273 );
274 Result::Ok(stream::iter(requests))
275 }
276 })
277 .try_flatten()
278 }
279}
280
281#[cfg(test)]
282mod tests {
283 use std::borrow::Cow;
284 use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};
285 use test_utils::{fixtures_path, load_fixture, mail, mock_server, website};
286
287 use http::StatusCode;
288 use reqwest::Url;
289
290 use super::*;
291 use crate::{
292 LycheeResult, Uri,
293 filter::PathExcludes,
294 types::{FileType, Input, InputSource},
295 };
296
297 async fn collect(
299 inputs: HashSet<Input>,
300 root_dir: Option<PathBuf>,
301 base: BaseInfo,
302 ) -> LycheeResult<HashSet<Uri>> {
303 let responses = Collector::new(root_dir, base)?.collect_links(inputs);
304 Ok(responses.map(|r| r.unwrap().uri).collect().await)
305 }
306
307 async fn collect_verbatim(
312 inputs: HashSet<Input>,
313 root_dir: Option<PathBuf>,
314 base: BaseInfo,
315 extensions: FileExtensions,
316 ) -> LycheeResult<HashSet<Uri>> {
317 let responses = Collector::new(root_dir, base)?
318 .include_verbatim(true)
319 .collect_links_from_file_types(inputs, extensions);
320 Ok(responses.map(|r| r.unwrap().uri).collect().await)
321 }
322
323 const TEST_STRING: &str = "http://test-string.com";
324 const TEST_URL: &str = "https://test-url.org";
325 const TEST_FILE: &str = "https://test-file.io";
326 const TEST_GLOB_1: &str = "https://test-glob-1.io";
327 const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
328
329 #[tokio::test]
330 async fn test_file_without_extension_is_plaintext() -> LycheeResult<()> {
331 let temp_dir = tempfile::tempdir().unwrap();
332 let file_path = temp_dir.path().join("README");
334 let _file = File::create(&file_path).unwrap();
335 let input = Input::new(&file_path.as_path().display().to_string(), None, true)?;
336 let contents: Vec<_> = input
337 .get_contents(
338 true,
339 true,
340 true,
341 FileType::default_extensions(),
342 UrlContentResolver::default(),
343 PathExcludes::empty(),
344 None,
345 )
346 .collect::<Vec<_>>()
347 .await;
348
349 assert_eq!(contents.len(), 1);
350 assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext);
351 Ok(())
352 }
353
354 #[tokio::test]
355 async fn test_url_without_extension_is_html() -> LycheeResult<()> {
356 let input = Input::new("https://example.com/", None, true)?;
357 let contents: Vec<_> = input
358 .get_contents(
359 true,
360 true,
361 true,
362 FileType::default_extensions(),
363 UrlContentResolver::default(),
364 PathExcludes::empty(),
365 None,
366 )
367 .collect::<Vec<_>>()
368 .await;
369
370 assert_eq!(contents.len(), 1);
371 assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html);
372 Ok(())
373 }
374
375 #[tokio::test]
376 async fn test_collect_links() -> LycheeResult<()> {
377 let temp_dir = tempfile::tempdir().unwrap();
378 let temp_dir_path = temp_dir.path();
379
380 let file_path = temp_dir_path.join("f");
381 let file_glob_1_path = temp_dir_path.join("glob-1");
382 let file_glob_2_path = temp_dir_path.join("glob-2");
383
384 let mut file = File::create(&file_path).unwrap();
385 let mut file_glob_1 = File::create(file_glob_1_path).unwrap();
386 let mut file_glob_2 = File::create(file_glob_2_path).unwrap();
387
388 writeln!(file, "{TEST_FILE}").unwrap();
389 writeln!(file_glob_1, "{TEST_GLOB_1}").unwrap();
390 writeln!(file_glob_2, "{TEST_GLOB_2_MAIL}").unwrap();
391
392 let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
393
394 let inputs = HashSet::from_iter([
395 Input::from_input_source(InputSource::String(Cow::Borrowed(TEST_STRING))),
396 Input::from_input_source(InputSource::RemoteUrl(Box::new(
397 Url::parse(&mock_server.uri())
398 .map_err(|e| (mock_server.uri(), e))
399 .unwrap(),
400 ))),
401 Input::from_input_source(InputSource::FsPath(file_path)),
402 Input::from_input_source(InputSource::FsGlob {
403 pattern: glob::Pattern::new(&temp_dir_path.join("glob*").to_string_lossy())?,
404 ignore_case: true,
405 }),
406 ]);
407
408 let links = collect_verbatim(
409 inputs,
410 None,
411 BaseInfo::none(),
412 FileType::default_extensions(),
413 )
414 .await
415 .ok()
416 .unwrap();
417
418 let expected_links = HashSet::from_iter([
419 website!(TEST_STRING),
420 website!(TEST_URL),
421 website!(TEST_FILE),
422 website!(TEST_GLOB_1),
423 mail!(TEST_GLOB_2_MAIL),
424 ]);
425
426 assert_eq!(links, expected_links);
427
428 Ok(())
429 }
430
431 #[tokio::test]
432 async fn test_collect_markdown_links() {
433 let base = BaseInfo::try_from("https://github.com/hello-rust/lychee/").unwrap();
434 let input = Input {
435 source: InputSource::String(Cow::Borrowed(
436 "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)",
437 )),
438 file_type_hint: Some(FileType::Markdown),
439 };
440 let inputs = HashSet::from_iter([input]);
441
442 let links = collect(inputs, None, base).await.ok().unwrap();
443
444 let expected_links = HashSet::from_iter([
445 website!("https://endler.dev"),
446 website!("https://github.com/hello-rust/lychee/relative_link"),
447 ]);
448
449 assert_eq!(links, expected_links);
450 }
451
452 #[tokio::test]
453 async fn test_collect_html_links() {
454 let base = BaseInfo::try_from("https://github.com/lycheeverse/").unwrap();
455 let input = Input {
456 source: InputSource::String(Cow::Borrowed(
457 r#"<html>
458 <div class="row">
459 <a href="https://github.com/lycheeverse/lychee/">
460 <a href="blob/master/README.md">README</a>
461 </div>
462 </html>"#,
463 )),
464 file_type_hint: Some(FileType::Html),
465 };
466 let inputs = HashSet::from_iter([input]);
467
468 let links = collect(inputs, None, base).await.ok().unwrap();
469
470 let expected_links = HashSet::from_iter([
471 website!("https://github.com/lycheeverse/lychee/"),
472 website!("https://github.com/lycheeverse/blob/master/README.md"),
473 ]);
474
475 assert_eq!(links, expected_links);
476 }
477
478 #[tokio::test]
479 async fn test_collect_html_srcset() {
480 let base = BaseInfo::try_from("https://example.com/").unwrap();
481 let input = Input {
482 source: InputSource::String(Cow::Borrowed(
483 r#"
484 <img
485 src="/static/image.png"
486 srcset="
487 /static/image300.png 300w,
488 /static/image600.png 600w,
489 "
490 />
491 "#,
492 )),
493 file_type_hint: Some(FileType::Html),
494 };
495 let inputs = HashSet::from_iter([input]);
496
497 let links = collect(inputs, None, base).await.ok().unwrap();
498
499 let expected_links = HashSet::from_iter([
500 website!("https://example.com/static/image.png"),
501 website!("https://example.com/static/image300.png"),
502 website!("https://example.com/static/image600.png"),
503 ]);
504
505 assert_eq!(links, expected_links);
506 }
507
508 #[tokio::test]
509 async fn test_markdown_internal_url() {
510 let base = BaseInfo::try_from("https://localhost.com/").unwrap();
511
512 let input = Input {
513 source: InputSource::String(Cow::Borrowed(
514 "This is [an internal url](@/internal.md)
515 This is [an internal url](@/internal.markdown)
516 This is [an internal url](@/internal.markdown#example)
517 This is [an internal url](@/internal.md#example)",
518 )),
519 file_type_hint: Some(FileType::Markdown),
520 };
521 let inputs = HashSet::from_iter([input]);
522
523 let links = collect(inputs, None, base).await.ok().unwrap();
524
525 let expected = HashSet::from_iter([
526 website!("https://localhost.com/@/internal.md"),
527 website!("https://localhost.com/@/internal.markdown"),
528 website!("https://localhost.com/@/internal.md#example"),
529 website!("https://localhost.com/@/internal.markdown#example"),
530 ]);
531
532 assert_eq!(links, expected);
533 }
534
535 #[tokio::test]
536 async fn test_extract_html5_not_valid_xml_relative_links() {
537 let base = BaseInfo::try_from("https://example.com").unwrap();
538 let input = load_fixture!("TEST_HTML5.html");
539
540 let input = Input {
541 source: InputSource::String(Cow::Owned(input)),
542 file_type_hint: Some(FileType::Html),
543 };
544 let inputs = HashSet::from_iter([input]);
545
546 let links = collect(inputs, None, base).await.ok().unwrap();
547
548 let expected_links = HashSet::from_iter([
549 website!("https://example.com/body/a"),
551 website!("https://example.com/body/div_empty_a"),
552 website!("https://example.com/css/style_full_url.css"),
553 website!("https://example.com/css/style_relative_url.css"),
554 website!("https://example.com/head/home"),
555 website!("https://example.com/images/icon.png"),
556 ]);
557
558 assert_eq!(links, expected_links);
559 }
560
561 #[tokio::test]
562 async fn test_relative_url_with_base_extracted_from_input() {
563 let contents = r#"<html>
564 <div class="row">
565 <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
566 <a href="/about">About</a>
567 </div>
568 </html>"#;
569 let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));
570
571 let server_uri = Url::parse(&mock_server.uri()).unwrap();
572
573 let input = Input::from_input_source(InputSource::RemoteUrl(Box::new(server_uri.clone())));
574
575 let inputs = HashSet::from_iter([input]);
576
577 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
578
579 let expected_urls = HashSet::from_iter([
580 website!("https://github.com/lycheeverse/lychee/"),
581 website!(&format!("{server_uri}about")),
582 ]);
583
584 assert_eq!(links, expected_urls);
585 }
586
587 #[tokio::test]
588 async fn test_email_with_query_params() {
589 let input = Input::from_input_source(InputSource::String(Cow::Borrowed(
590 "This is a mailto:user@example.com?subject=Hello link",
591 )));
592
593 let inputs = HashSet::from_iter([input]);
594
595 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
596
597 let expected_links = HashSet::from_iter([mail!("user@example.com")]);
598
599 assert_eq!(links, expected_links);
600 }
601
602 #[tokio::test]
603 async fn test_multiple_remote_urls() {
604 let mock_server_1 = mock_server!(
605 StatusCode::OK,
606 set_body_string(r#"<a href="relative.html">Link</a>"#)
607 );
608 let mock_server_2 = mock_server!(
609 StatusCode::OK,
610 set_body_string(r#"<a href="relative.html">Link</a>"#)
611 );
612
613 let inputs = HashSet::from_iter([
614 Input {
615 source: InputSource::RemoteUrl(Box::new(
616 Url::parse(&format!(
617 "{}/foo/index.html",
618 mock_server_1.uri().trim_end_matches('/')
619 ))
620 .unwrap(),
621 )),
622 file_type_hint: Some(FileType::Html),
623 },
624 Input {
625 source: InputSource::RemoteUrl(Box::new(
626 Url::parse(&format!(
627 "{}/bar/index.html",
628 mock_server_2.uri().trim_end_matches('/')
629 ))
630 .unwrap(),
631 )),
632 file_type_hint: Some(FileType::Html),
633 },
634 ]);
635
636 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
637
638 let expected_links = HashSet::from_iter([
639 website!(&format!(
640 "{}/foo/relative.html",
641 mock_server_1.uri().trim_end_matches('/')
642 )),
643 website!(&format!(
644 "{}/bar/relative.html",
645 mock_server_2.uri().trim_end_matches('/')
646 )),
647 ]);
648
649 assert_eq!(links, expected_links);
650 }
651
652 #[tokio::test]
653 async fn test_file_path_with_base() {
654 let base = BaseInfo::try_from("/path/to/root").unwrap();
655
656 let input = Input {
657 source: InputSource::String(Cow::Borrowed(
658 r#"
659 <a href="index.html">Index</a>
660 <a href="about.html">About</a>
661 <a href="../up.html">About</a>
662 <a href="/another.html">Another</a>
663 "#,
664 )),
665 file_type_hint: Some(FileType::Html),
666 };
667
668 let inputs = HashSet::from_iter([input]);
669
670 let links = collect(inputs, None, base).await.ok().unwrap();
671 let links_str: HashSet<_> = links.iter().map(|x| x.url.as_str()).collect();
672
673 let expected_links: HashSet<_> = HashSet::from_iter([
674 ("file:///path/to/root/index.html"),
675 ("file:///path/to/root/about.html"),
676 ("file:///path/to/up.html"),
677 ("file:///path/to/root/another.html"),
678 ]);
679
680 assert_eq!(links_str, expected_links);
681 }
682}