lychee_lib/checker/
file.rs

1use http::StatusCode;
2use log::warn;
3use std::borrow::Cow;
4use std::path::{Path, PathBuf};
5
6use crate::checker::wikilink::resolver::WikilinkResolver;
7use crate::{
8    BaseInfo, ErrorKind, Result, Status, Uri,
9    utils::fragment_checker::{FragmentChecker, FragmentInput},
10};
11
/// A utility for checking the existence and validity of file-based URIs.
///
/// `FileChecker` resolves a file URI to a local path and validates that the
/// path exists. It supports fallback extensions for paths which do not exist
/// as given, index file resolution for directory links, optional fragment
/// checking, and optional wikilink resolution.
#[derive(Debug, Clone)]
pub(crate) struct FileChecker {
    /// List of file extensions to try, in order, if the original path doesn't exist.
    fallback_extensions: Vec<String>,
    /// If specified, resolves to one of the given index files if the original path
    /// is a directory.
    ///
    /// If non-`None`, a directory must contain at least one of the file names
    /// in order to be considered a valid link target. Index file names are
    /// required to match regular files, aside from the special `.` name which
    /// will match the directory itself.
    ///
    /// If `None`, index file checking is disabled and directory links are valid
    /// as long as the directory exists on disk.
    index_files: Option<Vec<String>>,
    /// Whether to check for the existence of fragments (e.g., `#section-id`)
    /// in the resolved file. When `false`, fragments are ignored entirely.
    include_fragments: bool,
    /// Utility which performs the actual fragment lookup within a file.
    fragment_checker: FragmentChecker,
    /// Utility for resolving Wikilinks. `None` if wikilink checking is disabled.
    wikilink_resolver: Option<WikilinkResolver>,
}
39
40impl FileChecker {
41    /// Creates a new `FileChecker` with the given configuration.
42    ///
43    /// # Arguments
44    ///
45    /// * `base` - Optional base path or URL for resolving wikilinks.
46    /// * `fallback_extensions` - List of extensions to try if the original file is not found.
47    /// * `index_files` - Optional list of index file names to search for if the path is a directory.
48    /// * `include_fragments` - Whether to check for fragment existence in HTML files.
49    /// * `include_wikilinks` - Whether to check the existence of Wikilinks found in Markdown files .
50    ///
51    /// # Errors
52    ///
53    /// Fails if an invalid `base` is provided when including wikilinks.
54    pub(crate) fn new(
55        base: &BaseInfo,
56        fallback_extensions: Vec<String>,
57        index_files: Option<Vec<String>>,
58        include_fragments: bool,
59        include_wikilinks: bool,
60    ) -> Result<Self> {
61        let wikilink_resolver = if include_wikilinks {
62            Some(WikilinkResolver::new(base, fallback_extensions.clone())?)
63        } else {
64            None
65        };
66
67        Ok(Self {
68            fallback_extensions,
69            index_files,
70            include_fragments,
71            fragment_checker: FragmentChecker::new(),
72            wikilink_resolver,
73        })
74    }
75
76    /// Checks the given file URI for existence and validity.
77    ///
78    /// This method resolves the URI to a file path, checks if the file exists,
79    /// and optionally checks for the existence of fragments in HTML files.
80    ///
81    /// # Arguments
82    ///
83    /// * `uri` - The URI to check.
84    ///
85    /// # Returns
86    ///
87    /// Returns a `Status` indicating the result of the check.
88    pub(crate) async fn check(&self, uri: &Uri) -> Status {
89        let Ok(path) = uri.url.to_file_path() else {
90            return ErrorKind::InvalidFilePath(uri.clone()).into();
91        };
92
93        let path = self.resolve_local_path(&path, uri);
94        match path {
95            Ok(path) => self.check_file(path.as_ref(), uri).await,
96            Err(err) => err.into(),
97        }
98    }
99
100    /// Resolves the given local path by applying logic which is specific to local file
101    /// checking - currently, this includes fallback extensions and index files.
102    ///
103    /// # Arguments
104    ///
105    /// * `path` - The path to check. Need not exist.
106    /// * `uri` - The original URI, used for error reporting.
107    ///
108    /// # Returns
109    ///
110    /// Returns `Ok` with the resolved path if it is valid, otherwise returns
111    /// `Err` with an appropriate error. The returned path, if any, is guaranteed
112    /// to exist and may be a file or a directory.
113    fn resolve_local_path<'a>(&self, path: &'a Path, uri: &Uri) -> Result<Cow<'a, Path>> {
114        let path = match path.metadata() {
115            // for non-existing paths, attempt fallback extensions
116            // if fallback extensions don't help, try wikilinks
117            Err(e) if e.kind() == std::io::ErrorKind::NotFound => self
118                .apply_fallback_extensions(path, uri)
119                .or_else(|_| {
120                    if let Some(resolver) = &self.wikilink_resolver {
121                        resolver.resolve(path, uri)
122                    } else {
123                        Err(ErrorKind::InvalidFilePath(uri.clone()))
124                    }
125                })
126                .map(Cow::Owned),
127
128            // other IO errors are unexpected and should fail the check
129            Err(e) => Err(ErrorKind::ReadFileInput(e, path.to_path_buf())),
130
131            // existing directories are resolved via index files
132            Ok(meta) if meta.is_dir() => self.apply_index_files(path).map(Cow::Owned),
133
134            // otherwise, path is an existing file - just return the path
135            Ok(_) => Ok(Cow::Borrowed(path)),
136        };
137
138        // if initial resolution results in a directory, also attempts to apply
139        // fallback extensions. probably, this always makes sense because
140        // directories are treated as having no fragments, so a real file with
141        // a fallback extension (if it exists) will potentially contain more
142        // fragments and thus be "more useful".
143        //
144        // (currently, this case is only reachable if `.` is in the index_files list.)
145        match path {
146            Ok(dir_path) if dir_path.is_dir() => self
147                .apply_fallback_extensions(&dir_path, uri)
148                .map(Cow::Owned)
149                .or(Ok(dir_path)),
150            Ok(path) => Ok(path),
151            Err(err) => Err(err),
152        }
153    }
154
155    /// Resolves a path to a file, applying fallback extensions if necessary.
156    ///
157    /// This function will try to find a file, first by attempting the given path
158    /// itself, then by attempting the path with each extension from
159    /// [`FileChecker::fallback_extensions`]. The first existing file (not directory),
160    /// if any, will be returned.
161    ///
162    /// # Arguments
163    ///
164    /// * `path` - The path to resolve.
165    /// * `uri` - The original URI, used for error reporting.
166    ///
167    /// # Returns
168    ///
169    /// Returns `Ok(PathBuf)` with the resolved file path, or `Err` if no valid file is found.
170    /// If `Ok` is returned, the contained `PathBuf` is guaranteed to exist and be a file.
171    fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result<PathBuf> {
172        // If it's already a file, use it directly
173        if path.is_file() {
174            return Ok(path.to_path_buf());
175        }
176
177        // Try fallback extensions
178        let mut path_buf = path.to_path_buf();
179        for ext in &self.fallback_extensions {
180            path_buf.set_extension(ext);
181            if path_buf.is_file() {
182                return Ok(path_buf);
183            }
184        }
185
186        Err(ErrorKind::InvalidFilePath(uri.clone()))
187    }
188
189    /// Tries to find an index file in the given directory, returning the first match.
190    /// The index file behavior is specified by [`FileChecker::index_files`].
191    ///
192    /// If this is non-`None`, index files must exist and resolved index files are
193    /// required to be files, aside from the special name `.` - this will match the
194    /// directory itself.
195    ///
196    /// If `None`, index file resolution is disabled and this function simply
197    /// returns the given path.
198    ///
199    /// # Arguments
200    ///
201    /// * `dir_path` - The directory within which to search for index files.
202    ///   This is assumed to be an existing directory.
203    ///
204    /// # Returns
205    ///
206    /// Returns `Ok(PathBuf)` pointing to the first existing index file, or
207    /// `Err` if no index file is found. If `Ok` is returned, the contained `PathBuf`
208    /// is guaranteed to exist. In most cases, the returned path will be a file path.
209    ///
210    /// If index files are disabled, simply returns `Ok(dir_path)`.
211    fn apply_index_files(&self, dir_path: &Path) -> Result<PathBuf> {
212        // this implements the "disabled" case by treating a directory as its
213        // own index file.
214        let index_names_to_try = match &self.index_files {
215            Some(names) => &names[..],
216            None => &[".".to_owned()],
217        };
218
219        let invalid_index_error = || {
220            // Drop empty index file names. These will never be accepted as valid
221            // index files, and doing this makes cleaner error reporting.
222            let mut names = index_names_to_try.to_vec();
223            names.retain(|x| !x.is_empty());
224
225            ErrorKind::InvalidIndexFile(names)
226        };
227
228        index_names_to_try
229            .iter()
230            .find_map(|filename| {
231                // for some special index file names, we accept directories as well
232                // as files.
233                let exists = match filename.as_str() {
234                    "." => Path::exists,
235                    _ => Path::is_file,
236                };
237
238                let path = dir_path.join(filename);
239                exists(&path).then_some(path)
240            })
241            .ok_or_else(invalid_index_error)
242    }
243
244    /// Checks a resolved file, optionally verifying fragments for HTML files.
245    ///
246    /// # Arguments
247    ///
248    /// * `path` - The resolved path to check.
249    /// * `uri` - The original URI, used for error reporting.
250    ///
251    /// # Returns
252    ///
253    /// Returns a `Status` indicating the result of the check.
254    async fn check_file(&self, path: &Path, uri: &Uri) -> Status {
255        if self.include_fragments {
256            self.check_fragment(path, uri).await
257        } else {
258            Status::Ok(StatusCode::OK)
259        }
260    }
261
262    /// Checks for the existence of a fragment in a path.
263    ///
264    /// The given path may be a file or a directory. A directory
265    /// is treated as if it was an empty file with no fragments.
266    ///
267    /// # Arguments
268    ///
269    /// * `path` - The path to the file or directory. Assumed to exist.
270    /// * `uri` - The original URI, containing the fragment to check.
271    ///
272    /// # Returns
273    ///
274    /// Returns a `Status` indicating the result of the fragment check.
275    async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
276        // for absent or trivial fragments, always return success.
277        if uri.url.fragment().is_none_or(str::is_empty) {
278            return Status::Ok(StatusCode::OK);
279        }
280
281        // directories are treated as if they were a file with no fragments.
282        // reaching here means we have a non-trivial fragment on a directory,
283        // so return error.
284        if path.is_dir() {
285            return ErrorKind::InvalidFragment(uri.clone()).into();
286        }
287
288        match FragmentInput::from_path(path).await {
289            Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
290                Ok(true) => Status::Ok(StatusCode::OK),
291                Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
292                Err(err) => {
293                    warn!("Skipping fragment check for {uri} due to the following error: {err}");
294                    Status::Ok(StatusCode::OK)
295                }
296            },
297            Err(err) => {
298                warn!("Skipping fragment check for {uri} due to the following error: {err}");
299                Status::Ok(StatusCode::OK)
300            }
301        }
302    }
303}
304
305#[cfg(test)]
306mod tests {
307    use super::FileChecker;
308    use crate::{
309        BaseInfo,
310        ErrorKind::{InvalidFilePath, InvalidFragment, InvalidIndexFile},
311        Status, Uri,
312    };
313    use test_utils::{fixture_uri, fixtures_path};
314
315    /// Calls [`FileChecker::check`] on the given [`FileChecker`] with given URL
316    /// path (relative to the fixtures directory).
317    ///
318    /// The result of checking the link is matched against the given pattern.
319    macro_rules! assert_filecheck {
320        ($checker:expr, $path:expr, $pattern:pat) => {
321            let uri = Uri::from(fixture_uri!($path));
322            let result = $checker.check(&uri).await;
323            assert!(
324                matches!(result, $pattern),
325                "assertion failed: {} should be {} but was '{:?}'",
326                &uri,
327                stringify!($pattern),
328                &result
329            );
330        };
331    }
332
333    /// Calls [`FileChecker::resolve_local_path`] on the given [`FileChecker`]
334    /// with given URL path (relative to the fixtures directory).
335    ///
336    /// The result of resolving the link is matched against the given pattern.
337    /// The pattern should match values of type `Result<&str, ErrorKind>`.
338    macro_rules! assert_resolves {
339        ($checker:expr, $subpath:expr, $expected:pat) => {
340            let uri = Uri::from(fixture_uri!($subpath));
341            let path = uri
342                .url
343                .to_file_path()
344                .expect("fixture uri should be a valid path");
345            let result = $checker.resolve_local_path(&path, &uri);
346            let result_subpath = result
347                .as_deref()
348                .map(|p| p.strip_prefix(fixtures_path!()).unwrap())
349                .map(|p| p.to_string_lossy());
350            assert!(
351                matches!(result_subpath.as_deref(), $expected),
352                "{:?} resolved to {:?} but should be {}",
353                $subpath,
354                result_subpath,
355                stringify!($expected)
356            );
357        };
358    }
359
360    #[tokio::test]
361    async fn test_default() {
362        // default behaviour accepts dir links as long as the directory exists.
363        let checker = FileChecker::new(&BaseInfo::none(), vec![], None, true, false).unwrap();
364
365        assert_filecheck!(&checker, "filechecker/index_dir", Status::Ok(_));
366
367        // empty dir is accepted with '.' in index_files, but it contains no fragments.
368        assert_resolves!(
369            &checker,
370            "filechecker/empty_dir",
371            Ok("filechecker/empty_dir")
372        );
373        assert_filecheck!(&checker, "filechecker/empty_dir", Status::Ok(_));
374        assert_filecheck!(&checker, "filechecker/empty_dir#", Status::Ok(_));
375        assert_filecheck!(
376            &checker,
377            "filechecker/empty_dir#fragment",
378            Status::Error(InvalidFragment(_))
379        );
380
381        // even though index.html is present, it is not used because index_files is only
382        // '.', so no fragments are found.
383        assert_resolves!(
384            &checker,
385            "filechecker/index_dir",
386            Ok("filechecker/index_dir")
387        );
388        assert_filecheck!(
389            &checker,
390            "filechecker/index_dir#fragment",
391            Status::Error(InvalidFragment(_))
392        );
393        assert_filecheck!(
394            &checker,
395            "filechecker/index_dir#non-existingfragment",
396            Status::Error(InvalidFragment(_))
397        );
398
399        assert_filecheck!(&checker, "filechecker/same_name", Status::Ok(_));
400
401        // because no fallback extensions are configured
402        assert_resolves!(
403            &checker,
404            "filechecker/same_name",
405            Ok("filechecker/same_name")
406        );
407        assert_filecheck!(
408            &checker,
409            "filechecker/same_name#a",
410            Status::Error(InvalidFragment(_))
411        );
412    }
413
414    #[tokio::test]
415    async fn test_index_files() {
416        let checker = FileChecker::new(
417            &BaseInfo::none(),
418            vec![],
419            Some(vec!["index.html".to_owned(), "index.md".to_owned()]),
420            true,
421            false,
422        )
423        .unwrap();
424
425        assert_resolves!(
426            &checker,
427            "filechecker/index_dir",
428            Ok("filechecker/index_dir/index.html")
429        );
430        assert_resolves!(
431            &checker,
432            "filechecker/index_md",
433            Ok("filechecker/index_md/index.md")
434        );
435        // empty is rejected because of no index.html
436        assert_resolves!(&checker, "filechecker/empty_dir", Err(InvalidIndexFile(_)));
437
438        // index.html is resolved and fragments are checked.
439        assert_filecheck!(&checker, "filechecker/index_dir#fragment", Status::Ok(_));
440        assert_filecheck!(
441            &checker,
442            "filechecker/index_dir#non-existingfragment",
443            Status::Error(InvalidFragment(_))
444        );
445
446        // directories which look like files should still have index files applied
447        assert_resolves!(
448            &checker,
449            "filechecker/dir_with_extension.html",
450            Err(InvalidIndexFile(_))
451        );
452    }
453
454    #[tokio::test]
455    async fn test_both_fallback_and_index_corner() {
456        let checker = FileChecker::new(
457            &BaseInfo::none(),
458            vec!["html".to_owned()],
459            Some(vec!["index".to_owned()]),
460            false,
461            false,
462        )
463        .unwrap();
464
465        // this test case has a subdir 'same_name' and a file 'same_name.html'.
466        // this shows that the index file resolving is applied in this case and
467        // fallback extensions are not applied.
468        assert_resolves!(&checker, "filechecker/same_name", Err(InvalidIndexFile(_)));
469
470        // this directory has an index.html, but the index_files argument is only "index". this
471        // shows that fallback extensions are not applied to index file names, as the index.html is
472        // not found.
473        assert_resolves!(&checker, "filechecker/index_dir", Err(InvalidIndexFile(_)));
474
475        // a directory called 'dir_with_extension.html' exists. this test shows that fallback
476        // extensions must resolve to a file not a directory.
477        assert_resolves!(
478            &checker,
479            "filechecker/dir_with_extension",
480            Err(InvalidFilePath(_))
481        );
482    }
483
484    #[tokio::test]
485    async fn test_empty_index_list_corner() {
486        // empty index_files list will reject all directory links
487        let checker_no_indexes =
488            FileChecker::new(&BaseInfo::none(), vec![], Some(vec![]), false, false).unwrap();
489        assert_resolves!(
490            &checker_no_indexes,
491            "filechecker/index_dir",
492            Err(InvalidIndexFile(_))
493        );
494        assert_resolves!(
495            &checker_no_indexes,
496            "filechecker/empty_dir",
497            Err(InvalidIndexFile(_))
498        );
499    }
500
501    #[tokio::test]
502    async fn test_index_list_of_directories_corner() {
503        // this test defines index_files to be a list of different names, all of which will
504        // resolve to an existing directory. however, because they are directories and not
505        // the special '.' name, these should not be accepted as valid index files.
506        let dir_names = vec![
507            String::new(),
508            "./.".to_owned(),
509            "..".to_owned(),
510            "/".to_owned(),
511        ];
512        let checker_dir_indexes =
513            FileChecker::new(&BaseInfo::none(), vec![], Some(dir_names), false, false).unwrap();
514        assert_resolves!(
515            &checker_dir_indexes,
516            "filechecker/index_dir",
517            Err(InvalidIndexFile(_))
518        );
519        assert_resolves!(
520            &checker_dir_indexes,
521            "filechecker/empty_dir",
522            Err(InvalidIndexFile(_))
523        );
524    }
525
526    #[tokio::test]
527    async fn test_index_file_traversal_corner() {
528        // index file names can contain path fragments and they will be traversed.
529        let checker_dotdot = FileChecker::new(
530            &BaseInfo::none(),
531            vec![],
532            Some(vec!["../index_dir/index.html".to_owned()]),
533            true,
534            false,
535        )
536        .unwrap();
537        assert_resolves!(
538            &checker_dotdot,
539            "filechecker/empty_dir#fragment",
540            Ok("filechecker/empty_dir/../index_dir/index.html")
541        );
542
543        // absolute paths to a file on disk should also work
544        let absolute_html = fixtures_path!()
545            .join("filechecker/index_dir/index.html")
546            .to_str()
547            .expect("expected utf-8 fixtures path")
548            .to_owned();
549        let checker_absolute = FileChecker::new(
550            &BaseInfo::none(),
551            vec![],
552            Some(vec![absolute_html]),
553            true,
554            false,
555        )
556        .unwrap();
557        assert_resolves!(
558            &checker_absolute,
559            "filechecker/empty_dir#fragment",
560            Ok("filechecker/index_dir/index.html")
561        );
562    }
563
564    #[tokio::test]
565    async fn test_fallback_extensions_on_directories() {
566        let checker = FileChecker::new(
567            &BaseInfo::none(),
568            vec!["html".to_owned()],
569            None,
570            true,
571            false,
572        )
573        .unwrap();
574
575        // fallback extensions should be applied when directory links are resolved
576        // to directories (i.e., the default index_files behavior or if `.`
577        // appears in index_files).
578        assert_resolves!(
579            &checker,
580            "filechecker/same_name#a",
581            Ok("filechecker/same_name.html")
582        );
583
584        // currently, trailing slashes are ignored and fallback extensions are
585        // applied regardless. maybe links with trailing slash should be prevented
586        // from resolving to files.
587        assert_resolves!(
588            &checker,
589            "filechecker/same_name/",
590            Ok("filechecker/same_name.html")
591        );
592    }
593}