lychee_lib/checker/file.rs
1use http::StatusCode;
2use log::warn;
3use std::borrow::Cow;
4use std::path::{Path, PathBuf};
5
6use crate::checker::wikilink::resolver::WikilinkResolver;
7use crate::{
8 BaseInfo, ErrorKind, Result, Status, Uri,
9 utils::fragment_checker::{FragmentChecker, FragmentInput},
10};
11
12/// A utility for checking the existence and validity of file-based URIs.
13///
14/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
15/// It supports base path resolution, fallback extensions for files without extensions,
16/// and optional fragment checking for HTML files.
17#[derive(Debug, Clone)]
18pub(crate) struct FileChecker {
19 /// List of file extensions to try if the original path doesn't exist.
20 fallback_extensions: Vec<String>,
21 /// If specified, resolves to one of the given index files if the original path
22 /// is a directory.
23 ///
24 /// If non-`None`, a directory must contain at least one of the file names
25 /// in order to be considered a valid link target. Index files names are
26 /// required to match regular files, aside from the special `.` name which
27 /// will match the directory itself.
28 ///
29 /// If `None`, index file checking is disabled and directory links are valid
30 /// as long as the directory exists on disk.
31 index_files: Option<Vec<String>>,
32 /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
33 include_fragments: bool,
34 /// Utility for performing fragment checks in HTML files.
35 fragment_checker: FragmentChecker,
36 /// Utility for optionally resolving Wikilinks.
37 wikilink_resolver: Option<WikilinkResolver>,
38}
39
40impl FileChecker {
41 /// Creates a new `FileChecker` with the given configuration.
42 ///
43 /// # Arguments
44 ///
45 /// * `base` - Optional base path or URL for resolving wikilinks.
46 /// * `fallback_extensions` - List of extensions to try if the original file is not found.
47 /// * `index_files` - Optional list of index file names to search for if the path is a directory.
48 /// * `include_fragments` - Whether to check for fragment existence in HTML files.
49 /// * `include_wikilinks` - Whether to check the existence of Wikilinks found in Markdown files .
50 ///
51 /// # Errors
52 ///
53 /// Fails if an invalid `base` is provided when including wikilinks.
54 pub(crate) fn new(
55 base: &BaseInfo,
56 fallback_extensions: Vec<String>,
57 index_files: Option<Vec<String>>,
58 include_fragments: bool,
59 include_wikilinks: bool,
60 ) -> Result<Self> {
61 let wikilink_resolver = if include_wikilinks {
62 Some(WikilinkResolver::new(base, fallback_extensions.clone())?)
63 } else {
64 None
65 };
66
67 Ok(Self {
68 fallback_extensions,
69 index_files,
70 include_fragments,
71 fragment_checker: FragmentChecker::new(),
72 wikilink_resolver,
73 })
74 }
75
76 /// Checks the given file URI for existence and validity.
77 ///
78 /// This method resolves the URI to a file path, checks if the file exists,
79 /// and optionally checks for the existence of fragments in HTML files.
80 ///
81 /// # Arguments
82 ///
83 /// * `uri` - The URI to check.
84 ///
85 /// # Returns
86 ///
87 /// Returns a `Status` indicating the result of the check.
88 pub(crate) async fn check(&self, uri: &Uri) -> Status {
89 let Ok(path) = uri.url.to_file_path() else {
90 return ErrorKind::InvalidFilePath(uri.clone()).into();
91 };
92
93 let path = self.resolve_local_path(&path, uri);
94 match path {
95 Ok(path) => self.check_file(path.as_ref(), uri).await,
96 Err(err) => err.into(),
97 }
98 }
99
100 /// Resolves the given local path by applying logic which is specific to local file
101 /// checking - currently, this includes fallback extensions and index files.
102 ///
103 /// # Arguments
104 ///
105 /// * `path` - The path to check. Need not exist.
106 /// * `uri` - The original URI, used for error reporting.
107 ///
108 /// # Returns
109 ///
110 /// Returns `Ok` with the resolved path if it is valid, otherwise returns
111 /// `Err` with an appropriate error. The returned path, if any, is guaranteed
112 /// to exist and may be a file or a directory.
113 fn resolve_local_path<'a>(&self, path: &'a Path, uri: &Uri) -> Result<Cow<'a, Path>> {
114 let path = match path.metadata() {
115 // for non-existing paths, attempt fallback extensions
116 // if fallback extensions don't help, try wikilinks
117 Err(e) if e.kind() == std::io::ErrorKind::NotFound => self
118 .apply_fallback_extensions(path, uri)
119 .or_else(|_| {
120 if let Some(resolver) = &self.wikilink_resolver {
121 resolver.resolve(path, uri)
122 } else {
123 Err(ErrorKind::InvalidFilePath(uri.clone()))
124 }
125 })
126 .map(Cow::Owned),
127
128 // other IO errors are unexpected and should fail the check
129 Err(e) => Err(ErrorKind::ReadFileInput(e, path.to_path_buf())),
130
131 // existing directories are resolved via index files
132 Ok(meta) if meta.is_dir() => self.apply_index_files(path).map(Cow::Owned),
133
134 // otherwise, path is an existing file - just return the path
135 Ok(_) => Ok(Cow::Borrowed(path)),
136 };
137
138 // if initial resolution results in a directory, also attempts to apply
139 // fallback extensions. probably, this always makes sense because
140 // directories are treated as having no fragments, so a real file with
141 // a fallback extension (if it exists) will potentially contain more
142 // fragments and thus be "more useful".
143 //
144 // (currently, this case is only reachable if `.` is in the index_files list.)
145 match path {
146 Ok(dir_path) if dir_path.is_dir() => self
147 .apply_fallback_extensions(&dir_path, uri)
148 .map(Cow::Owned)
149 .or(Ok(dir_path)),
150 Ok(path) => Ok(path),
151 Err(err) => Err(err),
152 }
153 }
154
155 /// Resolves a path to a file, applying fallback extensions if necessary.
156 ///
157 /// This function will try to find a file, first by attempting the given path
158 /// itself, then by attempting the path with each extension from
159 /// [`FileChecker::fallback_extensions`]. The first existing file (not directory),
160 /// if any, will be returned.
161 ///
162 /// # Arguments
163 ///
164 /// * `path` - The path to resolve.
165 /// * `uri` - The original URI, used for error reporting.
166 ///
167 /// # Returns
168 ///
169 /// Returns `Ok(PathBuf)` with the resolved file path, or `Err` if no valid file is found.
170 /// If `Ok` is returned, the contained `PathBuf` is guaranteed to exist and be a file.
171 fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result<PathBuf> {
172 // If it's already a file, use it directly
173 if path.is_file() {
174 return Ok(path.to_path_buf());
175 }
176
177 // Try fallback extensions
178 let mut path_buf = path.to_path_buf();
179 for ext in &self.fallback_extensions {
180 path_buf.set_extension(ext);
181 if path_buf.is_file() {
182 return Ok(path_buf);
183 }
184 }
185
186 Err(ErrorKind::InvalidFilePath(uri.clone()))
187 }
188
189 /// Tries to find an index file in the given directory, returning the first match.
190 /// The index file behavior is specified by [`FileChecker::index_files`].
191 ///
192 /// If this is non-`None`, index files must exist and resolved index files are
193 /// required to be files, aside from the special name `.` - this will match the
194 /// directory itself.
195 ///
196 /// If `None`, index file resolution is disabled and this function simply
197 /// returns the given path.
198 ///
199 /// # Arguments
200 ///
201 /// * `dir_path` - The directory within which to search for index files.
202 /// This is assumed to be an existing directory.
203 ///
204 /// # Returns
205 ///
206 /// Returns `Ok(PathBuf)` pointing to the first existing index file, or
207 /// `Err` if no index file is found. If `Ok` is returned, the contained `PathBuf`
208 /// is guaranteed to exist. In most cases, the returned path will be a file path.
209 ///
210 /// If index files are disabled, simply returns `Ok(dir_path)`.
211 fn apply_index_files(&self, dir_path: &Path) -> Result<PathBuf> {
212 // this implements the "disabled" case by treating a directory as its
213 // own index file.
214 let index_names_to_try = match &self.index_files {
215 Some(names) => &names[..],
216 None => &[".".to_owned()],
217 };
218
219 let invalid_index_error = || {
220 // Drop empty index file names. These will never be accepted as valid
221 // index files, and doing this makes cleaner error reporting.
222 let mut names = index_names_to_try.to_vec();
223 names.retain(|x| !x.is_empty());
224
225 ErrorKind::InvalidIndexFile(names)
226 };
227
228 index_names_to_try
229 .iter()
230 .find_map(|filename| {
231 // for some special index file names, we accept directories as well
232 // as files.
233 let exists = match filename.as_str() {
234 "." => Path::exists,
235 _ => Path::is_file,
236 };
237
238 let path = dir_path.join(filename);
239 exists(&path).then_some(path)
240 })
241 .ok_or_else(invalid_index_error)
242 }
243
244 /// Checks a resolved file, optionally verifying fragments for HTML files.
245 ///
246 /// # Arguments
247 ///
248 /// * `path` - The resolved path to check.
249 /// * `uri` - The original URI, used for error reporting.
250 ///
251 /// # Returns
252 ///
253 /// Returns a `Status` indicating the result of the check.
254 async fn check_file(&self, path: &Path, uri: &Uri) -> Status {
255 if self.include_fragments {
256 self.check_fragment(path, uri).await
257 } else {
258 Status::Ok(StatusCode::OK)
259 }
260 }
261
262 /// Checks for the existence of a fragment in a path.
263 ///
264 /// The given path may be a file or a directory. A directory
265 /// is treated as if it was an empty file with no fragments.
266 ///
267 /// # Arguments
268 ///
269 /// * `path` - The path to the file or directory. Assumed to exist.
270 /// * `uri` - The original URI, containing the fragment to check.
271 ///
272 /// # Returns
273 ///
274 /// Returns a `Status` indicating the result of the fragment check.
275 async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
276 // for absent or trivial fragments, always return success.
277 if uri.url.fragment().is_none_or(str::is_empty) {
278 return Status::Ok(StatusCode::OK);
279 }
280
281 // directories are treated as if they were a file with no fragments.
282 // reaching here means we have a non-trivial fragment on a directory,
283 // so return error.
284 if path.is_dir() {
285 return ErrorKind::InvalidFragment(uri.clone()).into();
286 }
287
288 match FragmentInput::from_path(path).await {
289 Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
290 Ok(true) => Status::Ok(StatusCode::OK),
291 Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
292 Err(err) => {
293 warn!("Skipping fragment check for {uri} due to the following error: {err}");
294 Status::Ok(StatusCode::OK)
295 }
296 },
297 Err(err) => {
298 warn!("Skipping fragment check for {uri} due to the following error: {err}");
299 Status::Ok(StatusCode::OK)
300 }
301 }
302 }
303}
304
#[cfg(test)]
mod tests {
    use super::FileChecker;
    use crate::{
        BaseInfo,
        ErrorKind::{InvalidFilePath, InvalidFragment, InvalidIndexFile},
        Status, Uri,
    };
    use test_utils::{fixture_uri, fixtures_path};

    /// Runs [`FileChecker::check`] for a link given relative to the fixtures
    /// directory and asserts that the resulting status matches `$expected`.
    macro_rules! assert_filecheck {
        ($file_checker:expr, $link:expr, $expected:pat) => {
            let link_uri = Uri::from(fixture_uri!($link));
            let status = $file_checker.check(&link_uri).await;
            assert!(
                matches!(status, $expected),
                "assertion failed: {} should be {} but was '{:?}'",
                &link_uri,
                stringify!($expected),
                &status
            );
        };
    }

    /// Runs [`FileChecker::resolve_local_path`] for a link given relative to
    /// the fixtures directory and asserts on the resolved location.
    ///
    /// A successfully resolved path is made relative to the fixtures directory
    /// before matching, so `$wanted` should be a pattern for values of type
    /// `Result<&str, ErrorKind>`.
    macro_rules! assert_resolves {
        ($file_checker:expr, $link:expr, $wanted:pat) => {
            let link_uri = Uri::from(fixture_uri!($link));
            let local_path = link_uri
                .url
                .to_file_path()
                .expect("fixture uri should be a valid path");
            let resolved = $file_checker.resolve_local_path(&local_path, &link_uri);
            let relative = resolved.as_deref().map(|p| {
                p.strip_prefix(fixtures_path!())
                    .unwrap()
                    .to_string_lossy()
            });
            assert!(
                matches!(relative.as_deref(), $wanted),
                "{:?} resolved to {:?} but should be {}",
                $link,
                relative,
                stringify!($wanted)
            );
        };
    }

    #[tokio::test]
    async fn test_default() {
        // without index_files, a directory link is fine whenever the directory exists.
        let default_checker =
            FileChecker::new(&BaseInfo::none(), vec![], None, true, false).unwrap();

        assert_filecheck!(&default_checker, "filechecker/index_dir", Status::Ok(_));

        // an empty directory resolves to itself; it is a valid target but
        // provides no fragments.
        assert_resolves!(
            &default_checker,
            "filechecker/empty_dir",
            Ok("filechecker/empty_dir")
        );
        assert_filecheck!(&default_checker, "filechecker/empty_dir", Status::Ok(_));
        assert_filecheck!(&default_checker, "filechecker/empty_dir#", Status::Ok(_));
        assert_filecheck!(
            &default_checker,
            "filechecker/empty_dir#fragment",
            Status::Error(InvalidFragment(_))
        );

        // index.html exists in this directory, but with index files disabled
        // the directory itself is the target - hence no fragments at all.
        assert_resolves!(
            &default_checker,
            "filechecker/index_dir",
            Ok("filechecker/index_dir")
        );
        assert_filecheck!(
            &default_checker,
            "filechecker/index_dir#fragment",
            Status::Error(InvalidFragment(_))
        );
        assert_filecheck!(
            &default_checker,
            "filechecker/index_dir#non-existingfragment",
            Status::Error(InvalidFragment(_))
        );

        assert_filecheck!(&default_checker, "filechecker/same_name", Status::Ok(_));

        // with no fallback extensions configured, the directory is kept as-is
        assert_resolves!(
            &default_checker,
            "filechecker/same_name",
            Ok("filechecker/same_name")
        );
        assert_filecheck!(
            &default_checker,
            "filechecker/same_name#a",
            Status::Error(InvalidFragment(_))
        );
    }

    #[tokio::test]
    async fn test_index_files() {
        let index_names = vec!["index.html".to_owned(), "index.md".to_owned()];
        let index_checker =
            FileChecker::new(&BaseInfo::none(), vec![], Some(index_names), true, false).unwrap();

        assert_resolves!(
            &index_checker,
            "filechecker/index_dir",
            Ok("filechecker/index_dir/index.html")
        );
        assert_resolves!(
            &index_checker,
            "filechecker/index_md",
            Ok("filechecker/index_md/index.md")
        );
        // the empty directory has none of the index files, so it is rejected
        assert_resolves!(
            &index_checker,
            "filechecker/empty_dir",
            Err(InvalidIndexFile(_))
        );

        // fragments are looked up in the resolved index.html.
        assert_filecheck!(
            &index_checker,
            "filechecker/index_dir#fragment",
            Status::Ok(_)
        );
        assert_filecheck!(
            &index_checker,
            "filechecker/index_dir#non-existingfragment",
            Status::Error(InvalidFragment(_))
        );

        // a directory whose name looks like a file name still needs an index file
        assert_resolves!(
            &index_checker,
            "filechecker/dir_with_extension.html",
            Err(InvalidIndexFile(_))
        );
    }

    #[tokio::test]
    async fn test_both_fallback_and_index_corner() {
        let combined_checker = FileChecker::new(
            &BaseInfo::none(),
            vec!["html".to_owned()],
            Some(vec!["index".to_owned()]),
            false,
            false,
        )
        .unwrap();

        // the fixtures contain both a directory 'same_name' and a file
        // 'same_name.html'. the directory takes part in index file resolution
        // and the fallback extension is not applied.
        assert_resolves!(
            &combined_checker,
            "filechecker/same_name",
            Err(InvalidIndexFile(_))
        );

        // the directory holds an index.html, while index_files only lists
        // "index". index.html is not found, which shows that fallback
        // extensions are not applied to index file names.
        assert_resolves!(
            &combined_checker,
            "filechecker/index_dir",
            Err(InvalidIndexFile(_))
        );

        // 'dir_with_extension.html' exists but is a directory. a fallback
        // extension has to lead to a file, never to a directory.
        assert_resolves!(
            &combined_checker,
            "filechecker/dir_with_extension",
            Err(InvalidFilePath(_))
        );
    }

    #[tokio::test]
    async fn test_empty_index_list_corner() {
        // with an empty index_files list, no directory link can ever be valid
        let no_index_checker =
            FileChecker::new(&BaseInfo::none(), vec![], Some(vec![]), false, false).unwrap();
        assert_resolves!(
            &no_index_checker,
            "filechecker/index_dir",
            Err(InvalidIndexFile(_))
        );
        assert_resolves!(
            &no_index_checker,
            "filechecker/empty_dir",
            Err(InvalidIndexFile(_))
        );
    }

    #[tokio::test]
    async fn test_index_list_of_directories_corner() {
        // every name below points at an existing directory once joined to the
        // link target. only the exact name '.' may match a directory, so none
        // of these count as a valid index file.
        let directory_names = vec![
            String::new(),
            "./.".to_owned(),
            "..".to_owned(),
            "/".to_owned(),
        ];
        let dir_index_checker = FileChecker::new(
            &BaseInfo::none(),
            vec![],
            Some(directory_names),
            false,
            false,
        )
        .unwrap();
        assert_resolves!(
            &dir_index_checker,
            "filechecker/index_dir",
            Err(InvalidIndexFile(_))
        );
        assert_resolves!(
            &dir_index_checker,
            "filechecker/empty_dir",
            Err(InvalidIndexFile(_))
        );
    }

    #[tokio::test]
    async fn test_index_file_traversal_corner() {
        // an index file name may contain path components, which are followed.
        let parent_traversal_checker = FileChecker::new(
            &BaseInfo::none(),
            vec![],
            Some(vec!["../index_dir/index.html".to_owned()]),
            true,
            false,
        )
        .unwrap();
        assert_resolves!(
            &parent_traversal_checker,
            "filechecker/empty_dir#fragment",
            Ok("filechecker/empty_dir/../index_dir/index.html")
        );

        // an absolute path to an existing file works as an index file as well
        let absolute_index = fixtures_path!()
            .join("filechecker/index_dir/index.html")
            .to_str()
            .expect("expected utf-8 fixtures path")
            .to_owned();
        let absolute_checker = FileChecker::new(
            &BaseInfo::none(),
            vec![],
            Some(vec![absolute_index]),
            true,
            false,
        )
        .unwrap();
        assert_resolves!(
            &absolute_checker,
            "filechecker/empty_dir#fragment",
            Ok("filechecker/index_dir/index.html")
        );
    }

    #[tokio::test]
    async fn test_fallback_extensions_on_directories() {
        let fallback_checker = FileChecker::new(
            &BaseInfo::none(),
            vec!["html".to_owned()],
            None,
            true,
            false,
        )
        .unwrap();

        // when a directory link resolves to the directory itself (default
        // index_files behavior, or `.` listed in index_files), fallback
        // extensions are applied on top of it.
        assert_resolves!(
            &fallback_checker,
            "filechecker/same_name#a",
            Ok("filechecker/same_name.html")
        );

        // a trailing slash currently makes no difference: fallback extensions
        // are still applied. arguably, links ending in a slash should not be
        // allowed to resolve to files.
        assert_resolves!(
            &fallback_checker,
            "filechecker/same_name/",
            Ok("filechecker/same_name.html")
        );
    }
}