lychee_lib/types/input/
source.rs

1//! Input source type definitions.
2//!
3//! lychee can handle different kinds of input sources:
4//! - URLs (of HTTP/HTTPS scheme)
5//! - File system paths (to files or directories)
6//! - Unix shell-style glob patterns (e.g. `./docs/**/*.md`)
7//! - Standard input (`stdin`)
8//! - Raw strings (UTF-8 only for now)
9//!
10//! Each input source is handled differently:
11//! - File paths are walked (if they are directories) and filtered by
12//!   extension
13//! - Glob patterns are expanded to matching file paths, which are then walked
14//!   and filtered by extension
15//! - URLs, raw strings, and standard input (`stdin`) are read directly
16
17use crate::ErrorKind;
18
19use glob::Pattern;
20use reqwest::Url;
21use serde::{Deserialize, Deserializer, Serialize};
22use std::borrow::Cow;
23use std::fmt::Display;
24use std::ops::Deref;
25use std::path::PathBuf;
26use std::result::Result;
27
28/// Input types which lychee supports
29#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
30#[non_exhaustive]
31pub enum InputSource {
32    /// URL (of HTTP/HTTPS scheme).
33    RemoteUrl(Box<Url>),
34    /// Unix shell-style glob pattern.
35    FsGlob {
36        /// The glob pattern matching all input files
37        #[serde(deserialize_with = "InputSource::deserialize_pattern")]
38        pattern: Pattern,
39        /// Don't be case sensitive when matching files against a glob pattern
40        ignore_case: bool,
41    },
42    /// File path.
43    FsPath(PathBuf),
44    /// Standard Input.
45    Stdin,
46    /// Raw string input.
47    String(Cow<'static, str>),
48}
49
50impl InputSource {
51    const STDIN: &str = "-";
52
53    /// Parses a [`InputSource`] from the given string. The kind of input source will be
54    /// automatically detected according to certain rules and precedences.
55    ///
56    /// # Errors
57    ///
58    /// Returns an error if:
59    /// - the input does not exist (i.e. the path is invalid)
60    /// - the input cannot be parsed as a URL
61    pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
62        if input == Self::STDIN {
63            return Ok(InputSource::Stdin);
64        }
65
66        // We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
67        if let Ok(url) = Url::parse(input) {
68            // Weed out non-HTTP schemes, including Windows drive
69            // specifiers, which can be parsed by the
70            // [url](https://crates.io/crates/url) crate
71            return match url.scheme() {
72                "http" | "https" => Ok(InputSource::RemoteUrl(Box::new(url))),
73                _ => Err(ErrorKind::InvalidFile(PathBuf::from(input))),
74            };
75        }
76
77        // This seems to be the only way to determine if this is a glob pattern
78        let is_glob = glob::Pattern::escape(input) != input;
79
80        if is_glob {
81            return Ok(InputSource::FsGlob {
82                pattern: Pattern::new(input)?,
83                ignore_case: glob_ignore_case,
84            });
85        }
86
87        // It might be a file path; check if it exists
88        let path = PathBuf::from(input);
89
90        // On Windows, a filepath can never be mistaken for a
91        // URL, because Windows filepaths use `\` and URLs use
92        // `/`
93        #[cfg(windows)]
94        if path.exists() {
95            // The file exists, so we return the path
96            Ok(InputSource::FsPath(path))
97        } else {
98            // We have a valid filepath, but the file does not
99            // exist so we return an error
100            Err(ErrorKind::InvalidFile(path))
101        }
102
103        #[cfg(unix)]
104        if path.exists() {
105            Ok(InputSource::FsPath(path))
106        } else if input.starts_with('~') || input.starts_with('.') {
107            // The path is not valid, but it might still be a
108            // valid URL.
109            //
110            // Check if the path starts with a tilde (`~`) or a
111            // dot and exit early if it does.
112            //
113            // This check might not be sufficient to cover all cases
114            // but it catches the most common ones
115            Err(ErrorKind::InvalidFile(path))
116        } else {
117            // Invalid path; check if a valid URL can be constructed from the input
118            // by prefixing it with a `http://` scheme.
119            //
120            // Curl also uses http (i.e. not https), see
121            // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
122            //
123            // TODO: We should get rid of this heuristic and
124            // require users to provide a full URL with scheme.
125            // This is a big source of confusion to users.
126            let url = Url::parse(&format!("http://{input}"))
127                .map_err(|e| ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string()))?;
128            Ok(InputSource::RemoteUrl(Box::new(url)))
129        }
130    }
131
132    fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
133    where
134        D: Deserializer<'de>,
135    {
136        use serde::de::Error;
137        let s = String::deserialize(deserializer)?;
138        Pattern::new(&s).map_err(D::Error::custom)
139    }
140}
141
142/// Resolved input sources that can be processed for content.
143///
144/// This represents input sources after glob pattern expansion.
145/// It is identical to `InputSource`, except that glob patterns
146/// have been resolved to concrete file paths.
147///
148/// We use a separate type to avoid handling the (no longer applicable)
149/// glob case in downstream processing.
150#[derive(Debug, Clone, PartialEq, Eq, Hash)]
151pub enum ResolvedInputSource {
152    /// URL (of HTTP/HTTPS scheme).
153    RemoteUrl(Box<Url>),
154    /// File path.
155    FsPath(PathBuf),
156    /// Standard Input.
157    Stdin,
158    /// Raw string input.
159    String(Cow<'static, str>),
160}
161
162impl ResolvedInputSource {
163    /// Converts a [`ResolvedInputSource::RemoteUrl`] or
164    /// [`ResolvedInputSource::FsPath`] to a [`Url`] pointing to the source.
165    ///
166    /// For other variants (i.e., those without a URL), `Ok(None)` is returned.
167    ///
168    /// # Errors
169    ///
170    /// Returns an error if building a URL from a [`ResolvedInputSource::FsPath`]
171    /// fails.
172    pub fn to_url(&self) -> Result<Option<Url>, ErrorKind> {
173        match self {
174            Self::RemoteUrl(url) => Ok(Some(url.deref().clone())),
175            Self::FsPath(path) => std::path::absolute(path)
176                .ok()
177                .and_then(|x| Url::from_file_path(x).ok())
178                .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned()))
179                .map(Some),
180            _ => Ok(None),
181        }
182    }
183}
184
185impl From<ResolvedInputSource> for InputSource {
186    fn from(resolved: ResolvedInputSource) -> Self {
187        match resolved {
188            ResolvedInputSource::RemoteUrl(url) => InputSource::RemoteUrl(url),
189            ResolvedInputSource::FsPath(path) => InputSource::FsPath(path),
190            ResolvedInputSource::Stdin => InputSource::Stdin,
191            ResolvedInputSource::String(s) => InputSource::String(s),
192        }
193    }
194}
195
196impl Display for ResolvedInputSource {
197    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
198        f.write_str(match self {
199            Self::RemoteUrl(url) => url.as_str(),
200            Self::FsPath(path) => path.to_str().unwrap_or_default(),
201            Self::Stdin => "stdin",
202            Self::String(s) => s.as_ref(),
203        })
204    }
205}
206
207/// Custom serialization for the `InputSource` enum.
208///
209/// This implementation serializes all variants as strings to ensure
210/// compatibility with JSON serialization, which requires string keys for enums.
211///
212/// Without this custom implementation, attempting to serialize `InputSource` to
213/// JSON would result in a "key must be a string" error.
214///
215/// See: <https://github.com/serde-rs/json/issues/45>
216impl Serialize for InputSource {
217    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
218    where
219        S: serde::Serializer,
220    {
221        serializer.collect_str(self)
222    }
223}
224
225impl Display for InputSource {
226    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
227        f.write_str(match self {
228            Self::RemoteUrl(url) => url.as_str(),
229            Self::FsGlob { pattern, .. } => pattern.as_str(),
230            Self::FsPath(path) => path.to_str().unwrap_or_default(),
231            Self::Stdin => "stdin",
232            Self::String(s) => s.as_ref(),
233        })
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// An `FsGlob` is serialized through its `Display` output, which in turn
    /// is the string form of the [`glob::Pattern`]. Here, we check that this
    /// string is the pattern exactly as it was originally written.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let pat = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(pat).unwrap(),
            ignore_case: false,
        };

        let actual = serde_json::to_string(&source).unwrap();
        let expected = serde_json::to_string(pat).unwrap();

        assert_eq!(actual, expected);
    }
}