lychee_lib/utils/
fragment_checker.rs

1use log::info;
2use std::{
3    borrow::Cow,
4    collections::{HashMap, HashSet, hash_map::Entry},
5    path::Path,
6    sync::Arc,
7};
8
9use crate::{
10    Result,
11    extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
12    types::{ErrorKind, FileType},
13};
14use percent_encoding::percent_decode_str;
15use tokio::{fs, sync::Mutex};
16use url::Url;
17
/// Holds the content and file type of the fragment input.
///
/// This is the document whose anchors are searched by
/// `FragmentChecker::check`.
pub(crate) struct FragmentInput {
    /// Full text of the document to scan for fragments.
    pub content: String,
    /// Kind of document; `FragmentChecker::check` uses it to pick the
    /// fragment extractor (or to skip the check entirely).
    pub file_type: FileType,
}
23
24impl FragmentInput {
25    pub(crate) async fn from_path(path: &Path) -> Result<Self> {
26        let content = fs::read_to_string(path)
27            .await
28            .map_err(|err| ErrorKind::ReadFileInput(err, path.to_path_buf()))?;
29        let file_type = FileType::from(path);
30        Ok(Self { content, file_type })
31    }
32}
33
/// A fragment builder that expands the given fragments into a list of candidates.
///
/// A link's fragment matches a document if any candidate from either list
/// is present in the document's set of fragments.
struct FragmentBuilder {
    /// The fragment exactly as it appears in the URL, plus (for GitHub hosts)
    /// the same fragment with a `user-content-` prefix.
    variants: Vec<String>,
    /// Percent-decoded forms of `variants` (lowercased for Markdown files).
    /// Only entries that differ from their source variant are stored.
    decoded: Vec<String>,
}
39
40impl FragmentBuilder {
41    fn new(fragment: &str, url: &Url, file_type: FileType) -> Result<Self> {
42        let mut variants = vec![fragment.into()];
43        // For GitHub links, add "user-content-" prefix to the fragments.
44        // The following cases cannot be handled unless we simulate with a headless browser:
45        // - markdown files from any specific path (includes "blob/master/README.md")
46        // - "issuecomment" fragments from the GitHub issue pages
47        if url
48            .host_str()
49            .is_some_and(|host| host.ends_with("github.com"))
50        {
51            variants.push(format!("user-content-{fragment}"));
52        }
53
54        // Only store the percent-decoded variants if it's different from the original
55        // fragment. This avoids storing and comparing the same fragment twice.
56        let mut decoded = Vec::new();
57        for frag in &variants {
58            let mut require_alloc = false;
59            let mut fragment_decoded: Cow<'_, str> = match percent_decode_str(frag).decode_utf8()? {
60                Cow::Borrowed(s) => s.into(),
61                Cow::Owned(s) => {
62                    require_alloc = true;
63                    s.into()
64                }
65            };
66            if file_type == FileType::Markdown {
67                let lowercase = fragment_decoded.to_lowercase();
68                if lowercase != fragment_decoded {
69                    fragment_decoded = lowercase.into();
70                    require_alloc = true;
71                }
72            }
73            if require_alloc {
74                decoded.push(fragment_decoded.into());
75            }
76        }
77
78        Ok(Self { variants, decoded })
79    }
80
81    fn any_matches(&self, fragments: &HashSet<String>) -> bool {
82        self.variants
83            .iter()
84            .chain(self.decoded.iter())
85            .any(|frag| fragments.contains(frag))
86    }
87}
88
/// Holds a cache of fragments for a given URL.
///
/// Fragments, also known as anchors, are used to link to a specific
/// part of a page. For example, the URL `https://example.com#foo`
/// will link to the element with the `id` of `foo`.
///
/// This cache is used to avoid having to re-parse the same file
/// multiple times when checking if a given URL contains a fragment.
///
/// The cache is stored in a `HashMap` with the URL as the key and
/// a `HashSet` of fragments as the value. The key is the URL with its
/// fragment removed, so links to different anchors of the same document
/// share one entry.
///
/// The map sits behind an `Arc<Mutex<..>>`, so cloning a `FragmentChecker`
/// yields a handle to the same underlying cache.
#[derive(Default, Clone, Debug)]
pub(crate) struct FragmentChecker {
    /// URL (without fragment) -> fragments extracted from that document.
    cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
}
104
105impl FragmentChecker {
106    /// Creates a new `FragmentChecker`.
107    pub(crate) fn new() -> Self {
108        Self {
109            cache: Arc::default(),
110        }
111    }
112
113    /// Checks if the given [`FragmentInput`] contains the given fragment.
114    ///
115    /// Returns false, if there is a fragment in the link which is not empty or "top"
116    /// and the path is to a Markdown file, which doesn't contain the given fragment.
117    /// (Empty # and #top fragments are always valid, triggering the browser to scroll to top.)
118    ///
119    /// In all other cases, returns true.
120    pub(crate) async fn check(&self, input: FragmentInput, url: &Url) -> Result<bool> {
121        let Some(fragment) = url.fragment() else {
122            return Ok(true);
123        };
124        if fragment.is_empty() || fragment.eq_ignore_ascii_case("top") {
125            return Ok(true);
126        }
127
128        let url_without_frag = Self::remove_fragment(url.clone());
129
130        let FragmentInput { content, file_type } = input;
131        let extractor = match file_type {
132            FileType::Markdown => extract_markdown_fragments,
133            FileType::Html => extract_html_fragments,
134            FileType::Css | FileType::Plaintext => {
135                info!("Skipping fragment check for {url} within a {file_type} file");
136                return Ok(true);
137            }
138        };
139
140        let fragment_candidates = FragmentBuilder::new(fragment, url, file_type)?;
141        match self.cache.lock().await.entry(url_without_frag) {
142            Entry::Vacant(entry) => {
143                let file_frags = extractor(&content);
144                let contains_fragment = fragment_candidates.any_matches(&file_frags);
145                entry.insert(file_frags);
146                Ok(contains_fragment)
147            }
148            Entry::Occupied(entry) => {
149                let file_frags = entry.get();
150                Ok(fragment_candidates.any_matches(file_frags))
151            }
152        }
153    }
154
155    fn remove_fragment(mut url: Url) -> String {
156        url.set_fragment(None);
157        url.into()
158    }
159}