lychee_lib/utils/
fragment_checker.rs1use log::info;
2use std::{
3 borrow::Cow,
4 collections::{HashMap, HashSet, hash_map::Entry},
5 path::Path,
6 sync::Arc,
7};
8
9use crate::{
10 Result,
11 extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
12 types::{ErrorKind, FileType},
13};
14use percent_encoding::percent_decode_str;
15use tokio::{fs, sync::Mutex};
16use url::Url;
17
18pub(crate) struct FragmentInput {
20 pub content: String,
21 pub file_type: FileType,
22}
23
24impl FragmentInput {
25 pub(crate) async fn from_path(path: &Path) -> Result<Self> {
26 let content = fs::read_to_string(path)
27 .await
28 .map_err(|err| ErrorKind::ReadFileInput(err, path.to_path_buf()))?;
29 let file_type = FileType::from(path);
30 Ok(Self { content, file_type })
31 }
32}
33
/// The set of spellings under which a single URL fragment may appear in a
/// document.
struct FragmentBuilder {
    /// The fragment exactly as written in the URL, plus a `user-content-`
    /// prefixed form for GitHub hosts.
    variants: Vec<String>,
    /// Percent-decoded (and, for Markdown, lowercased) forms of `variants`.
    /// An entry is stored only when it differs from the variant it came from.
    decoded: Vec<String>,
}
39
40impl FragmentBuilder {
41 fn new(fragment: &str, url: &Url, file_type: FileType) -> Result<Self> {
42 let mut variants = vec![fragment.into()];
43 if url
48 .host_str()
49 .is_some_and(|host| host.ends_with("github.com"))
50 {
51 variants.push(format!("user-content-{fragment}"));
52 }
53
54 let mut decoded = Vec::new();
57 for frag in &variants {
58 let mut require_alloc = false;
59 let mut fragment_decoded: Cow<'_, str> = match percent_decode_str(frag).decode_utf8()? {
60 Cow::Borrowed(s) => s.into(),
61 Cow::Owned(s) => {
62 require_alloc = true;
63 s.into()
64 }
65 };
66 if file_type == FileType::Markdown {
67 let lowercase = fragment_decoded.to_lowercase();
68 if lowercase != fragment_decoded {
69 fragment_decoded = lowercase.into();
70 require_alloc = true;
71 }
72 }
73 if require_alloc {
74 decoded.push(fragment_decoded.into());
75 }
76 }
77
78 Ok(Self { variants, decoded })
79 }
80
81 fn any_matches(&self, fragments: &HashSet<String>) -> bool {
82 self.variants
83 .iter()
84 .chain(self.decoded.iter())
85 .any(|frag| fragments.contains(frag))
86 }
87}
88
89#[derive(Default, Clone, Debug)]
101pub(crate) struct FragmentChecker {
102 cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
103}
104
105impl FragmentChecker {
106 pub(crate) fn new() -> Self {
108 Self {
109 cache: Arc::default(),
110 }
111 }
112
113 pub(crate) async fn check(&self, input: FragmentInput, url: &Url) -> Result<bool> {
121 let Some(fragment) = url.fragment() else {
122 return Ok(true);
123 };
124 if fragment.is_empty() || fragment.eq_ignore_ascii_case("top") {
125 return Ok(true);
126 }
127
128 let url_without_frag = Self::remove_fragment(url.clone());
129
130 let FragmentInput { content, file_type } = input;
131 let extractor = match file_type {
132 FileType::Markdown => extract_markdown_fragments,
133 FileType::Html => extract_html_fragments,
134 FileType::Css | FileType::Plaintext => {
135 info!("Skipping fragment check for {url} within a {file_type} file");
136 return Ok(true);
137 }
138 };
139
140 let fragment_candidates = FragmentBuilder::new(fragment, url, file_type)?;
141 match self.cache.lock().await.entry(url_without_frag) {
142 Entry::Vacant(entry) => {
143 let file_frags = extractor(&content);
144 let contains_fragment = fragment_candidates.any_matches(&file_frags);
145 entry.insert(file_frags);
146 Ok(contains_fragment)
147 }
148 Entry::Occupied(entry) => {
149 let file_frags = entry.get();
150 Ok(fragment_candidates.any_matches(file_frags))
151 }
152 }
153 }
154
155 fn remove_fragment(mut url: Url) -> String {
156 url.set_fragment(None);
157 url.into()
158 }
159}