feat: add 'user-content-' prefix to support github markdown fragment (#1750)

This commit is contained in:
Keming 2025-07-05 04:58:47 +08:00 committed by GitHub
parent 81f2605118
commit 02f6f5cb49
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 64 additions and 10 deletions

View file

@ -24,7 +24,7 @@
<a href="#in-the-end">doesn't exist</a><br>
<a href="#">To the top</a><br>
<a href="#top">To the top alt</a><br>
<a href="https://github.com/lycheeverse/lychee#user-content-table-of-contents">To the lychee readme license fragment.</a>
<a href="https://github.com/lycheeverse/lychee#table-of-contents">To the lychee readme license fragment.</a>
</section>
</body>
</html>

View file

@ -1890,7 +1890,7 @@ mod cli {
.stderr(contains("fixtures/fragments/file.html#top"))
.stderr(contains("fixtures/fragments/file2.md#top"))
.stderr(contains(
"https://github.com/lycheeverse/lychee#user-content-table-of-contents",
"https://github.com/lycheeverse/lychee#table-of-contents",
))
.stderr(contains(
"https://github.com/lycheeverse/lychee#non-existent-anchor",

View file

@ -1,4 +1,5 @@
use std::{
borrow::Cow,
collections::{HashMap, HashSet, hash_map::Entry},
path::Path,
sync::Arc,
@ -29,6 +30,61 @@ impl FragmentInput {
}
}
/// A fragment builder that expands the given fragment into a list of candidates.
///
/// A fragment is considered present in a document if *any* of the candidates
/// held here is contained in the set of fragments extracted from that document.
struct FragmentBuilder {
    /// The fragment exactly as given, optionally followed by its
    /// `user-content-` prefixed form for GitHub URLs.
    variants: Vec<String>,
    /// Percent-decoded (and, for Markdown, lowercased) forms of `variants`.
    /// Only entries that differ from their source variant are stored.
    decoded: Vec<String>,
}

impl FragmentBuilder {
    /// Builds the candidate list for `fragment` as referenced from `url`.
    ///
    /// # Errors
    ///
    /// Returns an error if a percent-decoded candidate is not valid UTF-8.
    fn new(fragment: &str, url: &Url, file_type: FileType) -> Result<Self> {
        let mut variants = vec![fragment.to_owned()];

        // For GitHub links, add "user-content-" prefix to the fragments.
        // The following cases cannot be handled unless we simulate with a headless browser:
        // - markdown files from any specific path (includes "blob/master/README.md")
        // - "issuecomment" fragments from the GitHub issue pages
        if url.host_str().is_some_and(Self::is_github_host) {
            variants.push(format!("user-content-{fragment}"));
        }

        // Only store the normalized candidate if it's different from the variant
        // it was derived from. This avoids storing and comparing the same
        // fragment twice.
        let mut decoded = Vec::new();
        for frag in &variants {
            let plain = percent_decode_str(frag).decode_utf8()?;
            // Markdown candidates are additionally compared in lowercase.
            let normalized = if file_type == FileType::Markdown {
                Cow::Owned(plain.to_lowercase())
            } else {
                plain
            };
            if normalized.as_ref() != frag.as_str() {
                decoded.push(normalized.into_owned());
            }
        }

        Ok(Self { variants, decoded })
    }

    /// Returns `true` if `host` is `github.com` itself or one of its subdomains.
    ///
    /// A bare suffix test would also accept unrelated hosts such as
    /// `notgithub.com`, so the match must be exact or preceded by a dot.
    fn is_github_host(host: &str) -> bool {
        host == "github.com" || host.ends_with(".github.com")
    }

    /// Returns `true` if any candidate is contained in `fragments`.
    fn any_matches(&self, fragments: &HashSet<String>) -> bool {
        self.variants
            .iter()
            .chain(&self.decoded)
            .any(|candidate| fragments.contains(candidate))
    }
}
/// Holds a cache of fragments for a given URL.
///
/// Fragments, also known as anchors, are used to link to a specific
@ -67,7 +123,7 @@ impl FragmentChecker {
if fragment.is_empty() || fragment.eq_ignore_ascii_case("top") {
return Ok(true);
}
let mut fragment_decoded = percent_decode_str(fragment).decode_utf8()?;
let url_without_frag = Self::remove_fragment(url.clone());
let FragmentInput { content, file_type } = input;
@ -76,20 +132,18 @@ impl FragmentChecker {
FileType::Html => extract_html_fragments,
FileType::Plaintext => return Ok(true),
};
if file_type == FileType::Markdown {
fragment_decoded = fragment_decoded.to_lowercase().into();
}
let fragment_candidates = FragmentBuilder::new(fragment, url, file_type)?;
match self.cache.lock().await.entry(url_without_frag) {
Entry::Vacant(entry) => {
let file_frags = extractor(&content);
let contains_fragment =
file_frags.contains(fragment) || file_frags.contains(&fragment_decoded as &str);
let contains_fragment = fragment_candidates.any_matches(&file_frags);
entry.insert(file_frags);
Ok(contains_fragment)
}
Entry::Occupied(entry) => {
Ok(entry.get().contains(fragment)
|| entry.get().contains(&fragment_decoded as &str))
let file_frags = entry.get();
Ok(fragment_candidates.any_matches(file_frags))
}
}
}