Check fragments in HTML files (#1198)

* Added html5gum based fragment extractor
* Markdown fragment extractor now extracts fragments from inline html
* Added fragment checks for html file
* Added inline html and html document to fragment checks test
* Improved some comments
* Improved documentation of markdown's fragment extractor.
This commit is contained in:
Hugo McNally 2023-08-22 15:44:45 +01:00 committed by GitHub
parent 9f6f5501fa
commit f59aa61ee3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 119 additions and 42 deletions

22
fixtures/fragments/file.html vendored Normal file
View file

@ -0,0 +1,22 @@
<!DOCTYPE html>
<!-- Test fixture for lychee's fragment checker: provides anchor targets
     (`id` attributes) and fragment links, both valid and intentionally broken. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>For Testing Fragments</title>
</head>
<body>
<!-- Anchor target; also linked from within this file below. -->
<section id="in-the-beginning">
<p>
To start
<!-- Cross-file fragment link into a markdown fixture. -->
<a href="file1.md#fragment-1">
let's run away.
</a>
</p>
</section>
<section>
<!-- Anchor target referenced from the markdown fixtures as file.html#a-word. -->
<p id="a-word">Word</p>
<a href="#in-the-beginning">back we go</a>
<!-- Intentionally dangling: no element with id="in-the-end" exists,
     so the checker is expected to report this link as broken. -->
<a href="#in-the-end">doesn't exist</a>
</section>
</body>
</html>

View file

@ -21,11 +21,11 @@ This is a test file for the fragment loader.
Explicit fragment links are currently not supported.
Therefore we put the test into a code block for now to prevent false positives.
```
<a name="explicit-fragment"></a>
<a id="explicit-fragment"></a>
[Link to explicit fragment](#explicit-fragment)
```
[To the html doc](file.html#a-word)
## Custom Fragments

View file

@ -1427,6 +1427,7 @@ mod cli {
.arg(input)
.assert()
.failure()
.stderr(contains("fixtures/fragments/file1.md#fragment-1"))
.stderr(contains("fixtures/fragments/file1.md#fragment-2"))
.stderr(contains("fixtures/fragments/file2.md#custom-id"))
.stderr(contains("fixtures/fragments/file1.md#missing-fragment"))
@ -1434,12 +1435,15 @@ mod cli {
.stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment"))
.stderr(contains("fixtures/fragments/file2.md#missing-fragment"))
.stderr(contains("fixtures/fragments/empty_file#fragment"))
.stderr(contains("fixtures/fragments/file.html#a-word"))
.stderr(contains("fixtures/fragments/file.html#in-the-beginning"))
.stderr(contains("fixtures/fragments/file.html#in-the-end"))
.stderr(contains(
"fixtures/fragments/file1.md#kebab-case-fragment-1",
))
.stdout(contains("8 Total"))
.stdout(contains("6 OK"))
// 2 failures because of missing fragments
.stdout(contains("2 Errors"));
.stdout(contains("13 Total"))
.stdout(contains("10 OK"))
// 3 failures because of missing fragments
.stdout(contains("3 Errors"));
}
}

View file

@ -1,3 +1,5 @@
use std::collections::HashSet;
use html5gum::{Emitter, Error, State, Tokenizer};
use super::{is_email_link, is_verbatim_elem, srcset};
@ -7,6 +9,7 @@ use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
struct LinkExtractor {
// note: what html5gum calls a tag, lychee calls an element
links: Vec<RawUri>,
fragments: HashSet<String>,
current_string: Vec<u8>,
current_element_name: Vec<u8>,
current_element_is_closing: bool,
@ -26,9 +29,10 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
}
impl LinkExtractor {
pub(crate) const fn new(include_verbatim: bool) -> Self {
pub(crate) fn new(include_verbatim: bool) -> Self {
LinkExtractor {
links: Vec::new(),
fragments: HashSet::new(),
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
@ -181,6 +185,10 @@ impl LinkExtractor {
};
self.links.extend(new_urls);
if attr == "id" {
self.fragments.insert(value.to_string());
}
}
self.current_attribute_name.clear();
@ -288,24 +296,44 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
assert!(tokenizer.next().is_none());
extractor.links
}
/// Extract fragments from `id` attributes within an HTML string.
///
/// `include_verbatim` is set to `true` so that verbatim elements are not
/// skipped while scanning; only the collected fragment set is returned.
pub(crate) fn extract_html_fragments(buf: &str) -> HashSet<String> {
    let mut link_extractor = LinkExtractor::new(true);
    {
        let mut html_tokenizer =
            Tokenizer::new_with_emitter(buf, &mut link_extractor).infallible();
        // The extractor acts as the emitter, so no tokens are expected
        // to be yielded from the stream itself.
        assert!(html_tokenizer.next().is_none());
    }
    link_extractor.fragments
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<body id="content">
<p>This is a paragraph with some inline <code id="inline-code">https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
<a href="https://baz.org">example link inside pre</a>
</pre>
<p><b>bold</b></p>
<p id="emphasis"><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_extract_fragments() {
    // Every `id` attribute in HTML_INPUT should be reported as a fragment.
    let expected: HashSet<String> = ["content", "inline-code", "emphasis"]
        .iter()
        .map(|frag| (*frag).to_string())
        .collect();

    assert_eq!(extract_html_fragments(HTML_INPUT), expected);
}
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {

View file

@ -1,3 +1,4 @@
//! Extract links and fragments from html documents
pub(crate) mod html5ever;
pub(crate) mod html5gum;
mod srcset;

View file

@ -1,11 +1,11 @@
//! Extract things from markdown documents
//! Extract links and fragments from markdown documents
use std::collections::{HashMap, HashSet};
use pulldown_cmark::{Event, Options, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
use super::html::html5gum::extract_html;
use super::html::html5gum::{extract_html, extract_html_fragments};
/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
@ -80,7 +80,13 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
.collect()
}
/// Extract unparsed URL strings from a Markdown string.
/// Extract fragments/anchors from a Markdown string.
///
/// Fragments are generated from headings using the same unique kebab case method as GitHub.
/// If a [heading attribute](https://github.com/raphlinus/pulldown-cmark/blob/master/specs/heading_attrs.txt)
/// is present,
/// this will be added to the fragment set **alongside** the other generated fragment.
/// It means a single heading such as `## Frag 1 {#frag-2}` would generate two fragments.
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
let mut in_heading = false;
let mut heading = String::new();
@ -112,6 +118,11 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
};
}
// An HTML node
Event::Html(html) => {
out.extend(extract_html_fragments(&html));
}
// Silently skip over other events
_ => (),
}
@ -158,10 +169,12 @@ mod tests {
use super::*;
const MD_INPUT: &str = r#"
# Test
# A Test
Some link in text [here](https://foo.com)
## A test {#well-still-the-same-test}
Code:
```bash
@ -171,8 +184,22 @@ https://bar.com/123
or inline like `https://bar.org` for instance.
[example](http://example.com)
<span id="the-end">The End</span>
"#;
#[test]
fn test_extract_fragments() {
    // Expected fragments: two GitHub-style kebab-case heading ids (the
    // repeated "A test" heading gets a `-1` suffix), the explicit heading
    // attribute, and the id from the inline HTML span.
    let expected: HashSet<String> = [
        "a-test",
        "a-test-1",
        "well-still-the-same-test",
        "the-end",
    ]
    .iter()
    .map(|frag| (*frag).to_string())
    .collect();

    assert_eq!(extract_markdown_fragments(MD_INPUT), expected);
}
#[test]
fn test_skip_verbatim() {
let expected = vec![

View file

@ -1,6 +1,6 @@
use crate::types::{uri::raw::RawUri, FileType, InputContent};
mod html;
pub mod html;
pub mod markdown;
mod plaintext;

View file

@ -4,7 +4,11 @@ use std::{
sync::Arc,
};
use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result};
use crate::{
extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
types::FileType,
Result,
};
use tokio::{fs, sync::Mutex};
use url::Url;
@ -39,13 +43,23 @@ impl FragmentChecker {
///
/// In all other cases, returns true.
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
match (FileType::from(path), url.fragment()) {
(FileType::Markdown, Some(fragment)) => {
let url_without_frag = Self::remove_fragment(url.clone());
self.populate_cache_if_vacant(url_without_frag, path, fragment)
.await
let Some(fragment) = url.fragment() else {
return Ok(true)
};
let url_without_frag = Self::remove_fragment(url.clone());
let extractor = match FileType::from(path) {
FileType::Markdown => extract_markdown_fragments,
FileType::Html => extract_html_fragments,
FileType::Plaintext => return Ok(true),
};
match self.cache.lock().await.entry(url_without_frag) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extractor(&content);
Ok(entry.insert(file_frags).contains(fragment))
}
_ => Ok(true),
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
}
}
@ -53,23 +67,4 @@ impl FragmentChecker {
url.set_fragment(None);
url.into()
}
/// Populates the fragment cache with the given URL if it
/// is not already in the cache.
async fn populate_cache_if_vacant(
&self,
url_without_frag: String,
path: &Path,
fragment: &str,
) -> Result<bool> {
let mut fragment_cache = self.cache.lock().await;
match fragment_cache.entry(url_without_frag.clone()) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extract_markdown_fragments(&content);
Ok(entry.insert(file_frags).contains(fragment))
}
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
}
}
}