Check fragments in HTML files (#1198)

* Added html5gum based fragment extractor
* Markdown fragment extractor now extracts fragments from inline html
* Added fragment checks for html file
* Added inline html and html document to fragment checks test
* Improved some comments
* Improved documentation of markdown's fragment extractor.
This commit is contained in:
Hugo McNally 2023-08-22 15:44:45 +01:00 committed by GitHub
parent 9f6f5501fa
commit f59aa61ee3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 119 additions and 42 deletions

22
fixtures/fragments/file.html vendored Normal file
View file

@ -0,0 +1,22 @@
<!DOCTYPE html>
<!-- Test fixture for lychee's fragment checker: provides anchor targets
     (`id` attributes) and fragment links, both valid and intentionally broken. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>For Testing Fragments</title>
</head>
<body>
<!-- Anchor target; also linked from within this file below. -->
<section id="in-the-beginning">
<p>
To start
<!-- Cross-file fragment link into a markdown fixture. -->
<a href="file1.md#fragment-1">
let's run away.
</a>
</p>
</section>
<section>
<!-- Anchor target referenced from the markdown fixtures as file.html#a-word. -->
<p id="a-word">Word</p>
<a href="#in-the-beginning">back we go</a>
<!-- Intentionally dangling: no element with id="in-the-end" exists,
     so the checker is expected to report this link as broken. -->
<a href="#in-the-end">doesn't exist</a>
</section>
</body>
</html>

View file

@ -21,11 +21,11 @@ This is a test file for the fragment loader.
Explicit fragment links are currently not supported.
Therefore we put the test into a code block for now to prevent false positives.
```
<a name="explicit-fragment"></a>
<a id="explicit-fragment"></a>
[Link to explicit fragment](#explicit-fragment)
```
[To the html doc](file.html#a-word)
## Custom Fragments

View file

@ -1427,6 +1427,7 @@ mod cli {
.arg(input)
.assert()
.failure()
.stderr(contains("fixtures/fragments/file1.md#fragment-1"))
.stderr(contains("fixtures/fragments/file1.md#fragment-2"))
.stderr(contains("fixtures/fragments/file2.md#custom-id"))
.stderr(contains("fixtures/fragments/file1.md#missing-fragment"))
@ -1434,12 +1435,15 @@ mod cli {
.stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment"))
.stderr(contains("fixtures/fragments/file2.md#missing-fragment"))
.stderr(contains("fixtures/fragments/empty_file#fragment"))
.stderr(contains("fixtures/fragments/file.html#a-word"))
.stderr(contains("fixtures/fragments/file.html#in-the-beginning"))
.stderr(contains("fixtures/fragments/file.html#in-the-end"))
.stderr(contains(
"fixtures/fragments/file1.md#kebab-case-fragment-1",
))
.stdout(contains("8 Total"))
.stdout(contains("6 OK"))
// 2 failures because of missing fragments
.stdout(contains("2 Errors"));
.stdout(contains("13 Total"))
.stdout(contains("10 OK"))
// 3 failures because of missing fragments
.stdout(contains("3 Errors"));
}
}

View file

@ -1,3 +1,5 @@
use std::collections::HashSet;
use html5gum::{Emitter, Error, State, Tokenizer};
use super::{is_email_link, is_verbatim_elem, srcset};
@ -7,6 +9,7 @@ use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
struct LinkExtractor {
// note: what html5gum calls a tag, lychee calls an element
links: Vec<RawUri>,
fragments: HashSet<String>,
current_string: Vec<u8>,
current_element_name: Vec<u8>,
current_element_is_closing: bool,
@ -26,9 +29,10 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
}
impl LinkExtractor {
pub(crate) const fn new(include_verbatim: bool) -> Self {
pub(crate) fn new(include_verbatim: bool) -> Self {
LinkExtractor {
links: Vec::new(),
fragments: HashSet::new(),
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
@ -181,6 +185,10 @@ impl LinkExtractor {
};
self.links.extend(new_urls);
if attr == "id" {
self.fragments.insert(value.to_string());
}
}
self.current_attribute_name.clear();
@ -288,24 +296,44 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
assert!(tokenizer.next().is_none());
extractor.links
}
/// Extract fragments from `id` attributes within an HTML string.
///
/// `include_verbatim` is set to `true` so that verbatim elements are not
/// skipped while scanning; only the collected fragment set is returned.
pub(crate) fn extract_html_fragments(buf: &str) -> HashSet<String> {
    let mut link_extractor = LinkExtractor::new(true);
    {
        let mut html_tokenizer =
            Tokenizer::new_with_emitter(buf, &mut link_extractor).infallible();
        // The extractor acts as the emitter, so no tokens are expected
        // to be yielded from the stream itself.
        assert!(html_tokenizer.next().is_none());
    }
    link_extractor.fragments
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<body id="content">
<p>This is a paragraph with some inline <code id="inline-code">https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
<a href="https://baz.org">example link inside pre</a>
</pre>
<p><b>bold</b></p>
<p id="emphasis"><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_extract_fragments() {
    // Every `id` attribute in HTML_INPUT should be reported as a fragment.
    let expected: HashSet<String> = ["content", "inline-code", "emphasis"]
        .iter()
        .map(|frag| (*frag).to_string())
        .collect();

    assert_eq!(extract_html_fragments(HTML_INPUT), expected);
}
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {

View file

@ -1,3 +1,4 @@
//! Extract links and fragments from html documents
pub(crate) mod html5ever;
pub(crate) mod html5gum;
mod srcset;

View file

@ -1,11 +1,11 @@
//! Extract things from markdown documents
//! Extract links and fragments from markdown documents
use std::collections::{HashMap, HashSet};
use pulldown_cmark::{Event, Options, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
use super::html::html5gum::extract_html;
use super::html::html5gum::{extract_html, extract_html_fragments};
/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
@ -80,7 +80,13 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
.collect()
}
/// Extract unparsed URL strings from a Markdown string.
/// Extract fragments/anchors from a Markdown string.
///
/// Fragments are generated from headings using the same unique kebab case method as GitHub.
/// If a [heading attribute](https://github.com/raphlinus/pulldown-cmark/blob/master/specs/heading_attrs.txt)
/// is present,
/// this will be added to the fragment set **alongside** the other generated fragment.
/// It means a single heading such as `## Frag 1 {#frag-2}` would generate two fragments.
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
let mut in_heading = false;
let mut heading = String::new();
@ -112,6 +118,11 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
};
}
// An HTML node
Event::Html(html) => {
out.extend(extract_html_fragments(&html));
}
// Silently skip over other events
_ => (),
}
@ -158,10 +169,12 @@ mod tests {
use super::*;
const MD_INPUT: &str = r#"
# Test
# A Test
Some link in text [here](https://foo.com)
## A test {#well-still-the-same-test}
Code:
```bash
@ -171,8 +184,22 @@ https://bar.com/123
or inline like `https://bar.org` for instance.
[example](http://example.com)
<span id="the-end">The End</span>
"#;
#[test]
fn test_extract_fragments() {
    // Expected fragments: two GitHub-style kebab-case heading ids (the
    // repeated "A test" heading gets a `-1` suffix), the explicit heading
    // attribute, and the id from the inline HTML span.
    let expected: HashSet<String> = [
        "a-test",
        "a-test-1",
        "well-still-the-same-test",
        "the-end",
    ]
    .iter()
    .map(|frag| (*frag).to_string())
    .collect();

    assert_eq!(extract_markdown_fragments(MD_INPUT), expected);
}
#[test]
fn test_skip_verbatim() {
let expected = vec![

View file

@ -1,6 +1,6 @@
use crate::types::{uri::raw::RawUri, FileType, InputContent};
mod html;
pub mod html;
pub mod markdown;
mod plaintext;

View file

@ -4,7 +4,11 @@ use std::{
sync::Arc,
};
use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result};
use crate::{
extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
types::FileType,
Result,
};
use tokio::{fs, sync::Mutex};
use url::Url;
@ -39,13 +43,23 @@ impl FragmentChecker {
///
/// In all other cases, returns true.
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
match (FileType::from(path), url.fragment()) {
(FileType::Markdown, Some(fragment)) => {
let url_without_frag = Self::remove_fragment(url.clone());
self.populate_cache_if_vacant(url_without_frag, path, fragment)
.await
let Some(fragment) = url.fragment() else {
return Ok(true)
};
let url_without_frag = Self::remove_fragment(url.clone());
let extractor = match FileType::from(path) {
FileType::Markdown => extract_markdown_fragments,
FileType::Html => extract_html_fragments,
FileType::Plaintext => return Ok(true),
};
match self.cache.lock().await.entry(url_without_frag) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extractor(&content);
Ok(entry.insert(file_frags).contains(fragment))
}
_ => Ok(true),
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
}
}
@ -53,23 +67,4 @@ impl FragmentChecker {
url.set_fragment(None);
url.into()
}
/// Populates the fragment cache with the given URL if it
/// is not already in the cache.
async fn populate_cache_if_vacant(
&self,
url_without_frag: String,
path: &Path,
fragment: &str,
) -> Result<bool> {
let mut fragment_cache = self.cache.lock().await;
match fragment_cache.entry(url_without_frag.clone()) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extract_markdown_fragments(&content);
Ok(entry.insert(file_frags).contains(fragment))
}
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
}
}
}