mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-02 02:44:45 +00:00
Check fragments in HTML files (#1198)
* Added html5gum-based fragment extractor
* Markdown fragment extractor now extracts fragments from inline HTML
* Added fragment checks for HTML files
* Added inline HTML and HTML document to fragment checks test
* Improved some comments
* Improved documentation of Markdown's fragment extractor
This commit is contained in:
parent
9f6f5501fa
commit
f59aa61ee3
8 changed files with 119 additions and 42 deletions
22
fixtures/fragments/file.html
vendored
Normal file
22
fixtures/fragments/file.html
vendored
Normal file
|
|
@@ -0,0 +1,22 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>For Testing Fragments</title>
|
||||
</head>
|
||||
<body>
|
||||
<section id="in-the-beginning">
|
||||
<p>
|
||||
To start
|
||||
<a href="file1.md#fragment-1">
|
||||
let's run away.
|
||||
</a>
|
||||
</p>
|
||||
</section>
|
||||
<section>
|
||||
<p id="a-word">Word</p>
|
||||
<a href="#in-the-beginning">back we go</a>
|
||||
<a href="#in-the-end">doesn't exist</a>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
||||
6
fixtures/fragments/file1.md
vendored
6
fixtures/fragments/file1.md
vendored
|
|
@@ -21,11 +21,11 @@ This is a test file for the fragment loader.
|
|||
Explicit fragment links are currently not supported.
|
||||
Therefore we put the test into a code block for now to prevent false positives.
|
||||
|
||||
```
|
||||
<a name="explicit-fragment"></a>
|
||||
<a id="explicit-fragment"></a>
|
||||
|
||||
[Link to explicit fragment](#explicit-fragment)
|
||||
```
|
||||
|
||||
[To the html doc](file.html#a-word)
|
||||
|
||||
## Custom Fragments
|
||||
|
||||
|
|
|
|||
|
|
@@ -1427,6 +1427,7 @@ mod cli {
|
|||
.arg(input)
|
||||
.assert()
|
||||
.failure()
|
||||
.stderr(contains("fixtures/fragments/file1.md#fragment-1"))
|
||||
.stderr(contains("fixtures/fragments/file1.md#fragment-2"))
|
||||
.stderr(contains("fixtures/fragments/file2.md#custom-id"))
|
||||
.stderr(contains("fixtures/fragments/file1.md#missing-fragment"))
|
||||
|
|
@@ -1434,12 +1435,15 @@ mod cli {
|
|||
.stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment"))
|
||||
.stderr(contains("fixtures/fragments/file2.md#missing-fragment"))
|
||||
.stderr(contains("fixtures/fragments/empty_file#fragment"))
|
||||
.stderr(contains("fixtures/fragments/file.html#a-word"))
|
||||
.stderr(contains("fixtures/fragments/file.html#in-the-beginning"))
|
||||
.stderr(contains("fixtures/fragments/file.html#in-the-end"))
|
||||
.stderr(contains(
|
||||
"fixtures/fragments/file1.md#kebab-case-fragment-1",
|
||||
))
|
||||
.stdout(contains("8 Total"))
|
||||
.stdout(contains("6 OK"))
|
||||
// 2 failures because of missing fragments
|
||||
.stdout(contains("2 Errors"));
|
||||
.stdout(contains("13 Total"))
|
||||
.stdout(contains("10 OK"))
|
||||
// 3 failures because of missing fragments
|
||||
.stdout(contains("3 Errors"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -1,3 +1,5 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use html5gum::{Emitter, Error, State, Tokenizer};
|
||||
|
||||
use super::{is_email_link, is_verbatim_elem, srcset};
|
||||
|
|
@@ -7,6 +9,7 @@ use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
|
|||
struct LinkExtractor {
|
||||
// note: what html5gum calls a tag, lychee calls an element
|
||||
links: Vec<RawUri>,
|
||||
fragments: HashSet<String>,
|
||||
current_string: Vec<u8>,
|
||||
current_element_name: Vec<u8>,
|
||||
current_element_is_closing: bool,
|
||||
|
|
@@ -26,9 +29,10 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
|
|||
}
|
||||
|
||||
impl LinkExtractor {
|
||||
pub(crate) const fn new(include_verbatim: bool) -> Self {
|
||||
pub(crate) fn new(include_verbatim: bool) -> Self {
|
||||
LinkExtractor {
|
||||
links: Vec::new(),
|
||||
fragments: HashSet::new(),
|
||||
current_string: Vec::new(),
|
||||
current_element_name: Vec::new(),
|
||||
current_element_is_closing: false,
|
||||
|
|
@@ -181,6 +185,10 @@ impl LinkExtractor {
|
|||
};
|
||||
|
||||
self.links.extend(new_urls);
|
||||
|
||||
if attr == "id" {
|
||||
self.fragments.insert(value.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
self.current_attribute_name.clear();
|
||||
|
|
@@ -288,24 +296,44 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
|
|||
assert!(tokenizer.next().is_none());
|
||||
extractor.links
|
||||
}
|
||||
|
||||
/// Extract fragments from id attributes within a HTML string.
|
||||
pub(crate) fn extract_html_fragments(buf: &str) -> HashSet<String> {
|
||||
let mut extractor = LinkExtractor::new(true);
|
||||
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
|
||||
assert!(tokenizer.next().is_none());
|
||||
extractor.fragments
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const HTML_INPUT: &str = r#"
|
||||
<html>
|
||||
<body>
|
||||
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
|
||||
<body id="content">
|
||||
<p>This is a paragraph with some inline <code id="inline-code">https://example.com</code> and a normal <a href="https://example.org">example</a></p>
|
||||
<pre>
|
||||
Some random text
|
||||
https://foo.com and http://bar.com/some/path
|
||||
Something else
|
||||
<a href="https://baz.org">example link inside pre</a>
|
||||
</pre>
|
||||
<p><b>bold</b></p>
|
||||
<p id="emphasis"><b>bold</b></p>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
#[test]
|
||||
fn test_extract_fragments() {
|
||||
let expected = HashSet::from([
|
||||
"content".to_string(),
|
||||
"inline-code".to_string(),
|
||||
"emphasis".to_string(),
|
||||
]);
|
||||
let actual = extract_html_fragments(HTML_INPUT);
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_verbatim() {
|
||||
let expected = vec
|
||||
/// is present,
|
||||
/// this will be added to the fragment set **alongside** the other generated fragment.
|
||||
/// It means a single heading such as `## Frag 1 {#frag-2}` would generate two fragments.
|
||||
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
|
||||
let mut in_heading = false;
|
||||
let mut heading = String::new();
|
||||
|
|
@@ -112,6 +118,11 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
|
|||
};
|
||||
}
|
||||
|
||||
// An HTML node
|
||||
Event::Html(html) => {
|
||||
out.extend(extract_html_fragments(&html));
|
||||
}
|
||||
|
||||
// Silently skip over other events
|
||||
_ => (),
|
||||
}
|
||||
|
|
@@ -158,10 +169,12 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
const MD_INPUT: &str = r#"
|
||||
# Test
|
||||
# A Test
|
||||
|
||||
Some link in text [here](https://foo.com)
|
||||
|
||||
## A test {#well-still-the-same-test}
|
||||
|
||||
Code:
|
||||
|
||||
```bash
|
||||
|
|
@@ -171,8 +184,22 @@ https://bar.com/123
|
|||
or inline like `https://bar.org` for instance.
|
||||
|
||||
[example](http://example.com)
|
||||
|
||||
<span id="the-end">The End</span>
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
fn test_extract_fragments() {
|
||||
let expected = HashSet::from([
|
||||
"a-test".to_string(),
|
||||
"a-test-1".to_string(),
|
||||
"well-still-the-same-test".to_string(),
|
||||
"the-end".to_string(),
|
||||
]);
|
||||
let actual = extract_markdown_fragments(MD_INPUT);
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_verbatim() {
|
||||
let expected = vec![
|
||||
|
|
|
|||
|
|
@@ -1,6 +1,6 @@
|
|||
use crate::types::{uri::raw::RawUri, FileType, InputContent};
|
||||
|
||||
mod html;
|
||||
pub mod html;
|
||||
pub mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
|
|
|
|||
|
|
@@ -4,7 +4,11 @@ use std::{
|
|||
sync::Arc,
|
||||
};
|
||||
|
||||
use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result};
|
||||
use crate::{
|
||||
extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
|
||||
types::FileType,
|
||||
Result,
|
||||
};
|
||||
use tokio::{fs, sync::Mutex};
|
||||
use url::Url;
|
||||
|
||||
|
|
@@ -39,13 +43,23 @@ impl FragmentChecker {
|
|||
///
|
||||
/// In all other cases, returns true.
|
||||
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
|
||||
match (FileType::from(path), url.fragment()) {
|
||||
(FileType::Markdown, Some(fragment)) => {
|
||||
let url_without_frag = Self::remove_fragment(url.clone());
|
||||
self.populate_cache_if_vacant(url_without_frag, path, fragment)
|
||||
.await
|
||||
let Some(fragment) = url.fragment() else {
|
||||
return Ok(true)
|
||||
};
|
||||
let url_without_frag = Self::remove_fragment(url.clone());
|
||||
|
||||
let extractor = match FileType::from(path) {
|
||||
FileType::Markdown => extract_markdown_fragments,
|
||||
FileType::Html => extract_html_fragments,
|
||||
FileType::Plaintext => return Ok(true),
|
||||
};
|
||||
match self.cache.lock().await.entry(url_without_frag) {
|
||||
Entry::Vacant(entry) => {
|
||||
let content = fs::read_to_string(path).await?;
|
||||
let file_frags = extractor(&content);
|
||||
Ok(entry.insert(file_frags).contains(fragment))
|
||||
}
|
||||
_ => Ok(true),
|
||||
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -53,23 +67,4 @@ impl FragmentChecker {
|
|||
url.set_fragment(None);
|
||||
url.into()
|
||||
}
|
||||
|
||||
/// Populates the fragment cache with the given URL if it
|
||||
/// is not already in the cache.
|
||||
async fn populate_cache_if_vacant(
|
||||
&self,
|
||||
url_without_frag: String,
|
||||
path: &Path,
|
||||
fragment: &str,
|
||||
) -> Result<bool> {
|
||||
let mut fragment_cache = self.cache.lock().await;
|
||||
match fragment_cache.entry(url_without_frag.clone()) {
|
||||
Entry::Vacant(entry) => {
|
||||
let content = fs::read_to_string(path).await?;
|
||||
let file_frags = extract_markdown_fragments(&content);
|
||||
Ok(entry.insert(file_frags).contains(fragment))
|
||||
}
|
||||
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue