Implement excluding code blocks (#523)

This is done in the extractor to avoid unnecessary
allocations.
This commit is contained in:
Matthias 2022-03-26 10:42:56 +01:00 committed by GitHub
parent 5a77209466
commit d616177a99
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 434 additions and 62 deletions

View file

@ -217,6 +217,7 @@ FLAGS:
--exclude-private Exclude private IP address ranges from checking
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
--help Prints help information
--include-verbatim Find links in verbatim sections like `pre`- and `code` blocks
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)

View file

@ -6,7 +6,8 @@ use std::path::PathBuf;
fn extract(paths: &[PathBuf]) {
for path in paths {
let content: InputContent = path.try_into().unwrap();
let extracted = Extractor::extract(&content);
let extractor = Extractor::default();
let extracted = extractor.extract(&content);
println!("{}", extracted.len());
}
}

View file

@ -6,7 +6,8 @@ use std::fs;
#[tokio::main]
async fn main() -> Result<()> {
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
let extractor = Extractor::default();
let links = extractor.extract(&InputContent::from_string(&input, FileType::Html));
println!("{links:#?}");
Ok(())

11
fixtures/TEST_CODE_BLOCKS.md vendored Normal file
View file

@ -0,0 +1,11 @@
# Test Links In Code
```
http://127.0.0.1/block
```
```bash
http://127.0.0.1/bash
```
`http://127.0.0.1/inline` will also be excluded by default

View file

@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
.include_verbatim(opts.config.include_verbatim)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)

View file

@ -300,6 +300,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) skip_missing: bool,
/// Find links in verbatim sections like `pre`- and `code` blocks
#[structopt(long)]
#[serde(default)]
pub(crate) include_verbatim: bool,
/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
@ -375,6 +380,7 @@ impl Config {
base: None;
basic_auth: None;
skip_missing: false;
include_verbatim: false;
glob_ignore_case: false;
output: None;
require_https: false;

View file

@ -9,7 +9,7 @@ mod cli {
use assert_cmd::Command;
use http::StatusCode;
use predicates::str::contains;
use predicates::str::{contains, is_empty};
use pretty_assertions::assert_eq;
use uuid::Uuid;
@ -603,6 +603,37 @@ mod cli {
Ok(())
}
// With `--include-verbatim`, links inside fenced code blocks, language-tagged
// code blocks, and inline code spans must all appear in `--dump` output.
#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();
// Fixture contains one link per verbatim kind: plain fence, bash fence, inline code.
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
cmd.arg("--include-verbatim")
.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(contains("http://127.0.0.1/block"))
.stdout(contains("http://127.0.0.1/inline"))
.stdout(contains("http://127.0.0.1/bash"));
Ok(())
}
// Verbatim sections are excluded by default, so dumping the same fixture
// without `--include-verbatim` must produce no output at all.
#[test]
fn test_exclude_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
cmd.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(is_empty());
Ok(())
}
#[test]
fn test_require_https() -> Result<()> {
let mut cmd = main_command();

View file

@ -13,6 +13,7 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
include_verbatim: bool,
use_html5ever: bool,
}
@ -24,6 +25,7 @@ impl Collector {
base,
skip_missing_inputs: false,
use_html5ever: false,
include_verbatim: false,
}
}
@ -41,6 +43,13 @@ impl Collector {
self
}
/// Whether to extract links from verbatim sections (e.g. Markdown code
/// blocks or preformatted HTML elements such as `pre` and `code`).
/// The default is `false`, i.e. such links are skipped.
#[must_use]
pub const fn include_verbatim(mut self, yes: bool) -> Self {
self.include_verbatim = yes;
self
}
/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@ -63,11 +72,8 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}

View file

@ -1,15 +1,17 @@
use html5ever::{
buffer_queue::BufferQueue,
tendril::StrTendril,
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};
use super::plaintext::extract_plaintext;
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::raw_uri::RawUri;
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
include_verbatim: bool,
inside_excluded_element: bool,
}
impl TokenSink for LinkExtractor {
@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::CharacterTokens(raw) => {
if self.inside_excluded_element {
return TokenSinkResult::Continue;
}
self.links.extend(extract_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
kind: _kind,
kind,
name,
self_closing: _self_closing,
attrs,
} = tag;
if !self.include_verbatim && is_verbatim_elem(&name) {
// Skip content inside excluded elements until we see the end tag.
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
return TokenSinkResult::Continue;
}
for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
&attr.name.local,
&name,
&attr.value,
);
let new_urls = match urls {
@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor {
}
impl LinkExtractor {
pub(crate) fn new() -> Self {
LinkExtractor::default()
/// Build an extractor with an empty link list; `include_verbatim` controls
/// whether text inside verbatim elements is scanned for links.
pub(crate) const fn new(include_verbatim: bool) -> Self {
Self {
links: Vec::new(),
inside_excluded_element: false,
include_verbatim,
}
}
/// Extract all semantically known links from a given html attribute.
@ -75,6 +91,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@ -115,13 +132,75 @@ impl LinkExtractor {
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let mut tokenizer = Tokenizer::new(
LinkExtractor::new(include_verbatim),
TokenizerOpts::default(),
);
let _handle = tokenizer.feed(&mut input);
tokenizer.end();
tokenizer.sink.links
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}

View file

@ -1,5 +1,6 @@
use html5gum::{Emitter, Error, Tokenizer};
use super::is_verbatim_elem;
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
@ -13,6 +14,7 @@ struct LinkExtractor {
current_attribute_name: Vec<u8>,
current_attribute_value: Vec<u8>,
last_start_element: Vec<u8>,
include_verbatim: bool,
}
/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
@ -23,7 +25,7 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
}
impl LinkExtractor {
pub(crate) const fn new() -> Self {
pub(crate) const fn new(include_verbatim: bool) -> Self {
LinkExtractor {
links: Vec::new(),
current_string: Vec::new(),
@ -32,6 +34,7 @@ impl LinkExtractor {
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
include_verbatim,
}
}
@ -45,6 +48,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@ -85,6 +89,13 @@ impl LinkExtractor {
fn flush_current_characters(&mut self) {
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
if !self.include_verbatim && is_verbatim_elem(name) {
// Early return if we don't want to extract links from preformatted text
self.current_string.clear();
return;
}
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
self.links.extend(extract_plaintext(raw));
self.current_string.clear();
@ -94,6 +105,10 @@ impl LinkExtractor {
{
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
if !self.include_verbatim && is_verbatim_elem(name) {
// Early return if we don't want to extract links from preformatted text
return;
}
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
@ -199,9 +214,67 @@ impl Emitter for &mut LinkExtractor {
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new();
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new(include_verbatim);
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
assert!(tokenizer.next().is_none());
extractor.links
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}

View file

@ -1,35 +1,164 @@
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use pulldown_cmark::{Event, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri};
use super::html5gum::extract_html;
/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown(input: &str) -> Vec<RawUri> {
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
// In some cases it is undesirable to extract links from within code blocks,
// which is why we keep track of entries and exits while traversing the input.
let mut inside_code_block = false;
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, uri, _)) => {
vec![RawUri {
.filter_map(|event| match event {
// A link. The first field is the link type, the second the destination URL and the third is a title.
Event::Start(Tag::Link(_, uri, _)) => {
Some(vec![RawUri {
text: uri.to_string(),
// Emulate `<a href="...">` tag here to be compatible with
// HTML links. We might consider using the actual Markdown
// `LinkType` for better granularity in the future
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}]
}])
}
MDEvent::Start(Tag::Image(_, uri, _)) => {
vec![RawUri {
// An image. The first field is the link type, the second the destination URL and the third is a title.
Event::Start(Tag::Image(_, uri, _)) => {
Some(vec![RawUri {
text: uri.to_string(),
// Emulate `<img src="...">` tag here to be compatible with
// HTML links. We might consider using the actual Markdown
// `LinkType` for better granularity in the future
element: Some("img".to_string()),
attribute: Some("src".to_string()),
}]
}])
}
MDEvent::Text(txt) => extract_plaintext(&txt),
MDEvent::Html(html) => extract_plaintext(&html.to_string()),
_ => vec![],
// A code block (inline or fenced).
Event::Start(Tag::CodeBlock(_)) => {
inside_code_block = true;
None
}
Event::End(Tag::CodeBlock(_)) => {
inside_code_block = false;
None
}
// A text node.
Event::Text(txt) => {
if inside_code_block && !include_verbatim {
None
} else {
Some(extract_plaintext(&txt))
}
}
// An HTML node
Event::Html(html) => {
// This won't exclude verbatim links right now, because HTML gets passed in chunks
// by pulldown_cmark. So excluding `<pre>` and `<code>` is not handled right now.
Some(extract_html(&html.to_string(), include_verbatim))
}
// An inline code node.
Event::Code(code) => {
if include_verbatim {
Some(extract_plaintext(&code))
} else {
None
}
}
// Silently skip over other events
_ => None,
})
.flatten()
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
const MD_INPUT: &str = r#"
# Test
Some link in text [here](https://foo.com)
Code:
```bash
https://bar.com/123
```
or inline like `https://bar.org` for instance.
[example](http://example.com)
"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![
RawUri {
text: "https://foo.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "http://example.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
];
let uris = extract_markdown(MD_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://foo.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://bar.com/123".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://bar.org".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://example.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
];
let uris = extract_markdown(MD_INPUT, true);
assert_eq!(uris, expected);
}
// Ignored for now: raw HTML inside Markdown reaches the HTML extractor in
// chunks, so `<pre>`/`<code>` exclusion is not yet applied there (see the
// comment on the `Event::Html` arm in `extract_markdown`).
#[test]
#[ignore]
fn test_skip_verbatim_html() {
let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
</pre>";
let expected = vec![];
let uris = extract_markdown(input, false);
assert_eq!(uris, expected);
}
}

View file

@ -1,44 +1,76 @@
use std::collections::HashSet;
use crate::types::{raw_uri::RawUri, FileType, InputContent};
mod html;
mod html5ever;
mod html5gum;
mod markdown;
mod plaintext;
use markdown::extract_markdown;
use once_cell::sync::Lazy;
use plaintext::extract_plaintext;
/// Check if the given element is an HTML element that is deemed verbatim
/// (i.e. preformatted), such as code samples.
///
/// Links inside these elements are excluded from checking by default; see
/// the `include_verbatim` setting. Matching is case-sensitive against
/// lowercase element names, as in the previous set-based implementation.
///
/// A `matches!` over string literals replaces the lazily-initialized
/// `HashSet<String>`: the set allocated seven `String`s at first use and
/// paid hashing plus lazy-init synchronization on every lookup, all for a
/// fixed seven-element list known at compile time.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    matches!(
        name,
        "pre" | "code" | "textarea" | "samp" | "xmp" | "plaintext" | "listing"
    )
}
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations should be avoided if possible as this is a
/// performance-critical section of the library.
#[derive(Debug, Clone, Copy)]
pub struct Extractor;
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
use_html5ever: bool,
include_verbatim: bool,
}
impl Extractor {
/// Creates a new extractor
///
/// The extractor can be configured with the following settings:
///
/// - `use_html5ever` enables the alternative HTML parser engine html5ever, that
/// is also used in the Servo browser by Mozilla.
/// The default is `html5gum`, which is more performant and well maintained.
///
/// - `include_verbatim` also extracts links inside Markdown code blocks,
/// which are skipped by default (`false`).
/// These can be denoted as a block starting with three backticks or an indented block.
/// For more information, consult the `pulldown_cmark` documentation about code blocks
/// [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
#[must_use]
pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
Self {
use_html5ever,
include_verbatim,
}
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, false)
}
/// Main entrypoint for extracting links from various sources, legacy implementation using
/// html5ever
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, true)
}
#[must_use]
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
match input_content.file_type {
FileType::Markdown => extract_markdown(&input_content.content),
FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim),
FileType::Html => {
if use_html5ever {
html::extract_html(&input_content.content)
if self.use_html5ever {
html5ever::extract_html(&input_content.content, self.include_verbatim)
} else {
html5gum::extract_html(&input_content.content)
html5gum::extract_html(&input_content.content, self.include_verbatim)
}
}
FileType::Plaintext => extract_plaintext(&input_content.content),
@ -63,12 +95,16 @@ mod test {
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
let input_content = InputContent::from_string(input, file_type);
let uris_html5gum = Extractor::extract(&input_content)
let extractor = Extractor::new(false, false);
let uris_html5gum = extractor
.extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
let uris_html5ever = Extractor::extract_html5ever(&input_content)
let extractor = Extractor::new(true, false);
let uris_html5ever = extractor
.extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
@ -183,11 +219,8 @@ mod test {
};
for use_html5ever in [true, false] {
let links = if use_html5ever {
Extractor::extract_html5ever(input_content)
} else {
Extractor::extract(input_content)
};
let extractor = Extractor::new(use_html5ever, false);
let links = extractor.extract(input_content);
let urls = links
.into_iter()