mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Implement excluding code blocks (#523)
This is done in the extractor to avoid unnecessary allocations.
This commit is contained in:
parent
5a77209466
commit
d616177a99
12 changed files with 434 additions and 62 deletions
|
|
@ -217,6 +217,7 @@ FLAGS:
|
|||
--exclude-private Exclude private IP address ranges from checking
|
||||
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
|
||||
--help Prints help information
|
||||
--include-verbatim Find links in verbatim sections like `pre`- and `code` blocks
|
||||
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
|
||||
-n, --no-progress Do not show progress bar.
|
||||
This is recommended for non-interactive shells (e.g. for continuous integration)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ use std::path::PathBuf;
|
|||
fn extract(paths: &[PathBuf]) {
|
||||
for path in paths {
|
||||
let content: InputContent = path.try_into().unwrap();
|
||||
let extracted = Extractor::extract(&content);
|
||||
let extractor = Extractor::default();
|
||||
let extracted = extractor.extract(&content);
|
||||
println!("{}", extracted.len());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ use std::fs;
|
|||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
|
||||
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
|
||||
let extractor = Extractor::default();
|
||||
let links = extractor.extract(&InputContent::from_string(&input, FileType::Html));
|
||||
println!("{links:#?}");
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
11
fixtures/TEST_CODE_BLOCKS.md
vendored
Normal file
11
fixtures/TEST_CODE_BLOCKS.md
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
# Test Links In Code
|
||||
|
||||
```
|
||||
http://127.0.0.1/block
|
||||
```
|
||||
|
||||
```bash
|
||||
http://127.0.0.1/bash
|
||||
```
|
||||
|
||||
`http://127.0.0.1/inline` will also be excluded by default
|
||||
|
|
@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
|
|||
let inputs = opts.inputs();
|
||||
let requests = Collector::new(opts.config.base.clone())
|
||||
.skip_missing_inputs(opts.config.skip_missing)
|
||||
.include_verbatim(opts.config.include_verbatim)
|
||||
// File a bug if you rely on this envvar! It's going to go away eventually.
|
||||
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
|
||||
.collect_links(inputs)
|
||||
|
|
|
|||
|
|
@ -300,6 +300,11 @@ pub(crate) struct Config {
|
|||
#[serde(default)]
|
||||
pub(crate) skip_missing: bool,
|
||||
|
||||
/// Find links in verbatim sections like `pre`- and `code` blocks
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
pub(crate) include_verbatim: bool,
|
||||
|
||||
/// Ignore case when expanding filesystem path glob inputs
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
|
|
@ -375,6 +380,7 @@ impl Config {
|
|||
base: None;
|
||||
basic_auth: None;
|
||||
skip_missing: false;
|
||||
include_verbatim: false;
|
||||
glob_ignore_case: false;
|
||||
output: None;
|
||||
require_https: false;
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ mod cli {
|
|||
|
||||
use assert_cmd::Command;
|
||||
use http::StatusCode;
|
||||
use predicates::str::contains;
|
||||
use predicates::str::{contains, is_empty};
|
||||
use pretty_assertions::assert_eq;
|
||||
use uuid::Uuid;
|
||||
|
||||
|
|
@ -603,6 +603,37 @@ mod cli {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_verbatim() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
|
||||
|
||||
cmd.arg("--include-verbatim")
|
||||
.arg(input)
|
||||
.arg("--dump")
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("http://127.0.0.1/block"))
|
||||
.stdout(contains("http://127.0.0.1/inline"))
|
||||
.stdout(contains("http://127.0.0.1/bash"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_verbatim() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
|
||||
|
||||
cmd.arg(input)
|
||||
.arg("--dump")
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_require_https() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ use par_stream::ParStreamExt;
|
|||
pub struct Collector {
|
||||
base: Option<Base>,
|
||||
skip_missing_inputs: bool,
|
||||
include_verbatim: bool,
|
||||
use_html5ever: bool,
|
||||
}
|
||||
|
||||
|
|
@ -24,6 +25,7 @@ impl Collector {
|
|||
base,
|
||||
skip_missing_inputs: false,
|
||||
use_html5ever: false,
|
||||
include_verbatim: false,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -41,6 +43,13 @@ impl Collector {
|
|||
self
|
||||
}
|
||||
|
||||
/// Include links found in verbatim sections (like Markdown code blocks) when `yes` is `true`
|
||||
#[must_use]
|
||||
pub const fn include_verbatim(mut self, yes: bool) -> Self {
|
||||
self.include_verbatim = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Fetch all unique links from inputs
|
||||
/// All relative URLs get prefixed with `base` (if given).
|
||||
/// (This can be a directory or a base URL)
|
||||
|
|
@ -63,11 +72,8 @@ impl Collector {
|
|||
let base = base.clone();
|
||||
async move {
|
||||
let content = content?;
|
||||
let uris: Vec<RawUri> = if self.use_html5ever {
|
||||
Extractor::extract_html5ever(&content)
|
||||
} else {
|
||||
Extractor::extract(&content)
|
||||
};
|
||||
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
|
||||
let uris: Vec<RawUri> = extractor.extract(&content);
|
||||
let requests = request::create(uris, &content, &base)?;
|
||||
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,17 @@
|
|||
use html5ever::{
|
||||
buffer_queue::BufferQueue,
|
||||
tendril::StrTendril,
|
||||
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
|
||||
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
|
||||
};
|
||||
|
||||
use super::plaintext::extract_plaintext;
|
||||
use super::{is_verbatim_elem, plaintext::extract_plaintext};
|
||||
use crate::types::raw_uri::RawUri;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct LinkExtractor {
|
||||
links: Vec<RawUri>,
|
||||
include_verbatim: bool,
|
||||
inside_excluded_element: bool,
|
||||
}
|
||||
|
||||
impl TokenSink for LinkExtractor {
|
||||
|
|
@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor {
|
|||
#[allow(clippy::match_same_arms)]
|
||||
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
|
||||
match token {
|
||||
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
|
||||
Token::CharacterTokens(raw) => {
|
||||
if self.inside_excluded_element {
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
self.links.extend(extract_plaintext(&raw));
|
||||
}
|
||||
Token::TagToken(tag) => {
|
||||
let Tag {
|
||||
kind: _kind,
|
||||
kind,
|
||||
name,
|
||||
self_closing: _self_closing,
|
||||
attrs,
|
||||
} = tag;
|
||||
if !self.include_verbatim && is_verbatim_elem(&name) {
|
||||
// Skip content inside excluded elements until we see the end tag.
|
||||
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
|
||||
for attr in attrs {
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(
|
||||
attr.name.local.as_ref(),
|
||||
name.as_ref(),
|
||||
attr.value.as_ref(),
|
||||
&attr.name.local,
|
||||
&name,
|
||||
&attr.value,
|
||||
);
|
||||
|
||||
let new_urls = match urls {
|
||||
|
|
@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor {
|
|||
}
|
||||
|
||||
impl LinkExtractor {
|
||||
pub(crate) fn new() -> Self {
|
||||
LinkExtractor::default()
|
||||
pub(crate) const fn new(include_verbatim: bool) -> Self {
|
||||
Self {
|
||||
links: vec![],
|
||||
include_verbatim,
|
||||
inside_excluded_element: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given html attribute.
|
||||
|
|
@ -75,6 +91,7 @@ impl LinkExtractor {
|
|||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
|
||||
match (elem_name, attr_name) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
|
|
@ -115,13 +132,75 @@ impl LinkExtractor {
|
|||
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
|
||||
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
|
||||
let mut input = BufferQueue::new();
|
||||
input.push_back(StrTendril::from(buf));
|
||||
|
||||
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
|
||||
let mut tokenizer = Tokenizer::new(
|
||||
LinkExtractor::new(include_verbatim),
|
||||
TokenizerOpts::default(),
|
||||
);
|
||||
let _handle = tokenizer.feed(&mut input);
|
||||
tokenizer.end();
|
||||
|
||||
tokenizer.sink.links
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const HTML_INPUT: &str = r#"
|
||||
<html>
|
||||
<body>
|
||||
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
|
||||
<pre>
|
||||
Some random text
|
||||
https://foo.com and http://bar.com/some/path
|
||||
Something else
|
||||
</pre>
|
||||
<p><b>bold</b></p>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
#[test]
|
||||
fn test_skip_verbatim() {
|
||||
let expected = vec![RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_verbatim() {
|
||||
let expected = vec![
|
||||
RawUri {
|
||||
text: "https://example.com".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
RawUri {
|
||||
text: "https://foo.com".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "http://bar.com/some/path".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, true);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
use html5gum::{Emitter, Error, Tokenizer};
|
||||
|
||||
use super::is_verbatim_elem;
|
||||
use super::plaintext::extract_plaintext;
|
||||
use crate::types::raw_uri::RawUri;
|
||||
|
||||
|
|
@ -13,6 +14,7 @@ struct LinkExtractor {
|
|||
current_attribute_name: Vec<u8>,
|
||||
current_attribute_value: Vec<u8>,
|
||||
last_start_element: Vec<u8>,
|
||||
include_verbatim: bool,
|
||||
}
|
||||
|
||||
/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
|
||||
|
|
@ -23,7 +25,7 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
|
|||
}
|
||||
|
||||
impl LinkExtractor {
|
||||
pub(crate) const fn new() -> Self {
|
||||
pub(crate) const fn new(include_verbatim: bool) -> Self {
|
||||
LinkExtractor {
|
||||
links: Vec::new(),
|
||||
current_string: Vec::new(),
|
||||
|
|
@ -32,6 +34,7 @@ impl LinkExtractor {
|
|||
current_attribute_name: Vec::new(),
|
||||
current_attribute_value: Vec::new(),
|
||||
last_start_element: Vec::new(),
|
||||
include_verbatim,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -45,6 +48,7 @@ impl LinkExtractor {
|
|||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
|
||||
match (elem_name, attr_name) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
|
|
@ -85,6 +89,13 @@ impl LinkExtractor {
|
|||
|
||||
fn flush_current_characters(&mut self) {
|
||||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
||||
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
||||
if !self.include_verbatim && is_verbatim_elem(name) {
|
||||
// Early return if we don't want to extract links from preformatted text
|
||||
self.current_string.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
|
||||
self.links.extend(extract_plaintext(raw));
|
||||
self.current_string.clear();
|
||||
|
|
@ -94,6 +105,10 @@ impl LinkExtractor {
|
|||
{
|
||||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
||||
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
||||
if !self.include_verbatim && is_verbatim_elem(name) {
|
||||
// Early return if we don't want to extract links from preformatted text
|
||||
return;
|
||||
}
|
||||
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
|
||||
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
|
||||
|
||||
|
|
@ -199,9 +214,67 @@ impl Emitter for &mut LinkExtractor {
|
|||
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
|
||||
let mut extractor = LinkExtractor::new();
|
||||
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
|
||||
let mut extractor = LinkExtractor::new(include_verbatim);
|
||||
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
|
||||
assert!(tokenizer.next().is_none());
|
||||
extractor.links
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const HTML_INPUT: &str = r#"
|
||||
<html>
|
||||
<body>
|
||||
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
|
||||
<pre>
|
||||
Some random text
|
||||
https://foo.com and http://bar.com/some/path
|
||||
Something else
|
||||
</pre>
|
||||
<p><b>bold</b></p>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
#[test]
|
||||
fn test_skip_verbatim() {
|
||||
let expected = vec![RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_verbatim() {
|
||||
let expected = vec![
|
||||
RawUri {
|
||||
text: "https://example.com".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
RawUri {
|
||||
text: "https://foo.com".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "http://bar.com/some/path".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, true);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,35 +1,164 @@
|
|||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use pulldown_cmark::{Event, Parser, Tag};
|
||||
|
||||
use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri};
|
||||
|
||||
use super::html5gum::extract_html;
|
||||
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
pub(crate) fn extract_markdown(input: &str) -> Vec<RawUri> {
|
||||
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
|
||||
// In some cases it is undesirable to extract links from within code blocks,
|
||||
// which is why we keep track of entries and exits while traversing the input.
|
||||
let mut inside_code_block = false;
|
||||
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(Tag::Link(_, uri, _)) => {
|
||||
vec![RawUri {
|
||||
.filter_map(|event| match event {
|
||||
// A link. The first field is the link type, the second the destination URL and the third is a title.
|
||||
Event::Start(Tag::Link(_, uri, _)) => {
|
||||
Some(vec![RawUri {
|
||||
text: uri.to_string(),
|
||||
// Emulate `<a href="...">` tag here to be compatible with
|
||||
// HTML links. We might consider using the actual Markdown
|
||||
// `LinkType` for better granularity in the future
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}]
|
||||
}])
|
||||
}
|
||||
MDEvent::Start(Tag::Image(_, uri, _)) => {
|
||||
vec![RawUri {
|
||||
// An image. The first field is the link type, the second the destination URL and the third is a title.
|
||||
Event::Start(Tag::Image(_, uri, _)) => {
|
||||
Some(vec![RawUri {
|
||||
text: uri.to_string(),
|
||||
// Emulate `<img src="...">` tag here to be compatible with
|
||||
// HTML links. We might consider using the actual Markdown
|
||||
// `LinkType` for better granularity in the future
|
||||
element: Some("img".to_string()),
|
||||
attribute: Some("src".to_string()),
|
||||
}]
|
||||
}])
|
||||
}
|
||||
MDEvent::Text(txt) => extract_plaintext(&txt),
|
||||
MDEvent::Html(html) => extract_plaintext(&html.to_string()),
|
||||
_ => vec![],
|
||||
// A code block (indented or fenced). Inline code arrives as a separate `Event::Code`.
|
||||
Event::Start(Tag::CodeBlock(_)) => {
|
||||
inside_code_block = true;
|
||||
None
|
||||
}
|
||||
Event::End(Tag::CodeBlock(_)) => {
|
||||
inside_code_block = false;
|
||||
None
|
||||
}
|
||||
|
||||
// A text node.
|
||||
Event::Text(txt) => {
|
||||
if inside_code_block && !include_verbatim {
|
||||
None
|
||||
} else {
|
||||
Some(extract_plaintext(&txt))
|
||||
}
|
||||
}
|
||||
|
||||
// An HTML node
|
||||
Event::Html(html) => {
|
||||
// This won't exclude verbatim links right now, because HTML gets passed in chunks
|
||||
// by pulldown_cmark. So excluding `<pre>` and `<code>` is not handled right now.
|
||||
Some(extract_html(&html.to_string(), include_verbatim))
|
||||
}
|
||||
|
||||
// An inline code node.
|
||||
Event::Code(code) => {
|
||||
if include_verbatim {
|
||||
Some(extract_plaintext(&code))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Silently skip over other events
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const MD_INPUT: &str = r#"
|
||||
# Test
|
||||
|
||||
Some link in text [here](https://foo.com)
|
||||
|
||||
Code:
|
||||
|
||||
```bash
|
||||
https://bar.com/123
|
||||
```
|
||||
|
||||
or inline like `https://bar.org` for instance.
|
||||
|
||||
[example](http://example.com)
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
fn test_skip_verbatim() {
|
||||
let expected = vec![
|
||||
RawUri {
|
||||
text: "https://foo.com".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
RawUri {
|
||||
text: "http://example.com".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_markdown(MD_INPUT, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_verbatim() {
|
||||
let expected = vec![
|
||||
RawUri {
|
||||
text: "https://foo.com".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
RawUri {
|
||||
text: "https://bar.com/123".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "https://bar.org".to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "http://example.com".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_markdown(MD_INPUT, true);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_skip_verbatim_html() {
|
||||
let input = "
|
||||
<code>
|
||||
http://link.com
|
||||
</code>
|
||||
<pre>
|
||||
Some pre-formatted http://pre.com
|
||||
</pre>";
|
||||
|
||||
let expected = vec![];
|
||||
|
||||
let uris = extract_markdown(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,44 +1,76 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use crate::types::{raw_uri::RawUri, FileType, InputContent};
|
||||
|
||||
mod html;
|
||||
mod html5ever;
|
||||
mod html5gum;
|
||||
mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
use markdown::extract_markdown;
|
||||
use once_cell::sync::Lazy;
|
||||
use plaintext::extract_plaintext;
|
||||
|
||||
/// HTML elements that are deemed verbatim (i.e. preformatted).
|
||||
/// These will be excluded from link checking by default.
|
||||
static VERBATIM_ELEMENTS: Lazy<HashSet<String>> = Lazy::new(|| {
|
||||
HashSet::from_iter([
|
||||
"pre".into(),
|
||||
"code".into(),
|
||||
"textarea".into(),
|
||||
"samp".into(),
|
||||
"xmp".into(),
|
||||
"plaintext".into(),
|
||||
"listing".into(),
|
||||
])
|
||||
});
|
||||
|
||||
/// Check if the given element is in the list of preformatted tags
|
||||
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
|
||||
VERBATIM_ELEMENTS.contains(name)
|
||||
}
|
||||
|
||||
/// A handler for extracting links from various input formats like Markdown and
|
||||
/// HTML. Allocations should be avoided if possible as this is a
|
||||
/// performance-critical section of the library.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Extractor;
|
||||
#[derive(Default, Debug, Clone, Copy)]
|
||||
pub struct Extractor {
|
||||
use_html5ever: bool,
|
||||
include_verbatim: bool,
|
||||
}
|
||||
|
||||
impl Extractor {
|
||||
/// Creates a new extractor
|
||||
///
|
||||
/// The extractor can be configured with the following settings:
|
||||
///
|
||||
/// - `use_html5ever` enables the alternative HTML parser engine html5ever, that
|
||||
/// is also used in the Servo browser by Mozilla.
|
||||
/// The default is `html5gum`, which is more performant and well maintained.
|
||||
///
|
||||
/// - `include_verbatim` also extracts links inside Markdown code blocks (they are skipped when it is `false`).
|
||||
/// These can be denoted as a block starting with three backticks or an indented block.
|
||||
/// For more information, consult the `pulldown_cmark` documentation about code blocks
|
||||
/// [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
|
||||
#[must_use]
|
||||
pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
|
||||
Self {
|
||||
use_html5ever,
|
||||
include_verbatim,
|
||||
}
|
||||
}
|
||||
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
#[must_use]
|
||||
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
|
||||
Self::extract_impl(input_content, false)
|
||||
}
|
||||
|
||||
/// Main entrypoint for extracting links from various sources, legacy implementation using
|
||||
/// html5ever
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
#[must_use]
|
||||
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
|
||||
Self::extract_impl(input_content, true)
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
|
||||
pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
|
||||
match input_content.file_type {
|
||||
FileType::Markdown => extract_markdown(&input_content.content),
|
||||
FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim),
|
||||
FileType::Html => {
|
||||
if use_html5ever {
|
||||
html::extract_html(&input_content.content)
|
||||
if self.use_html5ever {
|
||||
html5ever::extract_html(&input_content.content, self.include_verbatim)
|
||||
} else {
|
||||
html5gum::extract_html(&input_content.content)
|
||||
html5gum::extract_html(&input_content.content, self.include_verbatim)
|
||||
}
|
||||
}
|
||||
FileType::Plaintext => extract_plaintext(&input_content.content),
|
||||
|
|
@ -63,12 +95,16 @@ mod test {
|
|||
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
|
||||
let input_content = InputContent::from_string(input, file_type);
|
||||
|
||||
let uris_html5gum = Extractor::extract(&input_content)
|
||||
let extractor = Extractor::new(false, false);
|
||||
let uris_html5gum = extractor
|
||||
.extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect();
|
||||
|
||||
let uris_html5ever = Extractor::extract_html5ever(&input_content)
|
||||
let extractor = Extractor::new(true, false);
|
||||
let uris_html5ever = extractor
|
||||
.extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect();
|
||||
|
|
@ -183,11 +219,8 @@ mod test {
|
|||
};
|
||||
|
||||
for use_html5ever in [true, false] {
|
||||
let links = if use_html5ever {
|
||||
Extractor::extract_html5ever(input_content)
|
||||
} else {
|
||||
Extractor::extract(input_content)
|
||||
};
|
||||
let extractor = Extractor::new(use_html5ever, false);
|
||||
let links = extractor.extract(input_content);
|
||||
|
||||
let urls = links
|
||||
.into_iter()
|
||||
|
|
|
|||
Loading…
Reference in a new issue