Add html5gum as alternative link extractor (#480)

html5gum is an HTML parser that offers lower-level control over which tokens actually get created and are tracked. As such, the extractor doesn't allocate any tokens it doesn't care about. On some benchmarks this provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
This commit is contained in:
Markus Unterwaditzer 2022-02-07 22:54:47 +01:00 committed by GitHub
parent 6bf8c1fe39
commit 68d09f7e5b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 326 additions and 53 deletions

16
Cargo.lock generated
View file

@ -1410,6 +1410,15 @@ dependencies = [
"syn",
]
[[package]]
name = "html5gum"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dad48b66db55322add2819ae1d7bda0c32f3415269a08330679dbc8b0afeb30"
dependencies = [
"jetscii",
]
[[package]]
name = "http"
version = "0.2.6"
@ -1638,6 +1647,12 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
[[package]]
name = "jetscii"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9447923c57a8a2d5c1b0875cdf96a6324275df728b498f2ede0e5cbde088a15"
[[package]]
name = "js-sys"
version = "0.3.55"
@ -1812,6 +1827,7 @@ dependencies = [
"futures",
"glob",
"html5ever",
"html5gum",
"http",
"hubcaps",
"jwalk",

View file

@ -20,16 +20,13 @@ async fn main() -> Result<()> {
},
];
let links = Collector::new(
None, // base
false, // don't skip missing inputs
)
.collect_links(
inputs, // base url or directory
)
.await
.collect::<Result<Vec<_>>>()
.await?;
let links = Collector::new(None) // base
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
.use_html5ever(false) // use html5ever for parsing? (default=false)
.collect_links(inputs) // base url or directory
.await
.collect::<Result<Vec<_>>>()
.await?;
dbg!(links);

View file

@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
/// Run lychee on the given inputs
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
.await;

View file

@ -20,7 +20,6 @@ version = "0.8.2"
check-if-email-exists = "0.8.26"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.6"
hubcaps = "0.6.2"
linkify = "0.8.0"
@ -50,6 +49,8 @@ once_cell = "1.9.0"
thiserror = "1.0.30"
futures = "0.3.19"
lazy_static = "1.4.0"
html5ever = "0.25.1"
html5gum = "0.4.0"
[dependencies.par-stream]
version = "0.10.0"

View file

@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
use_html5ever: bool,
}
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
pub const fn new(base: Option<Base>) -> Self {
Collector {
base,
skip_missing_inputs,
skip_missing_inputs: false,
use_html5ever: false,
}
}
/// Skip missing input files (default is to error if they don't exist)
#[must_use]
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
self.skip_missing_inputs = yes;
self
}
/// Use `html5ever` to parse HTML instead of `html5gum`.
#[must_use]
pub const fn use_html5ever(mut self, yes: bool) -> Self {
self.use_html5ever = yes;
self
}
/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@ -47,7 +63,11 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = Extractor::extract(&content);
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
@ -74,7 +94,7 @@ mod test {
// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base, false).collect_links(inputs).await;
let responses = Collector::new(base).collect_links(inputs).await;
responses.map(|r| r.unwrap().uri).collect().await
}

View file

@ -7,7 +7,7 @@ use html5ever::{
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
#[derive(Clone)]
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
}
@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
}
impl LinkExtractor {
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
pub(crate) fn new() -> Self {
LinkExtractor::default()
}
/// Extract all semantically known links from a given html attribute.
@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
tokenizer.sink.links
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
let uris: Vec<String> = extract_html(input)
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect();
assert_eq!(vec![link.to_string()], uris);
}
}

View file

@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
/// Streaming link extractor driven by the `html5gum` tokenizer.
///
/// Rather than consuming a token stream, this type implements
/// `html5gum::Emitter` so tag/attribute bytes are accumulated in the
/// buffers below and turned into [`RawUri`]s on the fly.
#[derive(Clone)]
struct LinkExtractor {
    // note: what html5gum calls a tag, lychee calls an element
    /// All links collected so far (from URL-bearing attributes and from
    /// plaintext scanning of character data and non-URL attribute values).
    links: Vec<RawUri>,
    /// Character data accumulated since the last flush.
    current_string: Vec<u8>,
    /// Name of the element (tag) currently being tokenized.
    current_element_name: Vec<u8>,
    /// Whether the current tag is a closing (`</...>`) or self-closing tag.
    current_element_is_closing: bool,
    /// Name of the attribute currently being tokenized.
    current_attribute_name: Vec<u8>,
    /// Value of the attribute currently being tokenized.
    current_attribute_value: Vec<u8>,
    /// Most recent start-tag name, as reported via `set_last_start_tag`.
    last_start_element: Vec<u8>,
}
/// This is the same as `std::str::from_utf8_unchecked`, but with extra debug
/// assertions for ease of debugging.
///
/// # Safety
///
/// `s` must be valid UTF-8. In this module that holds because the html5gum
/// tokenizer is fed a `&str`, so every byte buffer it hands back contains
/// valid UTF-8 (see the SAFETY comments at the call sites).
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    // Cheap insurance in debug builds; compiles away in release.
    debug_assert!(std::str::from_utf8(s).is_ok());
    std::str::from_utf8_unchecked(s)
}
impl LinkExtractor {
    /// Create a new extractor with all buffers empty.
    pub(crate) const fn new() -> Self {
        LinkExtractor {
            links: Vec::new(),
            current_string: Vec::new(),
            current_element_name: Vec::new(),
            current_element_is_closing: false,
            current_attribute_name: Vec::new(),
            current_attribute_value: Vec::new(),
            last_start_element: Vec::new(),
        }
    }

    /// Extract all semantically known links from a given html attribute.
    ///
    /// Returns `None` when the element/attribute combination is not known to
    /// carry a URL; the caller then falls back to plaintext scanning of the
    /// attribute value.
    #[allow(clippy::unnested_or_patterns)]
    pub(crate) fn extract_urls_from_elem_attr<'a>(
        attr_name: &str,
        elem_name: &str,
        attr_value: &'a str,
    ) -> Option<impl Iterator<Item = &'a str>> {
        // For a comprehensive list of elements that might contain URLs/URIs
        // see https://www.w3.org/TR/REC-html40/index/attributes.html
        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
        match (elem_name, attr_name) {
            // Common element/attribute combinations for links
            (_, "href" | "src" | "cite" | "usemap")
            // Less common (but still valid!) combinations
            | ("applet", "codebase")
            | ("body", "background")
            | ("button", "formaction")
            | ("command", "icon")
            | ("form", "action")
            | ("frame", "longdesc")
            | ("head", "profile")
            | ("html", "manifest")
            | ("iframe", "longdesc")
            | ("img", "longdesc")
            | ("input", "formaction")
            | ("object", "classid")
            | ("object", "codebase")
            | ("object", "data")
            | ("video", "poster") => {
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
                // `srcset` is a comma-separated list of "image candidate
                // strings"; each candidate is a URL optionally followed by a
                // whitespace-separated width/density descriptor, so only the
                // first whitespace-delimited token of each candidate is a URL.
                // `split_ascii_whitespace` never yields empty strings, so no
                // explicit empty check is needed.
                let urls: Vec<&str> = attr_value
                    .trim()
                    .split(',')
                    .filter_map(|candidate| candidate.split_ascii_whitespace().next())
                    .collect();
                Some(urls.into_iter())
            }
            _ => None,
        }
    }

    /// Scan any buffered character data for bare URLs and reset the buffer.
    fn flush_current_characters(&mut self) {
        // SAFETY: since we feed html5gum tokenizer with a &str, this must be a &str as well.
        let raw = unsafe { from_utf8_unchecked(&self.current_string) };
        self.links.extend(extract_plaintext(raw));
        self.current_string.clear();
    }

    /// Turn the currently buffered attribute into links — via
    /// [`Self::extract_urls_from_elem_attr`] when the element/attribute pair
    /// is URL-bearing, via plaintext scanning otherwise — then reset the
    /// attribute buffers for the next attribute.
    fn flush_old_attribute(&mut self) {
        {
            // SAFETY: since we feed html5gum tokenizer with a &str, these must be &str as well.
            let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
            let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
            let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
            let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
            let new_urls = match urls {
                None => extract_plaintext(value),
                Some(urls) => urls
                    .into_iter()
                    .map(|url| RawUri {
                        text: url.to_string(),
                        element: Some(name.to_string()),
                        attribute: Some(attr.to_string()),
                    })
                    .collect::<Vec<_>>(),
            };
            self.links.extend(new_urls);
        }
        self.current_attribute_name.clear();
        self.current_attribute_value.clear();
    }
}
// Implementing `Emitter` directly (instead of consuming html5gum's default
// token stream) means no token objects are ever allocated: `Token = ()`,
// `pop_token` always returns `None`, and links are collected purely as a
// side effect of the callbacks below writing into the extractor's buffers.
impl Emitter for &mut LinkExtractor {
    type Token = ();

    // Record the most recent start tag; consumed by
    // `current_is_appropriate_end_tag_token` below.
    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_element.clear();
        self.last_start_element
            .extend(last_start_tag.unwrap_or_default());
    }

    // End of input: scan any remaining character data for bare URLs.
    fn emit_eof(&mut self) {
        self.flush_current_characters();
    }

    // Tokenizer errors are irrelevant for link extraction; ignore them.
    fn emit_error(&mut self, _: Error) {}

    // Never produce a token; extraction happens entirely via side effects.
    fn pop_token(&mut self) -> Option<()> {
        None
    }

    // Buffer character data until the next flush.
    fn emit_string(&mut self, c: &[u8]) {
        self.current_string.extend(c);
    }

    // A new tag begins: flush preceding character data and reset tag state.
    fn init_start_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = false;
    }

    fn init_end_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = true;
    }

    fn init_comment(&mut self) {
        self.flush_current_characters();
    }

    // The tag is complete: flush its last buffered attribute (earlier
    // attributes were already flushed by `init_attribute`).
    fn emit_current_tag(&mut self) {
        self.flush_old_attribute();
    }

    fn emit_current_doctype(&mut self) {}

    fn set_self_closing(&mut self) {
        self.current_element_is_closing = true;
    }

    fn set_force_quirks(&mut self) {}

    fn push_tag_name(&mut self, s: &[u8]) {
        self.current_element_name.extend(s);
    }

    // Comments and doctypes cannot carry links we track; discard their data.
    fn push_comment(&mut self, _: &[u8]) {}

    fn push_doctype_name(&mut self, _: &[u8]) {}

    fn init_doctype(&mut self) {
        self.flush_current_characters();
    }

    // A new attribute begins: finish off the previous one first.
    fn init_attribute(&mut self) {
        self.flush_old_attribute();
    }

    fn push_attribute_name(&mut self, s: &[u8]) {
        self.current_attribute_name.extend(s);
    }

    fn push_attribute_value(&mut self, s: &[u8]) {
        self.current_attribute_value.extend(s);
    }

    fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_system_identifier(&mut self, _: &[u8]) {}

    // True iff the current end tag matches the last start tag seen —
    // presumably queried by the tokenizer for "appropriate end tag" handling
    // of raw-text elements (e.g. `</script>`); confirm against html5gum docs.
    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        self.current_element_is_closing
            && !self.current_element_name.is_empty()
            && self.current_element_name == self.last_start_element
    }

    fn emit_current_comment(&mut self) {}
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
    let mut extractor = LinkExtractor::new();
    let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
    // Drive the tokenizer to completion. Our emitter's `pop_token` always
    // returns `None`, so the iterator must yield nothing; links accumulate
    // as a side effect inside `extractor`.
    assert!(tokenizer.next().is_none());
    extractor.links
}

View file

@ -1,10 +1,10 @@
use crate::types::{raw_uri::RawUri, FileType, InputContent};
mod html;
mod html5gum;
mod markdown;
mod plaintext;
use html::extract_html;
use markdown::extract_markdown;
use plaintext::extract_plaintext;
@ -19,9 +19,28 @@ impl Extractor {
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, false)
}
/// Main entrypoint for extracting links from various sources, legacy implementation using
/// html5ever
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, true)
}
#[must_use]
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
match input_content.file_type {
FileType::Markdown => extract_markdown(&input_content.content),
FileType::Html => extract_html(&input_content.content),
FileType::Html => {
if use_html5ever {
html::extract_html(&input_content.content)
} else {
html5gum::extract_html(&input_content.content)
}
}
FileType::Plaintext => extract_plaintext(&input_content.content),
}
}
@ -43,10 +62,19 @@ mod test {
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
let input_content = InputContent::from_string(input, file_type);
Extractor::extract(&input_content)
let uris_html5gum = Extractor::extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect()
.collect();
let uris_html5ever = Extractor::extract_html5ever(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
assert_eq!(uris_html5gum, uris_html5ever);
uris_html5gum
}
#[test]
@ -154,19 +182,26 @@ mod test {
content: contents.to_string(),
};
let links = Extractor::extract(input_content);
let urls = links
.into_iter()
.map(|raw_uri| raw_uri.text)
for use_html5ever in [true, false] {
let links = if use_html5ever {
Extractor::extract_html5ever(input_content)
} else {
Extractor::extract(input_content)
};
let urls = links
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect::<HashSet<_>>();
let expected_urls = IntoIterator::into_iter([
String::from("https://github.com/lycheeverse/lychee/"),
String::from("/about"),
])
.collect::<HashSet<_>>();
let expected_urls = IntoIterator::into_iter([
String::from("https://github.com/lycheeverse/lychee/"),
String::from("/about"),
])
.collect::<HashSet<_>>();
assert_eq!(urls, expected_urls);
assert_eq!(urls, expected_urls);
}
}
#[test]
@ -242,4 +277,16 @@ mod test {
assert_eq!(links, expected_links);
}
#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let links = extract_uris(input, FileType::Plaintext);
let expected_links =
IntoIterator::into_iter([website("https://www.apache.org/licenses/LICENSE-2.0")])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
}

View file

@ -1,4 +1,3 @@
use html5ever::tendril::StrTendril;
use log::info;
use percent_encoding::percent_decode_str;
use reqwest::Url;
@ -28,7 +27,7 @@ pub(crate) fn create(
.into_iter()
.map(|raw_uri| {
let is_anchor = raw_uri.is_anchor();
let text = StrTendril::from(raw_uri.text.clone());
let text = raw_uri.text.clone();
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();