mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-25 07:24:46 +00:00
Add html5gum as alternative link extractor (#480)
html5gum is an HTML parser that offers lower-level control over which tokens actually get created and are tracked. As such, the extractor doesn't allocate tokens it doesn't care about. On some benchmarks it provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
This commit is contained in:
parent
6bf8c1fe39
commit
68d09f7e5b
9 changed files with 326 additions and 53 deletions
16
Cargo.lock
generated
16
Cargo.lock
generated
|
|
@ -1410,6 +1410,15 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5gum"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dad48b66db55322add2819ae1d7bda0c32f3415269a08330679dbc8b0afeb30"
|
||||
dependencies = [
|
||||
"jetscii",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.6"
|
||||
|
|
@ -1638,6 +1647,12 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
|
||||
|
||||
[[package]]
|
||||
name = "jetscii"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9447923c57a8a2d5c1b0875cdf96a6324275df728b498f2ede0e5cbde088a15"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.55"
|
||||
|
|
@ -1812,6 +1827,7 @@ dependencies = [
|
|||
"futures",
|
||||
"glob",
|
||||
"html5ever",
|
||||
"html5gum",
|
||||
"http",
|
||||
"hubcaps",
|
||||
"jwalk",
|
||||
|
|
|
|||
|
|
@ -20,16 +20,13 @@ async fn main() -> Result<()> {
|
|||
},
|
||||
];
|
||||
|
||||
let links = Collector::new(
|
||||
None, // base
|
||||
false, // don't skip missing inputs
|
||||
)
|
||||
.collect_links(
|
||||
inputs, // base url or directory
|
||||
)
|
||||
.await
|
||||
.collect::<Result<Vec<_>>>()
|
||||
.await?;
|
||||
let links = Collector::new(None) // base
|
||||
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
|
||||
.use_html5ever(false) // use html5ever for parsing? (default=false)
|
||||
.collect_links(inputs) // base url or directory
|
||||
.await
|
||||
.collect::<Result<Vec<_>>>()
|
||||
.await?;
|
||||
|
||||
dbg!(links);
|
||||
|
||||
|
|
|
|||
|
|
@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
|
|||
/// Run lychee on the given inputs
|
||||
async fn run(opts: &LycheeOptions) -> Result<i32> {
|
||||
let inputs = opts.inputs();
|
||||
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
|
||||
let requests = Collector::new(opts.config.base.clone())
|
||||
.skip_missing_inputs(opts.config.skip_missing)
|
||||
// File a bug if you rely on this envvar! It's going to go away eventually.
|
||||
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
|
||||
.collect_links(inputs)
|
||||
.await;
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ version = "0.8.2"
|
|||
check-if-email-exists = "0.8.26"
|
||||
fast_chemail = "0.9.6"
|
||||
glob = "0.3.0"
|
||||
html5ever = "0.25.1"
|
||||
http = "0.2.6"
|
||||
hubcaps = "0.6.2"
|
||||
linkify = "0.8.0"
|
||||
|
|
@ -50,6 +49,8 @@ once_cell = "1.9.0"
|
|||
thiserror = "1.0.30"
|
||||
futures = "0.3.19"
|
||||
lazy_static = "1.4.0"
|
||||
html5ever = "0.25.1"
|
||||
html5gum = "0.4.0"
|
||||
|
||||
[dependencies.par-stream]
|
||||
version = "0.10.0"
|
||||
|
|
|
|||
|
|
@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
|
|||
pub struct Collector {
|
||||
base: Option<Base>,
|
||||
skip_missing_inputs: bool,
|
||||
use_html5ever: bool,
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
/// Create a new collector with an empty cache
|
||||
#[must_use]
|
||||
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
|
||||
pub const fn new(base: Option<Base>) -> Self {
|
||||
Collector {
|
||||
base,
|
||||
skip_missing_inputs,
|
||||
skip_missing_inputs: false,
|
||||
use_html5ever: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Skip missing input files (default is to error if they don't exist)
|
||||
#[must_use]
|
||||
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
|
||||
self.skip_missing_inputs = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Use `html5ever` to parse HTML instead of `html5gum`.
|
||||
#[must_use]
|
||||
pub const fn use_html5ever(mut self, yes: bool) -> Self {
|
||||
self.use_html5ever = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Fetch all unique links from inputs
|
||||
/// All relative URLs get prefixed with `base` (if given).
|
||||
/// (This can be a directory or a base URL)
|
||||
|
|
@ -47,7 +63,11 @@ impl Collector {
|
|||
let base = base.clone();
|
||||
async move {
|
||||
let content = content?;
|
||||
let uris: Vec<RawUri> = Extractor::extract(&content);
|
||||
let uris: Vec<RawUri> = if self.use_html5ever {
|
||||
Extractor::extract_html5ever(&content)
|
||||
} else {
|
||||
Extractor::extract(&content)
|
||||
};
|
||||
let requests = request::create(uris, &content, &base)?;
|
||||
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
|
||||
}
|
||||
|
|
@ -74,7 +94,7 @@ mod test {
|
|||
|
||||
// Helper function to run the collector on the given inputs
|
||||
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
|
||||
let responses = Collector::new(base, false).collect_links(inputs).await;
|
||||
let responses = Collector::new(base).collect_links(inputs).await;
|
||||
responses.map(|r| r.unwrap().uri).collect().await
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ use html5ever::{
|
|||
use super::plaintext::extract_plaintext;
|
||||
use crate::types::raw_uri::RawUri;
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Default)]
|
||||
struct LinkExtractor {
|
||||
links: Vec<RawUri>,
|
||||
}
|
||||
|
|
@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
|
|||
}
|
||||
|
||||
impl LinkExtractor {
|
||||
pub(crate) const fn new() -> Self {
|
||||
Self { links: Vec::new() }
|
||||
pub(crate) fn new() -> Self {
|
||||
LinkExtractor::default()
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given html attribute.
|
||||
|
|
@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
|
|||
|
||||
tokenizer.sink.links
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_extract_link_at_end_of_line() {
|
||||
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let link = input.trim_end();
|
||||
|
||||
let uris: Vec<String> = extract_html(input)
|
||||
.into_iter()
|
||||
.map(|raw_uri| raw_uri.text)
|
||||
.collect();
|
||||
assert_eq!(vec![link.to_string()], uris);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
207
lychee-lib/src/extract/html5gum.rs
Normal file
207
lychee-lib/src/extract/html5gum.rs
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
use html5gum::{Emitter, Error, Tokenizer};
|
||||
|
||||
use super::plaintext::extract_plaintext;
|
||||
use crate::types::raw_uri::RawUri;
|
||||
|
||||
/// Accumulates links discovered while the html5gum tokenizer streams
/// through an HTML document.
///
/// The tokenizer calls back into this struct via its [`Emitter`]
/// implementation; the `current_*` fields are scratch buffers for the
/// token currently being assembled.
#[derive(Clone)]
struct LinkExtractor {
    // note: what html5gum calls a tag, lychee calls an element
    /// All links extracted so far, from both attributes and plain text.
    links: Vec<RawUri>,
    /// Character data accumulated between tags; scanned for plaintext
    /// links when flushed.
    current_string: Vec<u8>,
    /// Name of the element currently being tokenized.
    current_element_name: Vec<u8>,
    /// Whether the current element is a closing tag (or was marked
    /// self-closing by the tokenizer).
    current_element_is_closing: bool,
    /// Name of the attribute currently being tokenized.
    current_attribute_name: Vec<u8>,
    /// Value of the attribute currently being tokenized.
    current_attribute_value: Vec<u8>,
    /// Most recent start-tag name reported by the tokenizer; compared
    /// against the current element name to detect matching end tags.
    last_start_element: Vec<u8>,
}
|
||||
|
||||
/// This is the same as `std::str::from_utf8_unchecked`, but with an extra
/// debug assertion for ease of debugging.
///
/// # Safety
///
/// `s` must hold valid UTF-8. Debug builds check this via the assertion
/// below; release builds invoke undefined behavior on invalid input.
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    debug_assert!(std::str::from_utf8(s).is_ok());
    std::str::from_utf8_unchecked(s)
}
|
||||
|
||||
impl LinkExtractor {
    /// Create an empty extractor with all scratch buffers cleared.
    pub(crate) const fn new() -> Self {
        LinkExtractor {
            links: Vec::new(),
            current_string: Vec::new(),
            current_element_name: Vec::new(),
            current_element_is_closing: false,
            current_attribute_name: Vec::new(),
            current_attribute_value: Vec::new(),
            last_start_element: Vec::new(),
        }
    }

    /// Extract all semantically known links from a given html attribute.
    ///
    /// Returns `None` when the element/attribute pair is not known to
    /// carry URLs, so the caller can fall back to plaintext scanning.
    #[allow(clippy::unnested_or_patterns)]
    pub(crate) fn extract_urls_from_elem_attr<'a>(
        attr_name: &str,
        elem_name: &str,
        attr_value: &'a str,
    ) -> Option<impl Iterator<Item = &'a str>> {
        // For a comprehensive list of elements that might contain URLs/URIs
        // see https://www.w3.org/TR/REC-html40/index/attributes.html
        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
        match (elem_name, attr_name) {
            // Common element/attribute combinations for links
            (_, "href" | "src" | "cite" | "usemap")
            // Less common (but still valid!) combinations
            | ("applet", "codebase")
            | ("body", "background")
            | ("button", "formaction")
            | ("command", "icon")
            | ("form", "action")
            | ("frame", "longdesc")
            | ("head", "profile")
            | ("html", "manifest")
            | ("iframe", "longdesc")
            | ("img", "longdesc")
            | ("input", "formaction")
            | ("object", "classid")
            | ("object", "codebase")
            | ("object", "data")
            | ("video", "poster") => {
                // Vec is needed so both match arms yield the same
                // iterator type (`vec::IntoIter<&str>`).
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
                // srcset is a comma-separated list of image candidate
                // strings; the URL is the first whitespace-separated part
                // of each candidate (the rest are width/density descriptors).
                let mut urls = Vec::new();
                for image_candidate_string in attr_value.trim().split(',') {
                    for part in image_candidate_string.split_ascii_whitespace() {
                        if part.is_empty() {
                            continue;
                        }
                        urls.push(part);
                        break;
                    }
                }
                Some(urls.into_iter())
            }
            _ => None,
        }
    }

    /// Scan the buffered character data for plaintext links, then clear
    /// the buffer.
    fn flush_current_characters(&mut self) {
        // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
        let raw = unsafe { from_utf8_unchecked(&self.current_string) };
        self.links.extend(extract_plaintext(raw));
        self.current_string.clear();
    }

    /// Extract links from the buffered attribute name/value pair (if it
    /// is a known URL-carrying combination, otherwise fall back to
    /// plaintext scanning of the value), then clear the attribute buffers.
    fn flush_old_attribute(&mut self) {
        {
            // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
            let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
            let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
            let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

            let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);

            let new_urls = match urls {
                // Unknown attribute: the value may still contain a bare URL.
                None => extract_plaintext(value),
                Some(urls) => urls
                    .into_iter()
                    .map(|url| RawUri {
                        text: url.to_string(),
                        element: Some(name.to_string()),
                        attribute: Some(attr.to_string()),
                    })
                    .collect::<Vec<_>>(),
            };

            self.links.extend(new_urls);
        }

        // The element name buffer is deliberately kept: one element can
        // carry several attributes, each flushed in turn.
        self.current_attribute_name.clear();
        self.current_attribute_value.clear();
    }
}
|
||||
|
||||
/// An [`Emitter`] that never materializes tokens; instead it records any
/// links it sees into the underlying `LinkExtractor`. Doctype and comment
/// callbacks are no-ops because lychee does not extract links from them.
impl Emitter for &mut LinkExtractor {
    // No token type is needed — `pop_token` always yields `None`.
    type Token = ();

    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_element.clear();
        self.last_start_element
            .extend(last_start_tag.unwrap_or_default());
    }

    fn emit_eof(&mut self) {
        // Flush any trailing character data at end of input.
        self.flush_current_characters();
    }
    fn emit_error(&mut self, _: Error) {}
    fn pop_token(&mut self) -> Option<()> {
        None
    }

    fn emit_string(&mut self, c: &[u8]) {
        // Buffer character data; it is scanned for links on flush.
        self.current_string.extend(c);
    }

    fn init_start_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = false;
    }

    fn init_end_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = true;
    }

    fn init_comment(&mut self) {
        self.flush_current_characters();
    }

    fn emit_current_tag(&mut self) {
        // Flush the last attribute of the tag, which has no following
        // `init_attribute` call to flush it.
        self.flush_old_attribute();
    }

    fn emit_current_doctype(&mut self) {}
    fn set_self_closing(&mut self) {
        self.current_element_is_closing = true;
    }
    fn set_force_quirks(&mut self) {}

    fn push_tag_name(&mut self, s: &[u8]) {
        self.current_element_name.extend(s);
    }

    fn push_comment(&mut self, _: &[u8]) {}
    fn push_doctype_name(&mut self, _: &[u8]) {}
    fn init_doctype(&mut self) {
        self.flush_current_characters();
    }
    fn init_attribute(&mut self) {
        // A new attribute begins: extract links from the previous one.
        self.flush_old_attribute();
    }
    fn push_attribute_name(&mut self, s: &[u8]) {
        self.current_attribute_name.extend(s);
    }
    fn push_attribute_value(&mut self, s: &[u8]) {
        self.current_attribute_value.extend(s);
    }

    fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        // True when tokenizing an end tag whose name matches the most
        // recent start tag (used by the tokenizer for e.g. raw-text elements).
        self.current_element_is_closing
            && !self.current_element_name.is_empty()
            && self.current_element_name == self.last_start_element
    }

    fn emit_current_comment(&mut self) {}
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
    let mut extractor = LinkExtractor::new();
    let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
    // The emitter's `pop_token` always returns `None`, so this single
    // `next()` drives the tokenizer to end of input without producing a
    // token; the assert documents that invariant.
    assert!(tokenizer.next().is_none());
    extractor.links
}
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
use crate::types::{raw_uri::RawUri, FileType, InputContent};
|
||||
|
||||
mod html;
|
||||
mod html5gum;
|
||||
mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
use html::extract_html;
|
||||
use markdown::extract_markdown;
|
||||
use plaintext::extract_plaintext;
|
||||
|
||||
|
|
@ -19,9 +19,28 @@ impl Extractor {
|
|||
/// (Markdown, HTML, and plaintext)
|
||||
#[must_use]
|
||||
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
|
||||
Self::extract_impl(input_content, false)
|
||||
}
|
||||
|
||||
/// Main entrypoint for extracting links from various sources, legacy implementation using
|
||||
/// html5ever
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
#[must_use]
|
||||
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
|
||||
Self::extract_impl(input_content, true)
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
|
||||
match input_content.file_type {
|
||||
FileType::Markdown => extract_markdown(&input_content.content),
|
||||
FileType::Html => extract_html(&input_content.content),
|
||||
FileType::Html => {
|
||||
if use_html5ever {
|
||||
html::extract_html(&input_content.content)
|
||||
} else {
|
||||
html5gum::extract_html(&input_content.content)
|
||||
}
|
||||
}
|
||||
FileType::Plaintext => extract_plaintext(&input_content.content),
|
||||
}
|
||||
}
|
||||
|
|
@ -43,10 +62,19 @@ mod test {
|
|||
|
||||
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
|
||||
let input_content = InputContent::from_string(input, file_type);
|
||||
Extractor::extract(&input_content)
|
||||
|
||||
let uris_html5gum = Extractor::extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
let uris_html5ever = Extractor::extract_html5ever(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect();
|
||||
|
||||
assert_eq!(uris_html5gum, uris_html5ever);
|
||||
uris_html5gum
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -154,19 +182,26 @@ mod test {
|
|||
content: contents.to_string(),
|
||||
};
|
||||
|
||||
let links = Extractor::extract(input_content);
|
||||
let urls = links
|
||||
.into_iter()
|
||||
.map(|raw_uri| raw_uri.text)
|
||||
for use_html5ever in [true, false] {
|
||||
let links = if use_html5ever {
|
||||
Extractor::extract_html5ever(input_content)
|
||||
} else {
|
||||
Extractor::extract(input_content)
|
||||
};
|
||||
|
||||
let urls = links
|
||||
.into_iter()
|
||||
.map(|raw_uri| raw_uri.text)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let expected_urls = IntoIterator::into_iter([
|
||||
String::from("https://github.com/lycheeverse/lychee/"),
|
||||
String::from("/about"),
|
||||
])
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let expected_urls = IntoIterator::into_iter([
|
||||
String::from("https://github.com/lycheeverse/lychee/"),
|
||||
String::from("/about"),
|
||||
])
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
assert_eq!(urls, expected_urls);
|
||||
assert_eq!(urls, expected_urls);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -242,4 +277,16 @@ mod test {
|
|||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_link_at_end_of_line() {
|
||||
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let links = extract_uris(input, FileType::Plaintext);
|
||||
|
||||
let expected_links =
|
||||
IntoIterator::into_iter([website("https://www.apache.org/licenses/LICENSE-2.0")])
|
||||
.collect::<HashSet<Uri>>();
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
use html5ever::tendril::StrTendril;
|
||||
use log::info;
|
||||
use percent_encoding::percent_decode_str;
|
||||
use reqwest::Url;
|
||||
|
|
@ -28,7 +27,7 @@ pub(crate) fn create(
|
|||
.into_iter()
|
||||
.map(|raw_uri| {
|
||||
let is_anchor = raw_uri.is_anchor();
|
||||
let text = StrTendril::from(raw_uri.text.clone());
|
||||
let text = raw_uri.text.clone();
|
||||
let element = raw_uri.element.clone();
|
||||
let attribute = raw_uri.attribute.clone();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue