Refactor extractor (#354)

This avoids sending URLs back and forth between the different parsers.
Also, it should allow for future optimizations to reduce allocs.
This commit is contained in:
Matthias 2021-10-07 12:51:02 +02:00 committed by GitHub
parent 59a7e3af03
commit a7f809612d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 155 additions and 137 deletions

7
Cargo.lock generated
View file

@@ -1624,16 +1624,13 @@ dependencies = [
[[package]]
name = "markup5ever"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
dependencies = [
"log",
"phf",
"phf_codegen",
"serde",
"serde_derive",
"serde_json",
"string_cache",
"string_cache_codegen",
"tendril",

View file

@@ -8,3 +8,6 @@ members = [
[patch.crates-io]
# Switch back to version on crates.io after 0.6.3+ is released
hubcaps = { git="https://github.com/softprops/hubcaps.git" }
[profile.release]
debug = true

View file

@@ -1,4 +1,4 @@
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
@@ -52,8 +52,10 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base = self.base.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
let handle = tokio::task::spawn_blocking(move || {
let mut extractor = Extractor::new(base);
extractor.extract(&input_content)
});
extract_links_handles.push(handle);
}
}

View file

@@ -16,137 +16,147 @@ use crate::{
Base, ErrorKind, Input, Request, Result, Uri,
};
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations are avoided if possible as this is a performance-critical
/// section of the library.
#[derive(Debug)]
pub struct Extractor {
/// URLs extracted from input
pub urls: Vec<StrTendril>,
/// Base URL or Path
pub base: Option<Base>,
}
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(&*link) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
match create_uri_from_path(root, base, &link)? {
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
None => {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
impl Extractor {
pub(crate) const fn new(base: Option<Base>) -> Self {
Extractor {
urls: Vec::new(),
base,
}
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract(&mut self, input_content: &InputContent) -> Result<HashSet<Request>> {
match input_content.file_type {
FileType::Markdown => self.extract_markdown(&input_content.content),
FileType::Html => self.extract_html(&input_content.content),
FileType::Plaintext => self.extract_plaintext(&input_content.content),
};
self.create_requests(input_content)
}
/// Create requests out of the collected URLs.
/// Only keeps legit URLs. For example this filters out anchors.
fn create_requests(&self, input_content: &InputContent) -> Result<HashSet<Request>> {
let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
for url in &self.urls {
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = self.base.as_ref().and_then(|u| u.join(url)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(url) {
// Silently ignore anchor links for now
continue;
}
}
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
match self.create_uri_from_path(root, url)? {
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
None => {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
continue;
}
}
} else {
info!("Handling of {} not implemented yet", &url);
continue;
};
requests.insert(req);
}
Ok(requests)
}
Ok(requests)
}
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<StrTendril> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
vec![StrTendril::from(url.as_ref())]
/// Extract unparsed URL strings from a Markdown string.
fn extract_markdown(&mut self, input: &str) {
let parser = Parser::new(input);
for event in parser {
match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
self.urls.push(StrTendril::from(url.as_ref()));
}
MDEvent::Text(txt) => self.extract_plaintext(&txt),
MDEvent::Html(html) => self.extract_html(&html.to_string()),
_ => {}
}
MDEvent::Text(txt) => extract_links_from_plaintext(&txt),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
})
.collect()
}
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<StrTendril> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
}
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
fn walk_html_links(mut urls: &mut Vec<StrTendril>, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
}
}
NodeData::Comment { ref contents } => {
urls.append(&mut extract_links_from_plaintext(contents));
}
/// Extract unparsed URL strings from an HTML string.
fn extract_html(&mut self, input: &str) {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
NodeData::Element {
ref name,
ref attrs,
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr.value.clone());
} else {
urls.append(&mut extract_links_from_plaintext(&attr.value));
self.walk_html_links(&rc_dom.document);
}
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
fn walk_html_links(&mut self, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
self.extract_plaintext(&contents.borrow());
}
NodeData::Comment { ref contents } => {
self.extract_plaintext(contents);
}
NodeData::Element {
ref name,
ref attrs,
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
self.urls.push(attr.value.clone());
} else {
self.extract_plaintext(&attr.value);
}
}
}
_ => {}
}
_ => {}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
self.walk_html_links(child);
}
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
/// Extract unparsed URL strings from plaintext
fn extract_plaintext(&mut self, input: &str) {
self.urls
.extend(url::find_links(input).map(|l| StrTendril::from(l.as_str())));
}
}
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<StrTendril> {
url::find_links(input)
.iter()
.map(|l| StrTendril::from(l.as_str()))
.collect()
}
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?;
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
None => Ok(None),
fn create_uri_from_path(&self, src: &Path, dst: &str) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?;
let resolved = path::resolve(src, &PathBuf::from(&*decoded), &self.base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
None => Ok(None),
}
}
}
@@ -175,8 +185,10 @@ mod test {
#[test]
fn test_create_uri_from_path() {
let result =
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
let extractor = Extractor::new(None);
let result = extractor
.create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding")
.unwrap();
assert_eq!(result.unwrap().as_str(), "file:///test+encoding");
}
@@ -200,7 +212,9 @@ mod test {
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
extract_links(&InputContent::from_string(input, file_type), &base)
let mut extractor = Extractor::new(base);
extractor
.extract(&InputContent::from_string(input, file_type))
// unwrap is fine here as this helper function is only used in tests
.unwrap()
.into_iter()
@@ -228,15 +242,17 @@ mod test {
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
assert_eq!(
vec![StrTendril::from(link)],
extract_links_from_markdown(input)
);
assert_eq!(
vec![StrTendril::from(link)],
extract_links_from_plaintext(input)
);
assert_eq!(vec![StrTendril::from(link)], extract_links_from_html(input));
let mut extractor = Extractor::new(None);
extractor.extract_markdown(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
let mut extractor = Extractor::new(None);
extractor.extract_plaintext(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
let mut extractor = Extractor::new(None);
extractor.extract_html(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
}
#[test]
@@ -341,7 +357,7 @@ mod test {
#[test]
fn test_md_escape() {
let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
let links = find_links(input);
let links: Vec<_> = find_links(input).collect();
let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
matches!(&links[..], [link] if link.as_str() == expected);

View file

@@ -34,8 +34,8 @@ pub(crate) fn is_anchor(url: &str) -> bool {
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
LINK_FINDER.links(input).collect()
pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link> {
LINK_FINDER.links(input)
}
#[cfg(test)]