mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Refactor extractor (#354)
This avoids sending URLs back and forth between the different parsers. Also, it should allow for future optimizations to reduce allocs.
This commit is contained in:
parent
59a7e3af03
commit
a7f809612d
5 changed files with 155 additions and 137 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
|
@ -1624,16 +1624,13 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
|
||||
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
|
|
|
|||
|
|
@ -8,3 +8,6 @@ members = [
|
|||
[patch.crates-io]
|
||||
# Switch back to version on crates.io after 0.6.3+ is released
|
||||
hubcaps = { git="https://github.com/softprops/hubcaps.git" }
|
||||
|
||||
[profile.release]
|
||||
debug = true
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
|
||||
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
|
|
@ -52,8 +52,10 @@ impl Collector {
|
|||
while let Some(result) = contents_rx.recv().await {
|
||||
for input_content in result? {
|
||||
let base = self.base.clone();
|
||||
let handle =
|
||||
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
|
||||
let handle = tokio::task::spawn_blocking(move || {
|
||||
let mut extractor = Extractor::new(base);
|
||||
extractor.extract(&input_content)
|
||||
});
|
||||
extract_links_handles.push(handle);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,137 +16,147 @@ use crate::{
|
|||
Base, ErrorKind, Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
/// A handler for extracting links from various input formats like Markdown and
|
||||
/// HTML. Allocations are avoided if possible as this is a performance-critical
|
||||
/// section of the library.
|
||||
#[derive(Debug)]
|
||||
pub struct Extractor {
|
||||
/// URLs extracted from input
|
||||
pub urls: Vec<StrTendril>,
|
||||
/// Base URL or Path
|
||||
pub base: Option<Base>,
|
||||
}
|
||||
|
||||
// Only keep legit URLs. For example this filters out anchors.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
let req = if let Ok(uri) = Uri::try_from(&*link) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
|
||||
Request::new(Uri { url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if url::is_anchor(&link) {
|
||||
// Silently ignore anchor links for now
|
||||
continue;
|
||||
}
|
||||
match create_uri_from_path(root, base, &link)? {
|
||||
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
|
||||
None => {
|
||||
// In case we cannot create a URI from a path but we didn't receive an error,
|
||||
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
||||
impl Extractor {
|
||||
pub(crate) const fn new(base: Option<Base>) -> Self {
|
||||
Extractor {
|
||||
urls: Vec::new(),
|
||||
base,
|
||||
}
|
||||
}
|
||||
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
pub(crate) fn extract(&mut self, input_content: &InputContent) -> Result<HashSet<Request>> {
|
||||
match input_content.file_type {
|
||||
FileType::Markdown => self.extract_markdown(&input_content.content),
|
||||
FileType::Html => self.extract_html(&input_content.content),
|
||||
FileType::Plaintext => self.extract_plaintext(&input_content.content),
|
||||
};
|
||||
self.create_requests(input_content)
|
||||
}
|
||||
|
||||
/// Create requests out of the collected URLs.
|
||||
/// Only keeps legit URLs. For example this filters out anchors.
|
||||
fn create_requests(&self, input_content: &InputContent) -> Result<HashSet<Request>> {
|
||||
let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
|
||||
for url in &self.urls {
|
||||
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(url) = self.base.as_ref().and_then(|u| u.join(url)) {
|
||||
Request::new(Uri { url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if url::is_anchor(url) {
|
||||
// Silently ignore anchor links for now
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
match self.create_uri_from_path(root, url)? {
|
||||
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
|
||||
None => {
|
||||
// In case we cannot create a URI from a path but we didn't receive an error,
|
||||
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &url);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
fn extract_links_from_markdown(input: &str) -> Vec<StrTendril> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
|
||||
vec![StrTendril::from(url.as_ref())]
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
fn extract_markdown(&mut self, input: &str) {
|
||||
let parser = Parser::new(input);
|
||||
for event in parser {
|
||||
match event {
|
||||
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
|
||||
self.urls.push(StrTendril::from(url.as_ref()));
|
||||
}
|
||||
MDEvent::Text(txt) => self.extract_plaintext(&txt),
|
||||
MDEvent::Html(html) => self.extract_html(&html.to_string()),
|
||||
_ => {}
|
||||
}
|
||||
MDEvent::Text(txt) => extract_links_from_plaintext(&txt),
|
||||
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
|
||||
_ => vec![],
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
fn extract_links_from_html(input: &str) -> Vec<StrTendril> {
|
||||
let tendril = StrTendril::from(input);
|
||||
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
|
||||
|
||||
let mut urls = Vec::new();
|
||||
|
||||
// We pass mutable URL references here to avoid
|
||||
// extra allocations in each recursive descent
|
||||
walk_html_links(&mut urls, &rc_dom.document);
|
||||
|
||||
urls
|
||||
}
|
||||
|
||||
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
|
||||
fn walk_html_links(mut urls: &mut Vec<StrTendril>, node: &Handle) {
|
||||
match node.data {
|
||||
NodeData::Text { ref contents } => {
|
||||
urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
|
||||
}
|
||||
}
|
||||
|
||||
NodeData::Comment { ref contents } => {
|
||||
urls.append(&mut extract_links_from_plaintext(contents));
|
||||
}
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
fn extract_html(&mut self, input: &str) {
|
||||
let tendril = StrTendril::from(input);
|
||||
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
|
||||
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
..
|
||||
} => {
|
||||
for attr in attrs.borrow().iter() {
|
||||
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
urls.push(attr.value.clone());
|
||||
} else {
|
||||
urls.append(&mut extract_links_from_plaintext(&attr.value));
|
||||
self.walk_html_links(&rc_dom.document);
|
||||
}
|
||||
|
||||
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
|
||||
fn walk_html_links(&mut self, node: &Handle) {
|
||||
match node.data {
|
||||
NodeData::Text { ref contents } => {
|
||||
self.extract_plaintext(&contents.borrow());
|
||||
}
|
||||
NodeData::Comment { ref contents } => {
|
||||
self.extract_plaintext(contents);
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
..
|
||||
} => {
|
||||
for attr in attrs.borrow().iter() {
|
||||
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
self.urls.push(attr.value.clone());
|
||||
} else {
|
||||
self.extract_plaintext(&attr.value);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
_ => {}
|
||||
// recursively traverse the document's nodes -- this doesn't need any extra
|
||||
// exit conditions, because the document is a tree
|
||||
for child in node.children.borrow().iter() {
|
||||
self.walk_html_links(child);
|
||||
}
|
||||
}
|
||||
|
||||
// recursively traverse the document's nodes -- this doesn't need any extra
|
||||
// exit conditions, because the document is a tree
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_html_links(&mut urls, child);
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
fn extract_plaintext(&mut self, input: &str) {
|
||||
self.urls
|
||||
.extend(url::find_links(input).map(|l| StrTendril::from(l.as_str())));
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
fn extract_links_from_plaintext(input: &str) -> Vec<StrTendril> {
|
||||
url::find_links(input)
|
||||
.iter()
|
||||
.map(|l| StrTendril::from(l.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Option<Url>> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` is left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?;
|
||||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrl(path)),
|
||||
None => Ok(None),
|
||||
fn create_uri_from_path(&self, src: &Path, dst: &str) -> Result<Option<Url>> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` is left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?;
|
||||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), &self.base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrl(path)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -175,8 +185,10 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn test_create_uri_from_path() {
|
||||
let result =
|
||||
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
|
||||
let extractor = Extractor::new(None);
|
||||
let result = extractor
|
||||
.create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding")
|
||||
.unwrap();
|
||||
assert_eq!(result.unwrap().as_str(), "file:///test+encoding");
|
||||
}
|
||||
|
||||
|
|
@ -200,7 +212,9 @@ mod test {
|
|||
|
||||
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
|
||||
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
|
||||
extract_links(&InputContent::from_string(input, file_type), &base)
|
||||
let mut extractor = Extractor::new(base);
|
||||
extractor
|
||||
.extract(&InputContent::from_string(input, file_type))
|
||||
// unwrap is fine here as this helper function is only used in tests
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
|
|
@ -228,15 +242,17 @@ mod test {
|
|||
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let link = input.trim_end();
|
||||
|
||||
assert_eq!(
|
||||
vec![StrTendril::from(link)],
|
||||
extract_links_from_markdown(input)
|
||||
);
|
||||
assert_eq!(
|
||||
vec![StrTendril::from(link)],
|
||||
extract_links_from_plaintext(input)
|
||||
);
|
||||
assert_eq!(vec![StrTendril::from(link)], extract_links_from_html(input));
|
||||
let mut extractor = Extractor::new(None);
|
||||
extractor.extract_markdown(input);
|
||||
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
|
||||
|
||||
let mut extractor = Extractor::new(None);
|
||||
extractor.extract_plaintext(input);
|
||||
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
|
||||
|
||||
let mut extractor = Extractor::new(None);
|
||||
extractor.extract_html(input);
|
||||
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -341,7 +357,7 @@ mod test {
|
|||
#[test]
|
||||
fn test_md_escape() {
|
||||
let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
|
||||
let links = find_links(input);
|
||||
let links: Vec<_> = find_links(input).collect();
|
||||
let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
|
||||
|
||||
matches!(&links[..], [link] if link.as_str() == expected);
|
||||
|
|
|
|||
|
|
@ -34,8 +34,8 @@ pub(crate) fn is_anchor(url: &str) -> bool {
|
|||
}
|
||||
|
||||
// Use `LinkFinder` to offload the raw link searching in plaintext
|
||||
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
LINK_FINDER.links(input).collect()
|
||||
pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link> {
|
||||
LINK_FINDER.links(input)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
Loading…
Reference in a new issue