diff --git a/Cargo.lock b/Cargo.lock
index f3062ba..b9a1f51 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1624,16 +1624,13 @@ dependencies = [
[[package]]
name = "markup5ever"
-version = "0.10.0"
+version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
+checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
dependencies = [
"log",
"phf",
"phf_codegen",
- "serde",
- "serde_derive",
- "serde_json",
"string_cache",
"string_cache_codegen",
"tendril",
diff --git a/Cargo.toml b/Cargo.toml
index 7c228c6..f38c23a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,6 @@ members = [
[patch.crates-io]
# Switch back to version on crates.io after 0.6.3+ is released
hubcaps = { git="https://github.com/softprops/hubcaps.git" }
+
+[profile.release]
+debug = true
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index b5e69d9..871e869 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,4 +1,4 @@
-use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
+use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
@@ -52,8 +52,10 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base = self.base.clone();
- let handle =
- tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
+ let handle = tokio::task::spawn_blocking(move || {
+ let mut extractor = Extractor::new(base);
+ extractor.extract(&input_content)
+ });
extract_links_handles.push(handle);
}
}
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index e5753d8..f8ade53 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -16,137 +16,147 @@ use crate::{
Base, ErrorKind, Input, Request, Result, Uri,
};
-/// Main entrypoint for extracting links from various sources
-/// (Markdown, HTML, and plaintext)
-pub(crate) fn extract_links(
- input_content: &InputContent,
- base: &Option<Base>,
- ) -> Result<HashSet<Request>> {
- let links = match input_content.file_type {
- FileType::Markdown => extract_links_from_markdown(&input_content.content),
- FileType::Html => extract_links_from_html(&input_content.content),
- FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
- };
+/// A handler for extracting links from various input formats like Markdown and
+/// HTML. Allocations are avoided if possible as this is a performance-critical
+/// section of the library.
+#[derive(Debug)]
+pub struct Extractor {
+ /// URLs extracted from input
+ pub urls: Vec<StrTendril>,
+ /// Base URL or Path
+ pub base: Option<Base>,
+}
- // Only keep legit URLs. For example this filters out anchors.
- let mut requests: HashSet<Request> = HashSet::new();
- for link in links {
- let req = if let Ok(uri) = Uri::try_from(&*link) {
- Request::new(uri, input_content.input.clone())
- } else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
- Request::new(Uri { url }, input_content.input.clone())
- } else if let Input::FsPath(root) = &input_content.input {
- if url::is_anchor(&link) {
- // Silently ignore anchor links for now
- continue;
- }
- match create_uri_from_path(root, base, &link)? {
- Some(url) => Request::new(Uri { url }, input_content.input.clone()),
- None => {
- // In case we cannot create a URI from a path but we didn't receive an error,
- // it means that some preconditions were not met, e.g. the `base_url` wasn't set.
+impl Extractor {
+ pub(crate) const fn new(base: Option<Base>) -> Self {
+ Extractor {
+ urls: Vec::new(),
+ base,
+ }
+ }
+
+ /// Main entrypoint for extracting links from various sources
+ /// (Markdown, HTML, and plaintext)
+ pub(crate) fn extract(&mut self, input_content: &InputContent) -> Result<HashSet<Request>> {
+ match input_content.file_type {
+ FileType::Markdown => self.extract_markdown(&input_content.content),
+ FileType::Html => self.extract_html(&input_content.content),
+ FileType::Plaintext => self.extract_plaintext(&input_content.content),
+ };
+ self.create_requests(input_content)
+ }
+
+ /// Create requests out of the collected URLs.
+ /// Only keeps legit URLs. For example this filters out anchors.
+ fn create_requests(&self, input_content: &InputContent) -> Result> {
+ let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
+ for url in &self.urls {
+ let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
+ Request::new(uri, input_content.input.clone())
+ } else if let Some(url) = self.base.as_ref().and_then(|u| u.join(url)) {
+ Request::new(Uri { url }, input_content.input.clone())
+ } else if let Input::FsPath(root) = &input_content.input {
+ if url::is_anchor(url) {
+ // Silently ignore anchor links for now
continue;
}
- }
- } else {
- info!("Handling of {} not implemented yet", &link);
- continue;
- };
- requests.insert(req);
+ match self.create_uri_from_path(root, url)? {
+ Some(url) => Request::new(Uri { url }, input_content.input.clone()),
+ None => {
+ // In case we cannot create a URI from a path but we didn't receive an error,
+ // it means that some preconditions were not met, e.g. the `base_url` wasn't set.
+ continue;
+ }
+ }
+ } else {
+ info!("Handling of {} not implemented yet", &url);
+ continue;
+ };
+ requests.insert(req);
+ }
+ Ok(requests)
}
- Ok(requests)
-}
-/// Extract unparsed URL strings from a Markdown string.
-fn extract_links_from_markdown(input: &str) -> Vec<StrTendril> {
- let parser = Parser::new(input);
- parser
- .flat_map(|event| match event {
- MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
- vec![StrTendril::from(url.as_ref())]
+ /// Extract unparsed URL strings from a Markdown string.
+ fn extract_markdown(&mut self, input: &str) {
+ let parser = Parser::new(input);
+ for event in parser {
+ match event {
+ MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
+ self.urls.push(StrTendril::from(url.as_ref()));
+ }
+ MDEvent::Text(txt) => self.extract_plaintext(&txt),
+ MDEvent::Html(html) => self.extract_html(&html.to_string()),
+ _ => {}
}
- MDEvent::Text(txt) => extract_links_from_plaintext(&txt),
- MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
- _ => vec![],
- })
- .collect()
-}
-
-/// Extract unparsed URL strings from an HTML string.
-fn extract_links_from_html(input: &str) -> Vec<StrTendril> {
- let tendril = StrTendril::from(input);
- let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
-
- let mut urls = Vec::new();
-
- // We pass mutable URL references here to avoid
- // extra allocations in each recursive descent
- walk_html_links(&mut urls, &rc_dom.document);
-
- urls
-}
-
-/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
-fn walk_html_links(mut urls: &mut Vec<StrTendril>, node: &Handle) {
- match node.data {
- NodeData::Text { ref contents } => {
- urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
}
+ }
- NodeData::Comment { ref contents } => {
- urls.append(&mut extract_links_from_plaintext(contents));
- }
+ /// Extract unparsed URL strings from an HTML string.
+ fn extract_html(&mut self, input: &str) {
+ let tendril = StrTendril::from(input);
+ let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
- NodeData::Element {
- ref name,
- ref attrs,
- ..
- } => {
- for attr in attrs.borrow().iter() {
- if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
- urls.push(attr.value.clone());
- } else {
- urls.append(&mut extract_links_from_plaintext(&attr.value));
+ self.walk_html_links(&rc_dom.document);
+ }
+
+ /// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
+ fn walk_html_links(&mut self, node: &Handle) {
+ match node.data {
+ NodeData::Text { ref contents } => {
+ self.extract_plaintext(&contents.borrow());
+ }
+ NodeData::Comment { ref contents } => {
+ self.extract_plaintext(contents);
+ }
+ NodeData::Element {
+ ref name,
+ ref attrs,
+ ..
+ } => {
+ for attr in attrs.borrow().iter() {
+ if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
+ self.urls.push(attr.value.clone());
+ } else {
+ self.extract_plaintext(&attr.value);
+ }
}
}
+ _ => {}
}
- _ => {}
+ // recursively traverse the document's nodes -- this doesn't need any extra
+ // exit conditions, because the document is a tree
+ for child in node.children.borrow().iter() {
+ self.walk_html_links(child);
+ }
}
- // recursively traverse the document's nodes -- this doesn't need any extra
- // exit conditions, because the document is a tree
- for child in node.children.borrow().iter() {
- walk_html_links(&mut urls, child);
+ /// Extract unparsed URL strings from plaintext
+ fn extract_plaintext(&mut self, input: &str) {
+ self.urls
+ .extend(url::find_links(input).map(|l| StrTendril::from(l.as_str())));
}
-}
-/// Extract unparsed URL strings from plaintext
-fn extract_links_from_plaintext(input: &str) -> Vec<StrTendril> {
- url::find_links(input)
- .iter()
- .map(|l| StrTendril::from(l.as_str()))
- .collect()
-}
-
-fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result