Refactor extractor (#354)

This avoids sending URLs back and forth between the different parsers.
Also, it should allow for future optimizations to reduce allocs.
This commit is contained in:
Matthias 2021-10-07 12:51:02 +02:00 committed by GitHub
parent 59a7e3af03
commit a7f809612d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 155 additions and 137 deletions

7
Cargo.lock generated
View file

@@ -1624,16 +1624,13 @@ dependencies = [
[[package]]
name = "markup5ever"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
dependencies = [
"log",
"phf",
"phf_codegen",
"serde",
"serde_derive",
"serde_json",
"string_cache",
"string_cache_codegen",
"tendril",

View file

@@ -8,3 +8,6 @@ members = [
[patch.crates-io]
# Switch back to version on crates.io after 0.6.3+ is released
hubcaps = { git="https://github.com/softprops/hubcaps.git" }
[profile.release]
debug = true

View file

@@ -1,4 +1,4 @@
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
@@ -52,8 +52,10 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base = self.base.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
let handle = tokio::task::spawn_blocking(move || {
let mut extractor = Extractor::new(base);
extractor.extract(&input_content)
});
extract_links_handles.push(handle);
}
}

View file

@@ -16,137 +16,147 @@ use crate::{
Base, ErrorKind, Input, Request, Result, Uri,
};
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations are avoided if possible as this is a performance-critical
/// section of the library.
#[derive(Debug)]
pub struct Extractor {
/// URLs extracted from input
pub urls: Vec<StrTendril>,
/// Base URL or Path
pub base: Option<Base>,
}
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(&*link) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
match create_uri_from_path(root, base, &link)? {
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
None => {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
impl Extractor {
pub(crate) const fn new(base: Option<Base>) -> Self {
Extractor {
urls: Vec::new(),
base,
}
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract(&mut self, input_content: &InputContent) -> Result<HashSet<Request>> {
match input_content.file_type {
FileType::Markdown => self.extract_markdown(&input_content.content),
FileType::Html => self.extract_html(&input_content.content),
FileType::Plaintext => self.extract_plaintext(&input_content.content),
};
self.create_requests(input_content)
}
/// Create requests out of the collected URLs.
/// Only keeps legit URLs. For example this filters out anchors.
fn create_requests(&self, input_content: &InputContent) -> Result<HashSet<Request>> {
let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
for url in &self.urls {
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = self.base.as_ref().and_then(|u| u.join(url)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(url) {
// Silently ignore anchor links for now
continue;
}
}
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
match self.create_uri_from_path(root, url)? {
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
None => {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
continue;
}
}
} else {
info!("Handling of {} not implemented yet", &url);
continue;
};
requests.insert(req);
}
Ok(requests)
}
Ok(requests)
}
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<StrTendril> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
vec![StrTendril::from(url.as_ref())]
/// Extract unparsed URL strings from a Markdown string.
fn extract_markdown(&mut self, input: &str) {
let parser = Parser::new(input);
for event in parser {
match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
self.urls.push(StrTendril::from(url.as_ref()));
}
MDEvent::Text(txt) => self.extract_plaintext(&txt),
MDEvent::Html(html) => self.extract_html(&html.to_string()),
_ => {}
}
MDEvent::Text(txt) => extract_links_from_plaintext(&txt),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
})
.collect()
}
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<StrTendril> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
}
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
fn walk_html_links(mut urls: &mut Vec<StrTendril>, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
}
}
NodeData::Comment { ref contents } => {
urls.append(&mut extract_links_from_plaintext(contents));
}
/// Extract unparsed URL strings from an HTML string.
fn extract_html(&mut self, input: &str) {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
NodeData::Element {
ref name,
ref attrs,
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr.value.clone());
} else {
urls.append(&mut extract_links_from_plaintext(&attr.value));
self.walk_html_links(&rc_dom.document);
}
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
fn walk_html_links(&mut self, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
self.extract_plaintext(&contents.borrow());
}
NodeData::Comment { ref contents } => {
self.extract_plaintext(contents);
}
NodeData::Element {
ref name,
ref attrs,
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
self.urls.push(attr.value.clone());
} else {
self.extract_plaintext(&attr.value);
}
}
}
_ => {}
}
_ => {}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
self.walk_html_links(child);
}
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
/// Extract unparsed URL strings from plaintext
fn extract_plaintext(&mut self, input: &str) {
self.urls
.extend(url::find_links(input).map(|l| StrTendril::from(l.as_str())));
}
}
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<StrTendril> {
url::find_links(input)
.iter()
.map(|l| StrTendril::from(l.as_str()))
.collect()
}
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?;
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
None => Ok(None),
fn create_uri_from_path(&self, src: &Path, dst: &str) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?;
let resolved = path::resolve(src, &PathBuf::from(&*decoded), &self.base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
None => Ok(None),
}
}
}
@@ -175,8 +185,10 @@ mod test {
#[test]
fn test_create_uri_from_path() {
let result =
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
let extractor = Extractor::new(None);
let result = extractor
.create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding")
.unwrap();
assert_eq!(result.unwrap().as_str(), "file:///test+encoding");
}
@@ -200,7 +212,9 @@ mod test {
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
extract_links(&InputContent::from_string(input, file_type), &base)
let mut extractor = Extractor::new(base);
extractor
.extract(&InputContent::from_string(input, file_type))
// unwrap is fine here as this helper function is only used in tests
.unwrap()
.into_iter()
@@ -228,15 +242,17 @@ mod test {
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
assert_eq!(
vec![StrTendril::from(link)],
extract_links_from_markdown(input)
);
assert_eq!(
vec![StrTendril::from(link)],
extract_links_from_plaintext(input)
);
assert_eq!(vec![StrTendril::from(link)], extract_links_from_html(input));
let mut extractor = Extractor::new(None);
extractor.extract_markdown(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
let mut extractor = Extractor::new(None);
extractor.extract_plaintext(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
let mut extractor = Extractor::new(None);
extractor.extract_html(input);
assert_eq!(vec![StrTendril::from(link)], extractor.urls);
}
#[test]
@@ -341,7 +357,7 @@ mod test {
#[test]
fn test_md_escape() {
let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
let links = find_links(input);
let links: Vec<_> = find_links(input).collect();
let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
matches!(&links[..], [link] if link.as_str() == expected);

View file

@@ -34,8 +34,8 @@ pub(crate) fn is_anchor(url: &str) -> bool {
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
LINK_FINDER.links(input).collect()
pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link> {
LINK_FINDER.links(input)
}
#[cfg(test)]