feat: Support relative URLs (#15)

This commit is contained in:
WhizSid 2020-10-21 05:01:06 +05:30 committed by GitHub
parent 6663f23707
commit 6bd7bbf51f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 241 additions and 21 deletions

32
Cargo.lock generated
View file

@ -1042,6 +1042,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.1.14"
@ -1561,6 +1570,8 @@ dependencies = [
"log",
"predicates",
"pretty_env_logger",
"pulldown-cmark",
"quick-xml",
"regex",
"reqwest",
"tokio",
@ -2066,12 +2077,33 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "pulldown-cmark"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8"
dependencies = [
"bitflags",
"getopts",
"memchr",
"unicase",
]
[[package]]
name = "quick-error"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quick-xml"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.7"

View file

@ -23,6 +23,8 @@ regex = "1.3.9"
url = "2.1.1"
check-if-email-exists = "0.8.13"
indicatif = "0.15.0"
pulldown-cmark = "0.8.0"
quick-xml = "0.20.0"
[dependencies.reqwest]
features = ["gzip"]

View file

@ -10,6 +10,7 @@
>
</li>
<li><a href="https://hello-rust.show/10/">Hello Rust</a></li>
<li><a href="20/">Hello Rust</a></li>
</ul>
</body>
</html>

View file

@ -9,4 +9,4 @@ Test file: "private" URLs (should all be excluded when using `-E` flag).
IPv6:
- Loopback: http://[::1]
- Loopback: [IPV6 Address](http://[::1])

View file

@ -1,27 +1,58 @@
use crate::extract::{self, extract_links};
use crate::extract::{self, extract_links, FileType};
use anyhow::Result;
use extract::Uri;
use glob::glob;
use reqwest::Url;
use std::path::Path;
use std::{collections::HashSet, fs};
pub(crate) async fn collect_links(inputs: Vec<String>) -> Result<HashSet<Uri>> {
/// Detect the content type of an input file from its extension.
///
/// Returns `FileType::Markdown` for `.md`, `FileType::HTML` for
/// `.html`/`.htm`, and falls back to `FileType::Plaintext` for any
/// other extension, a missing extension, or an extension that is not
/// valid UTF-8. The previous `ext.to_str().unwrap()` would panic on
/// non-UTF-8 extensions; `and_then(OsStr::to_str)` folds that case
/// into the plaintext fallback instead.
fn resolve_file_type_by_path<P: AsRef<Path>>(p: P) -> FileType {
    match p.as_ref().extension().and_then(std::ffi::OsStr::to_str) {
        Some("md") => FileType::Markdown,
        Some("html") | Some("htm") => FileType::HTML,
        _ => FileType::Plaintext,
    }
}
pub(crate) async fn collect_links(
inputs: Vec<String>,
base_url: Option<String>,
) -> Result<HashSet<Uri>> {
let base_url = match base_url {
Some(url) => Some(Url::parse(&url)?),
_ => None,
};
let mut links = HashSet::new();
for input in inputs {
match Url::parse(&input) {
Ok(url) => {
let path = String::from(url.path());
let res = reqwest::get(url).await?;
let content = res.text().await?;
links.extend(extract_links(&content));
links.extend(extract_links(
resolve_file_type_by_path(path),
&content,
base_url.clone(),
));
}
Err(_) => {
// Assume we got a single file or a glob on our hands
for entry in glob(&input)? {
match entry {
Ok(path) => {
let content = fs::read_to_string(path)?;
links.extend(extract_links(&content));
let content = fs::read_to_string(&path)?;
links.extend(extract_links(
resolve_file_type_by_path(&path),
&content,
base_url.clone(),
));
}
Err(e) => println!("{:?}", e),
}

View file

@ -1,6 +1,8 @@
use linkify::LinkFinder;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use quick_xml::{events::Event as HTMLEvent, Reader};
use std::net::IpAddr;
use std::path::Path;
use std::{collections::HashSet, fmt::Display};
use url::Url;
@ -10,6 +12,13 @@ pub(crate) enum Uri {
Mail(String),
}
/// Kind of input document, used to pick the right link-extraction
/// strategy in `extract_links`.
// NOTE(review): variant `HTML` is not UpperCamelCase (`Html`), but
// renaming it would break existing callers, so it is left as-is.
#[derive(Clone, Debug)]
pub(crate) enum FileType {
    /// HTML/XHTML documents; parsed with quick-xml.
    HTML,
    /// Markdown documents; parsed with pulldown-cmark.
    Markdown,
    /// Anything else; scanned for bare URLs with linkify.
    Plaintext,
}
impl Uri {
pub fn as_str(&self) -> &str {
match self {
@ -49,17 +58,125 @@ fn find_links(input: &str) -> Vec<linkify::Link> {
finder.links(input).collect()
}
pub(crate) fn extract_links(input: &str) -> HashSet<Uri> {
let links = find_links(input);
// Extracting unparsed URL strings from a markdown string
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(tag) => match tag {
Tag::Link(_, url, _) | Tag::Image(_, url, _) => vec![url.to_string()],
_ => vec![],
},
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
})
.collect()
}
// Extracting unparsed URL strings from a HTML string.
//
// Walks the document with quick-xml's streaming reader and collects
// candidate URLs from two places:
//  * well-known URL-carrying (attribute, tag) pairs (href/src/srcset/
//    cite/data on their matching elements), taken verbatim;
//  * every other attribute value plus text and comment nodes, which are
//    scanned with the linkify-based plaintext scanner.
fn extract_links_from_html(input: &str) -> Vec<String> {
    let mut reader = Reader::from_str(input);
    let mut buf = Vec::new();
    let mut urls = Vec::new();
    // One event per iteration; the loop ends on Eof. A parse error also
    // ends the loop silently, since Err falls out of the `while let`.
    while let Ok(e) = reader.read_event(&mut buf) {
        match e {
            HTMLEvent::Start(ref e) => {
                for attr in e.attributes() {
                    // Malformed attributes are skipped silently.
                    if let Ok(attr) = attr {
                        // Match (attribute name, element name) pairs whose
                        // value is defined by HTML to be a URL.
                        match (attr.key, e.name()) {
                            (b"href", b"a")
                            | (b"href", b"area")
                            | (b"href", b"base")
                            | (b"href", b"link")
                            | (b"src", b"audio")
                            | (b"src", b"embed")
                            | (b"src", b"iframe")
                            | (b"src", b"img")
                            | (b"src", b"input")
                            | (b"src", b"script")
                            | (b"src", b"source")
                            | (b"src", b"track")
                            | (b"src", b"video")
                            | (b"srcset", b"img")
                            | (b"srcset", b"source")
                            | (b"cite", b"blockquote")
                            | (b"cite", b"del")
                            | (b"cite", b"ins")
                            | (b"cite", b"q")
                            | (b"data", b"object")
                            // NOTE(review): `onhashchange` holds script code,
                            // not a URL — confirm this arm is intentional.
                            | (b"onhashchange", b"body") => {
                                urls.push(String::from_utf8_lossy(attr.value.as_ref()).to_string());
                            }
                            _ => {
                                // Any other attribute may still embed URLs in
                                // free text; scan it like plaintext.
                                for link in extract_links_from_plaintext(
                                    &String::from_utf8_lossy(attr.value.as_ref()).to_string(),
                                ) {
                                    urls.push(link);
                                }
                            }
                        }
                    }
                }
            }
            HTMLEvent::Text(txt) | HTMLEvent::Comment(txt) => {
                // Visible text and comments can contain bare URLs too.
                for link in extract_links_from_plaintext(
                    &String::from_utf8_lossy(txt.escaped()).to_string(),
                ) {
                    urls.push(link);
                }
            }
            HTMLEvent::Eof => {
                break;
            }
            _ => {}
        }
        // quick-xml requires the caller to clear the event buffer so it
        // does not grow without bound.
        buf.clear();
    }
    urls
}
// Extract unparsed URL strings from plain text by running the
// linkify-based finder and collecting each match as an owned String.
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
    let mut urls = Vec::new();
    for link in find_links(input) {
        urls.push(link.as_str().to_owned());
    }
    urls
}
pub(crate) fn extract_links(
file_type: FileType,
input: &str,
base_url: Option<Url>,
) -> HashSet<Uri> {
let links = match file_type {
FileType::Markdown => extract_links_from_markdown(input),
FileType::HTML => extract_links_from_html(input),
FileType::Plaintext => extract_links_from_plaintext(input),
};
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
let mut uris = HashSet::new();
for link in links {
match Url::parse(link.as_str()) {
Ok(url) => uris.insert(Uri::Website(url)),
Err(_) => uris.insert(Uri::Mail(link.as_str().to_owned())),
match Url::parse(&link) {
Ok(url) => {
uris.insert(Uri::Website(url));
}
Err(_) => {
if link.contains('@') {
uris.insert(Uri::Mail(link));
} else if !Path::new(&link).exists() {
if let Some(base_url) = &base_url {
if let Ok(new_url) = base_url.clone().join(&link) {
uris.insert(Uri::Website(new_url));
}
}
}
}
};
}
debug!("Found: {:#?}", uris);
uris
}
@ -72,29 +189,63 @@ mod test {
#[test]
// Checks both an absolute markdown link and a relative one resolved
// against the supplied base URL. (Stale pre-refactor lines that the
// diff left mashed into this test are removed.)
fn test_extract_markdown_links() {
    let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
    let links = extract_links(
        FileType::Markdown,
        input,
        Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()),
    );
    assert_eq!(
        links,
        HashSet::from_iter(
            [
                Uri::Website(Url::parse("https://endler.dev").unwrap()),
                Uri::Website(
                    Url::parse("https://github.com/hello-rust/lychee/relative_link").unwrap()
                )
            ]
            .iter()
            .cloned()
        )
    )
}
#[test]
// Checks that a relative `href` in HTML is resolved against the base URL.
fn test_extract_html_links() {
    let input = r#"<html>
        <div class="row">
            <a href="https://github.com/hello-rust/lychee/">
            <a href="blob/master/README.md">README</a>
        </div>
    </html>"#;
    let links = extract_links(
        FileType::HTML,
        input,
        Some(Url::parse("https://github.com/hello-rust/").unwrap()),
    );
    // `assert!(set.contains(..))` instead of the former
    // `assert_eq!(set.get(..).is_some(), true)` anti-idiom.
    assert!(links.contains(&Uri::Website(
        Url::parse("https://github.com/hello-rust/blob/master/README.md").unwrap()
    )));
}
#[test]
// Anchors (`#lol`) are neither URLs nor mail addresses and there is no
// base URL here, so nothing should be extracted. (The stale pre-refactor
// `extract_links(input)` call left behind by the diff is removed.)
fn test_skip_markdown_anchors() {
    let input = "This is [a test](#lol).";
    let links = extract_links(FileType::Markdown, input, None);
    assert_eq!(links, HashSet::new())
}
#[test]
// Relative paths are skipped when no base URL is given. (The stale
// pre-refactor `extract_links(input)` call left behind by the diff is
// removed.)
fn test_skip_markdown_internal_urls() {
    let input = "This is [a test](./internal).";
    let links = extract_links(FileType::Markdown, input, None);
    assert_eq!(links, HashSet::new())
}
@ -102,7 +253,7 @@ mod test {
fn test_non_markdown_links() {
let input =
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
let links = extract_links(input);
let links = extract_links(FileType::Plaintext, input, None);
let expected = HashSet::from_iter(
[
Uri::Website(Url::parse("https://endler.dev").unwrap()),

View file

@ -66,7 +66,7 @@ async fn run(opts: LycheeOptions) -> Result<i32> {
None => None,
};
let timeout = parse_timeout(opts.timeout)?;
let links = collector::collect_links(opts.inputs).await?;
let links = collector::collect_links(opts.inputs, opts.base_url).await?;
let progress_bar = if opts.progress {
Some(
ProgressBar::new(links.len() as u64)

View file

@ -68,4 +68,7 @@ pub(crate) struct LycheeOptions {
#[options(help = "Request method", default = "get")]
pub method: String,
#[options(help = "Base URL to check relative URls")]
pub base_url: Option<String>,
}