feat: Support relative URLs (#15)

This commit is contained in:
WhizSid 2020-10-21 05:01:06 +05:30 committed by GitHub
parent 6663f23707
commit 6bd7bbf51f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 241 additions and 21 deletions

32
Cargo.lock generated
View file

@ -1042,6 +1042,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.1.14"
@ -1561,6 +1570,8 @@ dependencies = [
"log",
"predicates",
"pretty_env_logger",
"pulldown-cmark",
"quick-xml",
"regex",
"reqwest",
"tokio",
@ -2066,12 +2077,33 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "pulldown-cmark"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8"
dependencies = [
"bitflags",
"getopts",
"memchr",
"unicase",
]
[[package]]
name = "quick-error"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quick-xml"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.7"

View file

@ -23,6 +23,8 @@ regex = "1.3.9"
url = "2.1.1"
check-if-email-exists = "0.8.13"
indicatif = "0.15.0"
pulldown-cmark = "0.8.0"
quick-xml = "0.20.0"
[dependencies.reqwest]
features = ["gzip"]

View file

@ -10,6 +10,7 @@
>
</li>
<li><a href="https://hello-rust.show/10/">Hello Rust</a></li>
<li><a href="20/">Hello Rust</a></li>
</ul>
</body>
</html>

View file

@ -9,4 +9,4 @@ Test file: "private" URLs (should all be excluded when using `-E` flag).
IPv6:
- Loopback: http://[::1]
- Loopback: [IPV6 Address](http://[::1])

View file

@ -1,27 +1,58 @@
use crate::extract::{self, extract_links};
use crate::extract::{self, extract_links, FileType};
use anyhow::Result;
use extract::Uri;
use glob::glob;
use reqwest::Url;
use std::path::Path;
use std::{collections::HashSet, fs};
pub(crate) async fn collect_links(inputs: Vec<String>) -> Result<HashSet<Uri>> {
/// Detect the content type of an input file from its extension.
///
/// Returns `FileType::Markdown` for `.md`, `FileType::HTML` for
/// `.html`/`.htm`, and falls back to `FileType::Plaintext` for any
/// other extension, a missing extension, or an extension that is not
/// valid UTF-8. The previous `ext.to_str().unwrap()` would panic on
/// non-UTF-8 extensions; `and_then(OsStr::to_str)` folds that case
/// into the plaintext fallback instead.
fn resolve_file_type_by_path<P: AsRef<Path>>(p: P) -> FileType {
    match p.as_ref().extension().and_then(std::ffi::OsStr::to_str) {
        Some("md") => FileType::Markdown,
        Some("html") | Some("htm") => FileType::HTML,
        _ => FileType::Plaintext,
    }
}
pub(crate) async fn collect_links(
inputs: Vec<String>,
base_url: Option<String>,
) -> Result<HashSet<Uri>> {
let base_url = match base_url {
Some(url) => Some(Url::parse(&url)?),
_ => None,
};
let mut links = HashSet::new();
for input in inputs {
match Url::parse(&input) {
Ok(url) => {
let path = String::from(url.path());
let res = reqwest::get(url).await?;
let content = res.text().await?;
links.extend(extract_links(&content));
links.extend(extract_links(
resolve_file_type_by_path(path),
&content,
base_url.clone(),
));
}
Err(_) => {
// Assume we got a single file or a glob on our hands
for entry in glob(&input)? {
match entry {
Ok(path) => {
let content = fs::read_to_string(path)?;
links.extend(extract_links(&content));
let content = fs::read_to_string(&path)?;
links.extend(extract_links(
resolve_file_type_by_path(&path),
&content,
base_url.clone(),
));
}
Err(e) => println!("{:?}", e),
}

View file

@ -1,6 +1,8 @@
use linkify::LinkFinder;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use quick_xml::{events::Event as HTMLEvent, Reader};
use std::net::IpAddr;
use std::path::Path;
use std::{collections::HashSet, fmt::Display};
use url::Url;
@ -10,6 +12,13 @@ pub(crate) enum Uri {
Mail(String),
}
/// Kind of input document, used to pick the right link-extraction
/// strategy in `extract_links`.
// NOTE(review): variant `HTML` is not UpperCamelCase (`Html`), but
// renaming it would break existing callers, so it is left as-is.
#[derive(Clone, Debug)]
pub(crate) enum FileType {
    /// HTML/XHTML documents; parsed with quick-xml.
    HTML,
    /// Markdown documents; parsed with pulldown-cmark.
    Markdown,
    /// Anything else; scanned for bare URLs with linkify.
    Plaintext,
}
impl Uri {
pub fn as_str(&self) -> &str {
match self {
@ -49,17 +58,125 @@ fn find_links(input: &str) -> Vec<linkify::Link> {
finder.links(input).collect()
}
pub(crate) fn extract_links(input: &str) -> HashSet<Uri> {
let links = find_links(input);
// Extracting unparsed URL strings from a markdown string
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(tag) => match tag {
Tag::Link(_, url, _) | Tag::Image(_, url, _) => vec![url.to_string()],
_ => vec![],
},
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
})
.collect()
}
// Extracting unparsed URL strings from a HTML string.
//
// Walks the document with quick-xml's streaming reader and collects
// candidate URLs from two places:
//  * well-known URL-carrying (attribute, tag) pairs (href/src/srcset/
//    cite/data on their matching elements), taken verbatim;
//  * every other attribute value plus text and comment nodes, which are
//    scanned with the linkify-based plaintext scanner.
fn extract_links_from_html(input: &str) -> Vec<String> {
    let mut reader = Reader::from_str(input);
    let mut buf = Vec::new();
    let mut urls = Vec::new();
    // One event per iteration; the loop ends on Eof. A parse error also
    // ends the loop silently, since Err falls out of the `while let`.
    while let Ok(e) = reader.read_event(&mut buf) {
        match e {
            HTMLEvent::Start(ref e) => {
                for attr in e.attributes() {
                    // Malformed attributes are skipped silently.
                    if let Ok(attr) = attr {
                        // Match (attribute name, element name) pairs whose
                        // value is defined by HTML to be a URL.
                        match (attr.key, e.name()) {
                            (b"href", b"a")
                            | (b"href", b"area")
                            | (b"href", b"base")
                            | (b"href", b"link")
                            | (b"src", b"audio")
                            | (b"src", b"embed")
                            | (b"src", b"iframe")
                            | (b"src", b"img")
                            | (b"src", b"input")
                            | (b"src", b"script")
                            | (b"src", b"source")
                            | (b"src", b"track")
                            | (b"src", b"video")
                            | (b"srcset", b"img")
                            | (b"srcset", b"source")
                            | (b"cite", b"blockquote")
                            | (b"cite", b"del")
                            | (b"cite", b"ins")
                            | (b"cite", b"q")
                            | (b"data", b"object")
                            // NOTE(review): `onhashchange` holds script code,
                            // not a URL — confirm this arm is intentional.
                            | (b"onhashchange", b"body") => {
                                urls.push(String::from_utf8_lossy(attr.value.as_ref()).to_string());
                            }
                            _ => {
                                // Any other attribute may still embed URLs in
                                // free text; scan it like plaintext.
                                for link in extract_links_from_plaintext(
                                    &String::from_utf8_lossy(attr.value.as_ref()).to_string(),
                                ) {
                                    urls.push(link);
                                }
                            }
                        }
                    }
                }
            }
            HTMLEvent::Text(txt) | HTMLEvent::Comment(txt) => {
                // Visible text and comments can contain bare URLs too.
                for link in extract_links_from_plaintext(
                    &String::from_utf8_lossy(txt.escaped()).to_string(),
                ) {
                    urls.push(link);
                }
            }
            HTMLEvent::Eof => {
                break;
            }
            _ => {}
        }
        // quick-xml requires the caller to clear the event buffer so it
        // does not grow without bound.
        buf.clear();
    }
    urls
}
// Extract unparsed URL strings from plain text by running the
// linkify-based finder and collecting each match as an owned String.
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
    let mut urls = Vec::new();
    for link in find_links(input) {
        urls.push(link.as_str().to_owned());
    }
    urls
}
pub(crate) fn extract_links(
file_type: FileType,
input: &str,
base_url: Option<Url>,
) -> HashSet<Uri> {
let links = match file_type {
FileType::Markdown => extract_links_from_markdown(input),
FileType::HTML => extract_links_from_html(input),
FileType::Plaintext => extract_links_from_plaintext(input),
};
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
let mut uris = HashSet::new();
for link in links {
match Url::parse(link.as_str()) {
Ok(url) => uris.insert(Uri::Website(url)),
Err(_) => uris.insert(Uri::Mail(link.as_str().to_owned())),
match Url::parse(&link) {
Ok(url) => {
uris.insert(Uri::Website(url));
}
Err(_) => {
if link.contains('@') {
uris.insert(Uri::Mail(link));
} else if !Path::new(&link).exists() {
if let Some(base_url) = &base_url {
if let Ok(new_url) = base_url.clone().join(&link) {
uris.insert(Uri::Website(new_url));
}
}
}
}
};
}
debug!("Found: {:#?}", uris);
uris
}
@ -72,29 +189,63 @@ mod test {
#[test]
// Checks both an absolute markdown link and a relative one resolved
// against the supplied base URL. (Stale pre-refactor lines that the
// diff left mashed into this test are removed.)
fn test_extract_markdown_links() {
    let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
    let links = extract_links(
        FileType::Markdown,
        input,
        Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()),
    );
    assert_eq!(
        links,
        HashSet::from_iter(
            [
                Uri::Website(Url::parse("https://endler.dev").unwrap()),
                Uri::Website(
                    Url::parse("https://github.com/hello-rust/lychee/relative_link").unwrap()
                )
            ]
            .iter()
            .cloned()
        )
    )
}
#[test]
// Checks that a relative `href` in HTML is resolved against the base URL.
fn test_extract_html_links() {
    let input = r#"<html>
        <div class="row">
            <a href="https://github.com/hello-rust/lychee/">
            <a href="blob/master/README.md">README</a>
        </div>
    </html>"#;
    let links = extract_links(
        FileType::HTML,
        input,
        Some(Url::parse("https://github.com/hello-rust/").unwrap()),
    );
    // `assert!(set.contains(..))` instead of the former
    // `assert_eq!(set.get(..).is_some(), true)` anti-idiom.
    assert!(links.contains(&Uri::Website(
        Url::parse("https://github.com/hello-rust/blob/master/README.md").unwrap()
    )));
}
#[test]
// Anchors (`#lol`) are neither URLs nor mail addresses and there is no
// base URL here, so nothing should be extracted. (The stale pre-refactor
// `extract_links(input)` call left behind by the diff is removed.)
fn test_skip_markdown_anchors() {
    let input = "This is [a test](#lol).";
    let links = extract_links(FileType::Markdown, input, None);
    assert_eq!(links, HashSet::new())
}
#[test]
// Relative paths are skipped when no base URL is given. (The stale
// pre-refactor `extract_links(input)` call left behind by the diff is
// removed.)
fn test_skip_markdown_internal_urls() {
    let input = "This is [a test](./internal).";
    let links = extract_links(FileType::Markdown, input, None);
    assert_eq!(links, HashSet::new())
}
@ -102,7 +253,7 @@ mod test {
fn test_non_markdown_links() {
let input =
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
let links = extract_links(input);
let links = extract_links(FileType::Plaintext, input, None);
let expected = HashSet::from_iter(
[
Uri::Website(Url::parse("https://endler.dev").unwrap()),

View file

@ -66,7 +66,7 @@ async fn run(opts: LycheeOptions) -> Result<i32> {
None => None,
};
let timeout = parse_timeout(opts.timeout)?;
let links = collector::collect_links(opts.inputs).await?;
let links = collector::collect_links(opts.inputs, opts.base_url).await?;
let progress_bar = if opts.progress {
Some(
ProgressBar::new(links.len() as u64)

View file

@ -68,4 +68,7 @@ pub(crate) struct LycheeOptions {
#[options(help = "Request method", default = "get")]
pub method: String,
#[options(help = "Base URL to check relative URls")]
pub base_url: Option<String>,
}