mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-20 05:01:02 +00:00
feat: Support relative URLs (#15)
This commit is contained in:
parent
6663f23707
commit
6bd7bbf51f
8 changed files with 241 additions and 21 deletions
32
Cargo.lock
generated
32
Cargo.lock
generated
|
|
@ -1042,6 +1042,15 @@ dependencies = [
|
|||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.14"
|
||||
|
|
@ -1561,6 +1570,8 @@ dependencies = [
|
|||
"log",
|
||||
"predicates",
|
||||
"pretty_env_logger",
|
||||
"pulldown-cmark",
|
||||
"quick-xml",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"tokio",
|
||||
|
|
@ -2066,12 +2077,33 @@ dependencies = [
|
|||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"getopts",
|
||||
"memchr",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.7"
|
||||
|
|
|
|||
|
|
@ -23,6 +23,8 @@ regex = "1.3.9"
|
|||
url = "2.1.1"
|
||||
check-if-email-exists = "0.8.13"
|
||||
indicatif = "0.15.0"
|
||||
pulldown-cmark = "0.8.0"
|
||||
quick-xml = "0.20.0"
|
||||
|
||||
[dependencies.reqwest]
|
||||
features = ["gzip"]
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
>
|
||||
</li>
|
||||
<li><a href="https://hello-rust.show/10/">Hello Rust</a></li>
|
||||
<li><a href="20/">Hello Rust</a></li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -9,4 +9,4 @@ Test file: "private" URLs (should all be excluded when using `-E` flag).
|
|||
|
||||
IPv6:
|
||||
|
||||
- Loopback: http://[::1]
|
||||
- Loopback: [IPV6 Address](http://[::1])
|
||||
|
|
|
|||
|
|
@ -1,27 +1,58 @@
|
|||
use crate::extract::{self, extract_links};
|
||||
use crate::extract::{self, extract_links, FileType};
|
||||
use anyhow::Result;
|
||||
use extract::Uri;
|
||||
use glob::glob;
|
||||
use reqwest::Url;
|
||||
use std::path::Path;
|
||||
use std::{collections::HashSet, fs};
|
||||
|
||||
pub(crate) async fn collect_links(inputs: Vec<String>) -> Result<HashSet<Uri>> {
|
||||
fn resolve_file_type_by_path<P: AsRef<Path>>(p: P) -> FileType {
|
||||
let path = p.as_ref();
|
||||
match path.extension() {
|
||||
Some(ext) => match ext.to_str().unwrap() {
|
||||
"md" => FileType::Markdown,
|
||||
"html" | "htm" => FileType::HTML,
|
||||
_ => FileType::Plaintext,
|
||||
},
|
||||
None => FileType::Plaintext,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn collect_links(
|
||||
inputs: Vec<String>,
|
||||
base_url: Option<String>,
|
||||
) -> Result<HashSet<Uri>> {
|
||||
let base_url = match base_url {
|
||||
Some(url) => Some(Url::parse(&url)?),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let mut links = HashSet::new();
|
||||
|
||||
for input in inputs {
|
||||
match Url::parse(&input) {
|
||||
Ok(url) => {
|
||||
let path = String::from(url.path());
|
||||
let res = reqwest::get(url).await?;
|
||||
let content = res.text().await?;
|
||||
links.extend(extract_links(&content));
|
||||
|
||||
links.extend(extract_links(
|
||||
resolve_file_type_by_path(path),
|
||||
&content,
|
||||
base_url.clone(),
|
||||
));
|
||||
}
|
||||
Err(_) => {
|
||||
// Assume we got a single file or a glob on our hands
|
||||
for entry in glob(&input)? {
|
||||
match entry {
|
||||
Ok(path) => {
|
||||
let content = fs::read_to_string(path)?;
|
||||
links.extend(extract_links(&content));
|
||||
let content = fs::read_to_string(&path)?;
|
||||
links.extend(extract_links(
|
||||
resolve_file_type_by_path(&path),
|
||||
&content,
|
||||
base_url.clone(),
|
||||
));
|
||||
}
|
||||
Err(e) => println!("{:?}", e),
|
||||
}
|
||||
|
|
|
|||
179
src/extract.rs
179
src/extract.rs
|
|
@ -1,6 +1,8 @@
|
|||
use linkify::LinkFinder;
|
||||
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use quick_xml::{events::Event as HTMLEvent, Reader};
|
||||
use std::net::IpAddr;
|
||||
use std::path::Path;
|
||||
use std::{collections::HashSet, fmt::Display};
|
||||
use url::Url;
|
||||
|
||||
|
|
@ -10,6 +12,13 @@ pub(crate) enum Uri {
|
|||
Mail(String),
|
||||
}
|
||||
|
||||
/// The kind of document that links are extracted from.
///
/// Fieldless, so it is cheap to copy and can be compared directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub(crate) enum FileType {
    /// HTML markup.
    HTML,
    /// Markdown text.
    Markdown,
    /// Any other input; scanned as plain text.
    Plaintext,
}
|
||||
|
||||
impl Uri {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
|
|
@ -49,17 +58,125 @@ fn find_links(input: &str) -> Vec<linkify::Link> {
|
|||
finder.links(input).collect()
|
||||
}
|
||||
|
||||
pub(crate) fn extract_links(input: &str) -> HashSet<Uri> {
|
||||
let links = find_links(input);
|
||||
// Extracting unparsed URL strings from a markdown string
|
||||
fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(tag) => match tag {
|
||||
Tag::Link(_, url, _) | Tag::Image(_, url, _) => vec![url.to_string()],
|
||||
_ => vec![],
|
||||
},
|
||||
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
|
||||
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
|
||||
_ => vec![],
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Extracting unparsed URL strings from a HTML string
|
||||
fn extract_links_from_html(input: &str) -> Vec<String> {
|
||||
let mut reader = Reader::from_str(input);
|
||||
let mut buf = Vec::new();
|
||||
let mut urls = Vec::new();
|
||||
while let Ok(e) = reader.read_event(&mut buf) {
|
||||
match e {
|
||||
HTMLEvent::Start(ref e) => {
|
||||
for attr in e.attributes() {
|
||||
if let Ok(attr) = attr {
|
||||
match (attr.key, e.name()) {
|
||||
(b"href", b"a")
|
||||
| (b"href", b"area")
|
||||
| (b"href", b"base")
|
||||
| (b"href", b"link")
|
||||
| (b"src", b"audio")
|
||||
| (b"src", b"embed")
|
||||
| (b"src", b"iframe")
|
||||
| (b"src", b"img")
|
||||
| (b"src", b"input")
|
||||
| (b"src", b"script")
|
||||
| (b"src", b"source")
|
||||
| (b"src", b"track")
|
||||
| (b"src", b"video")
|
||||
| (b"srcset", b"img")
|
||||
| (b"srcset", b"source")
|
||||
| (b"cite", b"blockquote")
|
||||
| (b"cite", b"del")
|
||||
| (b"cite", b"ins")
|
||||
| (b"cite", b"q")
|
||||
| (b"data", b"object")
|
||||
| (b"onhashchange", b"body") => {
|
||||
urls.push(String::from_utf8_lossy(attr.value.as_ref()).to_string());
|
||||
}
|
||||
_ => {
|
||||
for link in extract_links_from_plaintext(
|
||||
&String::from_utf8_lossy(attr.value.as_ref()).to_string(),
|
||||
) {
|
||||
urls.push(link);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
HTMLEvent::Text(txt) | HTMLEvent::Comment(txt) => {
|
||||
for link in extract_links_from_plaintext(
|
||||
&String::from_utf8_lossy(txt.escaped()).to_string(),
|
||||
) {
|
||||
urls.push(link);
|
||||
}
|
||||
}
|
||||
HTMLEvent::Eof => {
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
urls
|
||||
}
|
||||
|
||||
// Extracting unparsed URL strings from a plaintext
|
||||
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
|
||||
find_links(input)
|
||||
.iter()
|
||||
.map(|l| String::from(l.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn extract_links(
|
||||
file_type: FileType,
|
||||
input: &str,
|
||||
base_url: Option<Url>,
|
||||
) -> HashSet<Uri> {
|
||||
let links = match file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(input),
|
||||
FileType::HTML => extract_links_from_html(input),
|
||||
FileType::Plaintext => extract_links_from_plaintext(input),
|
||||
};
|
||||
|
||||
// Only keep legit URLs. This sorts out things like anchors.
|
||||
// Silently ignore the parse failures for now.
|
||||
let mut uris = HashSet::new();
|
||||
for link in links {
|
||||
match Url::parse(link.as_str()) {
|
||||
Ok(url) => uris.insert(Uri::Website(url)),
|
||||
Err(_) => uris.insert(Uri::Mail(link.as_str().to_owned())),
|
||||
match Url::parse(&link) {
|
||||
Ok(url) => {
|
||||
uris.insert(Uri::Website(url));
|
||||
}
|
||||
Err(_) => {
|
||||
if link.contains('@') {
|
||||
uris.insert(Uri::Mail(link));
|
||||
} else if !Path::new(&link).exists() {
|
||||
if let Some(base_url) = &base_url {
|
||||
if let Ok(new_url) = base_url.clone().join(&link) {
|
||||
uris.insert(Uri::Website(new_url));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
debug!("Found: {:#?}", uris);
|
||||
uris
|
||||
}
|
||||
|
|
@ -72,29 +189,63 @@ mod test {
|
|||
|
||||
// An absolute Markdown link is kept as-is and a relative one is resolved
// against the supplied base URL.
#[test]
fn test_extract_markdown_links() {
    let base = Url::parse("https://github.com/hello-rust/lychee/").unwrap();
    let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
    let links = extract_links(FileType::Markdown, input, Some(base));

    let mut expected = HashSet::new();
    expected.insert(Uri::Website(Url::parse("https://endler.dev").unwrap()));
    expected.insert(Uri::Website(
        Url::parse("https://github.com/hello-rust/lychee/relative_link").unwrap(),
    ));

    assert_eq!(links, expected)
}
|
||||
|
||||
// A relative `href` in HTML is resolved against the supplied base URL.
#[test]
fn test_extract_html_links() {
    let input = r#"<html>
<div class="row">
<a href="https://github.com/hello-rust/lychee/">
<a href="blob/master/README.md">README</a>
</div>
</html>"#;

    let links = extract_links(
        FileType::HTML,
        input,
        Some(Url::parse("https://github.com/hello-rust/").unwrap()),
    );

    // Assert membership directly instead of comparing `is_some()` with `true`.
    assert!(links.contains(&Uri::Website(
        Url::parse("https://github.com/hello-rust/blob/master/README.md").unwrap()
    )));
}
|
||||
|
||||
// A pure anchor link yields no URIs when no base URL is given.
#[test]
fn test_skip_markdown_anchors() {
    let links = extract_links(FileType::Markdown, "This is [a test](#lol).", None);
    assert!(links.is_empty())
}
|
||||
|
||||
// A relative link yields no URIs when no base URL is given.
#[test]
fn test_skip_markdown_internal_urls() {
    let links = extract_links(FileType::Markdown, "This is [a test](./internal).", None);
    assert!(links.is_empty())
}
|
||||
|
||||
|
|
@ -102,7 +253,7 @@ mod test {
|
|||
fn test_non_markdown_links() {
|
||||
let input =
|
||||
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
|
||||
let links = extract_links(input);
|
||||
let links = extract_links(FileType::Plaintext, input, None);
|
||||
let expected = HashSet::from_iter(
|
||||
[
|
||||
Uri::Website(Url::parse("https://endler.dev").unwrap()),
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ async fn run(opts: LycheeOptions) -> Result<i32> {
|
|||
None => None,
|
||||
};
|
||||
let timeout = parse_timeout(opts.timeout)?;
|
||||
let links = collector::collect_links(opts.inputs).await?;
|
||||
let links = collector::collect_links(opts.inputs, opts.base_url).await?;
|
||||
let progress_bar = if opts.progress {
|
||||
Some(
|
||||
ProgressBar::new(links.len() as u64)
|
||||
|
|
|
|||
|
|
@ -68,4 +68,7 @@ pub(crate) struct LycheeOptions {
|
|||
|
||||
#[options(help = "Request method", default = "get")]
|
||||
pub method: String,
|
||||
|
||||
#[options(help = "Base URL to check relative URls")]
|
||||
pub base_url: Option<String>,
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue