mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Use tokenizer for extraction; add benchmark (#424)
This avoids creating a DOM tree for link extraction and instead uses a `TokenSink` for on-the-fly extraction. In hyperfine benchmarks it was about 10-25% faster than master (old: 4.557 s ± 0.404 s; new: 3.832 s ± 0.131 s). The performance also fluctuates a little less. Some missing element/attribute pairs, which contain links according to the HTML spec, were also added. These occur very rarely, but it's good to parse them for completeness' sake. Furthermore, this commit tries to clean up a lot of papercuts around our types: we now differentiate between a `RawUri` (a stringly-typed, unparsed link) and a `Uri`, which is a properly parsed URI type. The extractor now only deals with extracting `RawUri`s, while the collector creates the request objects.
This commit is contained in:
parent
c97ff95575
commit
166c86c30e
40 changed files with 4206 additions and 786 deletions
213
Cargo.lock
generated
213
Cargo.lock
generated
|
|
@ -342,6 +342,14 @@ version = "0.13.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
|
||||
|
||||
[[package]]
|
||||
name = "benches"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"lychee-lib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
|
|
@ -380,6 +388,7 @@ dependencies = [
|
|||
"lazy_static",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -457,6 +466,15 @@ version = "0.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663"
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.71"
|
||||
|
|
@ -625,6 +643,42 @@ dependencies = [
|
|||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"cast",
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"csv",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"num-traits",
|
||||
"oorandom",
|
||||
"plotters",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_cbor",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"tinytemplate",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam"
|
||||
version = "0.8.1"
|
||||
|
|
@ -719,6 +773,28 @@ dependencies = [
|
|||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
|
||||
dependencies = [
|
||||
"bstr",
|
||||
"csv-core",
|
||||
"itoa 0.4.8",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.1.21"
|
||||
|
|
@ -882,6 +958,14 @@ version = "2.5.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59"
|
||||
|
||||
[[package]]
|
||||
name = "extract"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"lychee-lib",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fast-socks5"
|
||||
version = "0.4.3"
|
||||
|
|
@ -1191,6 +1275,12 @@ dependencies = [
|
|||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.11.2"
|
||||
|
|
@ -1673,7 +1763,6 @@ dependencies = [
|
|||
"jwalk",
|
||||
"linkify",
|
||||
"log",
|
||||
"markup5ever_rcdom",
|
||||
"once_cell",
|
||||
"openssl-sys",
|
||||
"par-stream",
|
||||
|
|
@ -1723,18 +1812,6 @@ dependencies = [
|
|||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever_rcdom"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
|
||||
dependencies = [
|
||||
"html5ever",
|
||||
"markup5ever",
|
||||
"tendril",
|
||||
"xml5ever",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "match_cfg"
|
||||
version = "0.1.0"
|
||||
|
|
@ -1935,6 +2012,12 @@ version = "1.9.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
version = "11.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "opaque-debug"
|
||||
version = "0.3.0"
|
||||
|
|
@ -2182,6 +2265,34 @@ version = "0.3.22"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "12295df4f294471248581bc09bef3c38a5e46f1e36d6a37353621a0c6c357e1f"
|
||||
|
||||
[[package]]
|
||||
name = "plotters"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"plotters-backend",
|
||||
"plotters-svg",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "plotters-backend"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c"
|
||||
|
||||
[[package]]
|
||||
name = "plotters-svg"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9"
|
||||
dependencies = [
|
||||
"plotters-backend",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "polling"
|
||||
version = "2.1.0"
|
||||
|
|
@ -2548,12 +2659,30 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.19"
|
||||
|
|
@ -2593,6 +2722,12 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.130"
|
||||
|
|
@ -2602,6 +2737,16 @@ dependencies = [
|
|||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_cbor"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
|
||||
dependencies = [
|
||||
"half",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.130"
|
||||
|
|
@ -2959,6 +3104,16 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.5.0"
|
||||
|
|
@ -3288,6 +3443,17 @@ version = "1.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||
dependencies = [
|
||||
"same-file",
|
||||
"winapi",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.0"
|
||||
|
|
@ -3417,6 +3583,15 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
|
|
@ -3469,15 +3644,3 @@ dependencies = [
|
|||
"serde_json",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xml5ever"
|
||||
version = "0.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9234163818fd8e2418fcde330655e757900d4236acd8cc70fef345ef91f6d865"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
"time",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ members = [
|
|||
"lychee-bin",
|
||||
"lychee-lib",
|
||||
"examples/*",
|
||||
"benches",
|
||||
]
|
||||
|
||||
[patch.crates-io]
|
||||
|
|
|
|||
17
benches/Cargo.toml
Normal file
17
benches/Cargo.toml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
[package]
|
||||
name = "benches"
|
||||
version = "0.0.0"
|
||||
authors = ["Matthias Endler <matthias@endler.dev>"]
|
||||
license = "Apache-2.0/MIT"
|
||||
description = "Criterion benchmarks of the lychee crates"
|
||||
edition = "2018"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
lychee-lib = { path = "../lychee-lib"}
|
||||
criterion = "0.3"
|
||||
|
||||
[[bench]]
|
||||
name = "extract"
|
||||
path = "src/extract.rs"
|
||||
harness = false
|
||||
8
benches/README.md
Normal file
8
benches/README.md
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# Benchmarks
|
||||
|
||||
Testing critical sections of lychee for performance.
|
||||
Run with
|
||||
|
||||
```
|
||||
cargo bench -p benches
|
||||
```
|
||||
24
benches/src/extract.rs
Normal file
24
benches/src/extract.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use lychee_lib::extract::Extractor;
|
||||
use lychee_lib::{FileType, InputContent};
|
||||
use std::fs;
|
||||
|
||||
fn extract(input: &str) {
|
||||
let extracted = Extractor::extract(&InputContent::from_string(input, FileType::Html));
|
||||
println!("{}", extracted.len());
|
||||
}
|
||||
|
||||
fn benchmark(c: &mut Criterion) {
|
||||
// Currently Wikipedia's biggest featured article
|
||||
let elvis = fs::read_to_string("../fixtures/elvis.html").unwrap();
|
||||
c.bench_function("extract from large doc", |b| {
|
||||
b.iter(|| extract(black_box(&elvis)))
|
||||
});
|
||||
}
|
||||
|
||||
// Declares the `benches` group containing the `benchmark` target and hands it
// to `criterion_main!` (the bench target is declared with `harness = false`).
// NOTE(review): `sample_size(10)` is presumably chosen because a single pass
// over the large fixture is slow; confirm before changing it.
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().sample_size(10);
|
||||
targets = benchmark
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
use lychee_lib::{ClientBuilder, Input, Request, Result, Uri};
|
||||
use lychee_lib::{ClientBuilder, Request, Result};
|
||||
use std::convert::TryFrom;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
|
@ -13,15 +13,11 @@ async fn main() -> Result<()> {
|
|||
let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS);
|
||||
|
||||
// Add as many requests as you like
|
||||
let requests = vec![Request::new(
|
||||
Uri::try_from("https://example.org")?,
|
||||
Input::Stdin,
|
||||
)];
|
||||
let requests = vec![Request::try_from("https://example.org")?];
|
||||
|
||||
// Queue requests
|
||||
tokio::spawn(async move {
|
||||
for request in requests {
|
||||
println!("Sending {}", request);
|
||||
send_req.send(request).await.unwrap();
|
||||
}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -13,4 +13,4 @@ tokio = { version = "1.14.0", features = ["full"] }
|
|||
regex = "1.4.6"
|
||||
http = "0.2.5"
|
||||
tokio-stream = "0.1.7"
|
||||
reqwest = { version = "0.11.7", features = ["gzip"] }
|
||||
reqwest = { version = "0.11.7", features = ["gzip"] }
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use lychee_lib::{Collector, Input, Result};
|
||||
use lychee_lib::{Collector, Input, InputSource, Result};
|
||||
use reqwest::Url;
|
||||
use std::path::PathBuf;
|
||||
use tokio_stream::StreamExt;
|
||||
|
|
@ -8,10 +8,16 @@ use tokio_stream::StreamExt;
|
|||
async fn main() -> Result<()> {
|
||||
// Collect all links from the following inputs
|
||||
let inputs = vec![
|
||||
Input::RemoteUrl(Box::new(
|
||||
Url::parse("https://github.com/lycheeverse/lychee").unwrap(),
|
||||
)),
|
||||
Input::FsPath(PathBuf::from("fixtures/TEST.md")),
|
||||
Input {
|
||||
source: InputSource::RemoteUrl(Box::new(
|
||||
Url::parse("https://github.com/lycheeverse/lychee").unwrap(),
|
||||
)),
|
||||
file_type_hint: None,
|
||||
},
|
||||
Input {
|
||||
source: InputSource::FsPath(PathBuf::from("fixtures/TEST.md")),
|
||||
file_type_hint: None,
|
||||
},
|
||||
];
|
||||
|
||||
let links = Collector::new(
|
||||
|
|
|
|||
12
examples/extract/Cargo.toml
Normal file
12
examples/extract/Cargo.toml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
[package]
|
||||
name = "extract"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
|
||||
[[example]]
|
||||
name = "extract"
|
||||
path = "extract.rs"
|
||||
|
||||
[dependencies]
|
||||
lychee-lib = { path = "../../lychee-lib", version = "0.8.1" }
|
||||
tokio = { version = "1.14.0", features = ["full"] }
|
||||
13
examples/extract/extract.rs
Normal file
13
examples/extract/extract.rs
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
use lychee_lib::extract::Extractor;
|
||||
use lychee_lib::Result;
|
||||
use lychee_lib::{FileType, InputContent};
|
||||
use std::fs;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
|
||||
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
|
||||
println!("{:#?}", links);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
2893
fixtures/elvis.html
Normal file
2893
fixtures/elvis.html
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -32,7 +32,12 @@ where
|
|||
max_concurrency,
|
||||
|request: Result<Request>| async {
|
||||
let request: Request = request.expect("cannot read request");
|
||||
let response = client.check(request).await.expect("cannot check request");
|
||||
// This can panic when the Url could not be parsed as a Uri.
|
||||
// See https://github.com/servo/rust-url/issues/554
|
||||
// See https://github.com/seanmonstar/reqwest/issues/668
|
||||
// TODO: Handle error as soon as https://github.com/seanmonstar/reqwest/pull/1399 got merged
|
||||
let response = client.check(request).await.expect("cannot check URI");
|
||||
|
||||
send_resp
|
||||
.send(response)
|
||||
.await
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
use std::io::{self, StdoutLock, Write};
|
||||
|
||||
use lychee_lib::Result;
|
||||
use lychee_lib::{Client, Request};
|
||||
use std::io::{self, Write};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
use crate::ExitCode;
|
||||
|
|
@ -11,10 +10,6 @@ pub(crate) async fn dump<'a, S>(client: Client, requests: S, verbose: bool) -> R
|
|||
where
|
||||
S: futures::Stream<Item = Result<Request>>,
|
||||
{
|
||||
// Lock stdout for better performance
|
||||
let stdout = io::stdout();
|
||||
let mut handle = stdout.lock();
|
||||
|
||||
tokio::pin!(requests);
|
||||
|
||||
while let Some(request) = requests.next().await {
|
||||
|
|
@ -28,7 +23,7 @@ where
|
|||
// See https://github.com/rust-lang/rust/issues/46016
|
||||
// This can occur when piping the output of lychee
|
||||
// to another program like `grep`.
|
||||
if let Err(e) = write(&mut handle, &request, verbose) {
|
||||
if let Err(e) = write(&request, verbose) {
|
||||
if e.kind() != io::ErrorKind::BrokenPipe {
|
||||
eprintln!("{}", e);
|
||||
return Ok(ExitCode::UnexpectedFailure);
|
||||
|
|
@ -42,11 +37,11 @@ where
|
|||
/// Dump request to stdout
|
||||
/// Only print source in verbose mode. This way the normal link output
|
||||
/// can be fed into another tool without data mangling.
|
||||
fn write(handle: &mut StdoutLock<'_>, request: &Request, verbose: bool) -> io::Result<()> {
|
||||
fn write(request: &Request, verbose: bool) -> io::Result<()> {
|
||||
let output = if verbose {
|
||||
request.to_string()
|
||||
} else {
|
||||
request.uri.to_string()
|
||||
};
|
||||
writeln!(*handle, "{}", output)
|
||||
writeln!(io::stdout(), "{}", output)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,7 +112,7 @@ impl LycheeOptions {
|
|||
pub(crate) fn inputs(&self) -> Vec<Input> {
|
||||
self.raw_inputs
|
||||
.iter()
|
||||
.map(|s| Input::new(s, self.config.glob_ignore_case))
|
||||
.map(|s| Input::new(s, None, self.config.glob_ignore_case))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use lychee_lib::{Input, Response, ResponseBody, Status};
|
||||
use lychee_lib::{InputSource, Response, ResponseBody, Status};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::color::{DIM, GREEN, NORMAL, PINK, YELLOW};
|
||||
|
|
@ -26,7 +26,7 @@ pub(crate) struct ResponseStats {
|
|||
pub(crate) redirects: usize,
|
||||
pub(crate) excludes: usize,
|
||||
pub(crate) errors: usize,
|
||||
pub(crate) fail_map: HashMap<Input, HashSet<ResponseBody>>,
|
||||
pub(crate) fail_map: HashMap<InputSource, HashSet<ResponseBody>>,
|
||||
}
|
||||
|
||||
impl ResponseStats {
|
||||
|
|
@ -80,7 +80,7 @@ mod test {
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use http::StatusCode;
|
||||
use lychee_lib::{ClientBuilder, Input, Response, ResponseBody, Status, Uri};
|
||||
use lychee_lib::{ClientBuilder, InputSource, Response, ResponseBody, Status, Uri};
|
||||
use pretty_assertions::assert_eq;
|
||||
use reqwest::Url;
|
||||
use wiremock::{matchers::path, Mock, MockServer, ResponseTemplate};
|
||||
|
|
@ -117,7 +117,7 @@ mod test {
|
|||
assert!(stats.is_empty());
|
||||
|
||||
stats.add(Response(
|
||||
Input::Stdin,
|
||||
InputSource::Stdin,
|
||||
ResponseBody {
|
||||
uri: website("https://example.org/ok"),
|
||||
status: Status::Ok(StatusCode::OK),
|
||||
|
|
@ -140,11 +140,11 @@ mod test {
|
|||
stats.add(get_mock_status_response(status).await);
|
||||
}
|
||||
|
||||
let mut expected_map: HashMap<Input, HashSet<ResponseBody>> = HashMap::new();
|
||||
let mut expected_map: HashMap<InputSource, HashSet<ResponseBody>> = HashMap::new();
|
||||
for status in &status_codes {
|
||||
if status.is_server_error() || status.is_client_error() || status.is_redirection() {
|
||||
let Response(input, response_body) = get_mock_status_response(status).await;
|
||||
let entry = expected_map.entry(input).or_default();
|
||||
let Response(source, response_body) = get_mock_status_response(status).await;
|
||||
let entry = expected_map.entry(source).or_default();
|
||||
entry.insert(response_body);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,8 +44,8 @@ impl Display for CompactResponseStats {
|
|||
stats.fail_map.len()
|
||||
)?;
|
||||
}
|
||||
for (input, responses) in &stats.fail_map {
|
||||
color!(f, BOLD_YELLOW, "[{}]:\n", input)?;
|
||||
for (source, responses) in &stats.fail_map {
|
||||
color!(f, BOLD_YELLOW, "[{}]:\n", source)?;
|
||||
for response in responses {
|
||||
writeln!(f, "{}", color_response(response))?;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,10 +39,10 @@ impl Display for DetailedResponseStats {
|
|||
write_stat(f, "\u{2753} Unknown", stats.unknown, true)?; //❓
|
||||
write_stat(f, "\u{1f6ab} Errors", stats.errors + stats.failures, false)?; // 🚫
|
||||
|
||||
for (input, responses) in &stats.fail_map {
|
||||
for (source, responses) in &stats.fail_map {
|
||||
// Using leading newlines over trailing ones (e.g. `writeln!`)
|
||||
// lets us avoid extra newlines without any additional logic.
|
||||
write!(f, "\n\nErrors in {}", input)?;
|
||||
write!(f, "\n\nErrors in {}", source)?;
|
||||
for response in responses {
|
||||
write!(f, "\n{}", color_response(response))?;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,10 +66,10 @@ impl Display for MarkdownResponseStats {
|
|||
if !&stats.fail_map.is_empty() {
|
||||
writeln!(f)?;
|
||||
writeln!(f, "## Errors per input")?;
|
||||
for (input, responses) in &stats.fail_map {
|
||||
for (source, responses) in &stats.fail_map {
|
||||
// Using leading newlines over trailing ones (e.g. `writeln!`)
|
||||
// lets us avoid extra newlines without any additional logic.
|
||||
writeln!(f, "### Errors in {}", input)?;
|
||||
writeln!(f, "### Errors in {}", source)?;
|
||||
for response in responses {
|
||||
writeln!(
|
||||
f,
|
||||
|
|
|
|||
|
|
@ -24,7 +24,6 @@ html5ever = "0.25.1"
|
|||
http = "0.2.5"
|
||||
hubcaps = "0.6.2"
|
||||
linkify = "0.8.0"
|
||||
markup5ever_rcdom = "0.1.0"
|
||||
openssl-sys = "0.9.72"
|
||||
pulldown-cmark = "0.8.0"
|
||||
regex = "1.4.6"
|
||||
|
|
|
|||
|
|
@ -4,7 +4,11 @@
|
|||
clippy::default_trait_access,
|
||||
clippy::used_underscore_binding
|
||||
)]
|
||||
use std::{collections::HashSet, convert::TryFrom, time::Duration};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
convert::{TryFrom, TryInto},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
|
||||
use http::{
|
||||
|
|
@ -190,7 +194,14 @@ impl Client {
|
|||
Request: TryFrom<T, Error = E>,
|
||||
ErrorKind: From<E>,
|
||||
{
|
||||
let Request { uri, source } = Request::try_from(request)?;
|
||||
let Request {
|
||||
uri,
|
||||
source,
|
||||
element: _element,
|
||||
attribute: _attribute,
|
||||
} = request.try_into()?;
|
||||
// TODO: Allow filtering based on element and attribute
|
||||
|
||||
let status = if self.filter.is_excluded(&uri) {
|
||||
Status::Excluded
|
||||
} else if uri.is_file() {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
use crate::{extract::Extractor, Base, Input, Request, Result};
|
||||
use crate::{
|
||||
extract::Extractor, helpers::request, types::raw_uri::RawUri, Base, Input, Request, Result,
|
||||
};
|
||||
use futures::{
|
||||
stream::{self, Stream},
|
||||
StreamExt, TryStreamExt,
|
||||
};
|
||||
use par_stream::ParStreamExt;
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
/// It drives the link extraction from inputs
|
||||
|
|
@ -35,18 +36,19 @@ impl Collector {
|
|||
let skip_missing_inputs = self.skip_missing_inputs;
|
||||
let contents = stream::iter(inputs)
|
||||
.par_then_unordered(None, move |input| async move {
|
||||
input.get_contents(None, skip_missing_inputs).await
|
||||
input.get_contents(skip_missing_inputs).await
|
||||
})
|
||||
.flatten();
|
||||
|
||||
let extractor = Extractor::new(self.base);
|
||||
let base = self.base;
|
||||
contents
|
||||
.par_then_unordered(None, move |content| {
|
||||
let mut extractor = extractor.clone();
|
||||
// send to parallel worker
|
||||
let base = base.clone();
|
||||
async move {
|
||||
let content = content?;
|
||||
let requests: HashSet<Request> = extractor.extract(&content)?;
|
||||
let uris: Vec<RawUri> = Extractor::extract(&content);
|
||||
let requests = request::create(uris, &content, &base)?;
|
||||
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
|
||||
}
|
||||
})
|
||||
|
|
@ -56,7 +58,7 @@ impl Collector {
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::{fs::File, io::Write};
|
||||
use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write, iter::FromIterator};
|
||||
|
||||
use http::StatusCode;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
|
@ -65,11 +67,17 @@ mod test {
|
|||
use super::*;
|
||||
use crate::{
|
||||
mock_server,
|
||||
test_utils::{mail, website},
|
||||
types::{FileType, Input},
|
||||
test_utils::{load_fixture, mail, website},
|
||||
types::{FileType, Input, InputSource},
|
||||
Result, Uri,
|
||||
};
|
||||
|
||||
// Helper function to run the collector on the given inputs
|
||||
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
|
||||
let responses = Collector::new(base, false).collect_links(inputs).await;
|
||||
responses.map(|r| r.unwrap().uri).collect().await
|
||||
}
|
||||
|
||||
const TEST_STRING: &str = "http://test-string.com";
|
||||
const TEST_URL: &str = "https://test-url.org";
|
||||
const TEST_FILE: &str = "https://test-file.io";
|
||||
|
|
@ -82,12 +90,8 @@ mod test {
|
|||
// Treat as plaintext file (no extension)
|
||||
let file_path = temp_dir.path().join("README");
|
||||
let _file = File::create(&file_path)?;
|
||||
let input = Input::new(&file_path.as_path().display().to_string(), true);
|
||||
let contents: Vec<_> = input
|
||||
.get_contents(None, true)
|
||||
.await
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
let input = Input::new(&file_path.as_path().display().to_string(), None, true);
|
||||
let contents: Vec<_> = input.get_contents(true).await.collect::<Vec<_>>().await;
|
||||
|
||||
assert_eq!(contents.len(), 1);
|
||||
assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext);
|
||||
|
|
@ -96,12 +100,8 @@ mod test {
|
|||
|
||||
#[tokio::test]
|
||||
async fn test_url_without_extension_is_html() -> Result<()> {
|
||||
let input = Input::new("https://example.org/", true);
|
||||
let contents: Vec<_> = input
|
||||
.get_contents(None, true)
|
||||
.await
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
let input = Input::new("https://example.org/", None, true);
|
||||
let contents: Vec<_> = input.get_contents(true).await.collect::<Vec<_>>().await;
|
||||
|
||||
assert_eq!(contents.len(), 1);
|
||||
assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html);
|
||||
|
|
@ -128,32 +128,191 @@ mod test {
|
|||
let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
|
||||
|
||||
let inputs = vec![
|
||||
Input::String(TEST_STRING.to_owned()),
|
||||
Input::RemoteUrl(Box::new(
|
||||
Url::parse(&mock_server.uri()).map_err(|e| (mock_server.uri(), e))?,
|
||||
)),
|
||||
Input::FsPath(file_path),
|
||||
Input::FsGlob {
|
||||
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
|
||||
ignore_case: true,
|
||||
Input {
|
||||
source: InputSource::String(TEST_STRING.to_owned()),
|
||||
file_type_hint: None,
|
||||
},
|
||||
Input {
|
||||
source: InputSource::RemoteUrl(Box::new(
|
||||
Url::parse(&mock_server.uri()).map_err(|e| (mock_server.uri(), e))?,
|
||||
)),
|
||||
file_type_hint: None,
|
||||
},
|
||||
Input {
|
||||
source: InputSource::FsPath(file_path),
|
||||
file_type_hint: None,
|
||||
},
|
||||
Input {
|
||||
source: InputSource::FsGlob {
|
||||
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
|
||||
ignore_case: true,
|
||||
},
|
||||
file_type_hint: None,
|
||||
},
|
||||
];
|
||||
|
||||
let responses = Collector::new(None, false).collect_links(inputs).await;
|
||||
let mut links: Vec<Uri> = responses.map(|r| r.unwrap().uri).collect().await;
|
||||
let links = collect(inputs, None).await;
|
||||
|
||||
let mut expected_links = vec![
|
||||
let expected_links = HashSet::from_iter([
|
||||
website(TEST_STRING),
|
||||
website(TEST_URL),
|
||||
website(TEST_FILE),
|
||||
website(TEST_GLOB_1),
|
||||
mail(TEST_GLOB_2_MAIL),
|
||||
];
|
||||
]);
|
||||
|
||||
links.sort();
|
||||
expected_links.sort();
|
||||
assert_eq!(links, expected_links);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_collect_markdown_links() {
|
||||
let base = Base::try_from("https://github.com/hello-rust/lychee/").unwrap();
|
||||
let input = Input {
|
||||
source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()),
|
||||
file_type_hint: Some(FileType::Markdown),
|
||||
};
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
website("https://endler.dev"),
|
||||
website("https://github.com/hello-rust/lychee/relative_link"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
/// HTML input: the absolute `href` is collected unchanged and the relative
/// `href` is resolved against the base URL.
#[tokio::test]
|
||||
async fn test_collect_html_links() {
|
||||
let base = Base::try_from("https://github.com/lycheeverse/").unwrap();
|
||||
let input = Input {
|
||||
source: InputSource::String(
|
||||
r#"<html>
|
||||
<div class="row">
|
||||
<a href="https://github.com/lycheeverse/lychee/">
|
||||
<a href="blob/master/README.md">README</a>
|
||||
</div>
|
||||
</html>"#
|
||||
.to_string(),
|
||||
),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
};
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
website("https://github.com/lycheeverse/lychee/"),
|
||||
website("https://github.com/lycheeverse/blob/master/README.md"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
/// HTML input: both the `src` URL and every URL listed in `srcset` are
/// collected and resolved against the base URL.
#[tokio::test]
|
||||
async fn test_collect_html_srcset() {
|
||||
let base = Base::try_from("https://example.com/").unwrap();
|
||||
let input = Input {
|
||||
source: InputSource::String(
|
||||
r#"
|
||||
<img
|
||||
src="/static/image.png"
|
||||
srcset="
|
||||
/static/image300.png 300w,
|
||||
/static/image600.png 600w,
|
||||
"
|
||||
/>
|
||||
"#
|
||||
.to_string(),
|
||||
),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
};
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
website("https://example.com/static/image.png"),
|
||||
website("https://example.com/static/image300.png"),
|
||||
website("https://example.com/static/image600.png"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
/// Markdown input: `@/...` link targets (with and without a fragment) are
/// treated as relative links and resolved against the base URL.
#[tokio::test]
|
||||
async fn test_markdown_internal_url() {
|
||||
let base = Base::try_from("https://localhost.com/").unwrap();
|
||||
|
||||
let input = Input {
|
||||
source: InputSource::String(
|
||||
r#"This is [an internal url](@/internal.md)
|
||||
This is [an internal url](@/internal.markdown)
|
||||
This is [an internal url](@/internal.markdown#example)
|
||||
This is [an internal url](@/internal.md#example)"#
|
||||
.to_string(),
|
||||
),
|
||||
file_type_hint: Some(FileType::Markdown),
|
||||
};
|
||||
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected = HashSet::from_iter([
|
||||
website("https://localhost.com/@/internal.md"),
|
||||
website("https://localhost.com/@/internal.markdown"),
|
||||
website("https://localhost.com/@/internal.md#example"),
|
||||
website("https://localhost.com/@/internal.markdown#example"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_html5_not_valid_xml_relative_links() {
|
||||
let base = Base::try_from("https://example.org").unwrap();
|
||||
let input = load_fixture("TEST_HTML5.html");
|
||||
|
||||
let input = Input {
|
||||
source: InputSource::String(input),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
};
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
// the body links wouldn't be present if the file was parsed strictly as XML
|
||||
website("https://example.org/body/a"),
|
||||
website("https://example.org/body/div_empty_a"),
|
||||
website("https://example.org/css/style_full_url.css"),
|
||||
website("https://example.org/css/style_relative_url.css"),
|
||||
website("https://example.org/head/home"),
|
||||
website("https://example.org/images/icon.png"),
|
||||
website("https://example.org/js/script.js"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_relative_url_with_base_extracted_from_input() {
|
||||
let contents = r#"<html>
|
||||
<div class="row">
|
||||
<a href="https://github.com/lycheeverse/lychee/">Github</a>
|
||||
<a href="/about">About</a>
|
||||
</div>
|
||||
</html>"#;
|
||||
let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));
|
||||
|
||||
let server_uri = Url::parse(&mock_server.uri()).unwrap();
|
||||
|
||||
let input = Input {
|
||||
source: InputSource::RemoteUrl(Box::new(server_uri.clone())),
|
||||
file_type_hint: None,
|
||||
};
|
||||
|
||||
let links = collect(vec![input], None).await;
|
||||
|
||||
let expected_urls = HashSet::from_iter([
|
||||
website("https://github.com/lycheeverse/lychee/"),
|
||||
website(&format!("{}about", server_uri)),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_urls);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,566 +0,0 @@
|
|||
use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
|
||||
|
||||
use html5ever::{
|
||||
parse_document,
|
||||
tendril::{StrTendril, TendrilSink},
|
||||
};
|
||||
use log::info;
|
||||
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||||
use percent_encoding::percent_decode_str;
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{
|
||||
helpers::{path, url},
|
||||
types::{FileType, InputContent},
|
||||
Base, ErrorKind, Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
/// A handler for extracting links from various input formats like Markdown and
|
||||
/// HTML. Allocations are avoided if possible as this is a performance-critical
|
||||
/// section of the library.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Extractor {
|
||||
/// Base URL or Path
|
||||
pub base: Option<Base>,
|
||||
}
|
||||
|
||||
impl Extractor {
|
||||
pub(crate) const fn new(base: Option<Base>) -> Self {
|
||||
Extractor { base }
|
||||
}
|
||||
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
pub(crate) fn extract(&mut self, input_content: &InputContent) -> Result<HashSet<Request>> {
|
||||
let urls = match input_content.file_type {
|
||||
FileType::Markdown => self.extract_markdown(&input_content.content),
|
||||
FileType::Html => self.extract_html(&input_content.content)?,
|
||||
FileType::Plaintext => self.extract_plaintext(&input_content.content),
|
||||
};
|
||||
self.create_requests(&urls, input_content)
|
||||
}
|
||||
|
||||
/// Create requests out of the collected URLs.
|
||||
/// Only keeps "valid" URLs. This filters out anchors for example.
|
||||
fn create_requests(
|
||||
&self,
|
||||
urls: &[StrTendril],
|
||||
input_content: &InputContent,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let mut requests: HashSet<Request> = HashSet::with_capacity(urls.len());
|
||||
|
||||
let base_input = match &input_content.input {
|
||||
Input::RemoteUrl(url) => Some(Url::parse(&format!(
|
||||
"{}://{}",
|
||||
url.scheme(),
|
||||
url.host_str().ok_or(ErrorKind::InvalidUrlHost)?
|
||||
))?),
|
||||
_ => None,
|
||||
// other inputs do not have a URL to extract a base
|
||||
};
|
||||
|
||||
for url in urls {
|
||||
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(url) = self.base.as_ref().and_then(|u| u.join(url)) {
|
||||
Request::new(Uri { url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if url::is_anchor(url) {
|
||||
// Silently ignore anchor links for now
|
||||
continue;
|
||||
}
|
||||
match self.create_uri_from_path(root, url)? {
|
||||
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
|
||||
None => {
|
||||
// In case we cannot create a URI from a path but we didn't receive an error,
|
||||
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else if let Some(url) = base_input.as_ref().map(|u| u.join(url)) {
|
||||
if self.base.is_some() {
|
||||
continue;
|
||||
}
|
||||
Request::new(Uri { url: url? }, input_content.input.clone())
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &url);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
fn extract_markdown(&self, input: &str) -> Vec<StrTendril> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
|
||||
vec![StrTendril::from(url.as_ref())]
|
||||
}
|
||||
MDEvent::Text(txt) => self.extract_plaintext(&txt),
|
||||
MDEvent::Html(html) => self.extract_plaintext(&html.to_string()),
|
||||
_ => vec![],
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
fn extract_html(&mut self, input: &str) -> Result<Vec<StrTendril>> {
|
||||
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default())
|
||||
.from_utf8()
|
||||
.read_from(&mut input.as_bytes())?;
|
||||
|
||||
Ok(self.walk_html_links(&rc_dom.document))
|
||||
}
|
||||
|
||||
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
|
||||
fn walk_html_links(&mut self, node: &Handle) -> Vec<StrTendril> {
|
||||
let mut all_urls = Vec::new();
|
||||
match node.data {
|
||||
NodeData::Text { ref contents } => {
|
||||
all_urls.append(&mut self.extract_plaintext(&contents.borrow()));
|
||||
}
|
||||
|
||||
NodeData::Comment { ref contents } => {
|
||||
all_urls.append(&mut self.extract_plaintext(contents));
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
..
|
||||
} => {
|
||||
for attr in attrs.borrow().iter() {
|
||||
let urls = url::extract_links_from_elem_attr(
|
||||
attr.name.local.as_ref(),
|
||||
name.local.as_ref(),
|
||||
attr.value.as_ref(),
|
||||
);
|
||||
|
||||
if urls.is_empty() {
|
||||
self.extract_plaintext(&attr.value);
|
||||
} else {
|
||||
all_urls.extend(urls.into_iter().map(StrTendril::from).collect::<Vec<_>>());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// recursively traverse the document's nodes -- this doesn't need any extra
|
||||
// exit conditions, because the document is a tree
|
||||
for child in node.children.borrow().iter() {
|
||||
let urls = self.walk_html_links(child);
|
||||
all_urls.extend(urls);
|
||||
}
|
||||
|
||||
all_urls
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
// Allow &self here for consistency with the other extractors
|
||||
#[allow(clippy::unused_self)]
|
||||
fn extract_plaintext(&self, input: &str) -> Vec<StrTendril> {
|
||||
url::find_links(input)
|
||||
.map(|l| StrTendril::from(l.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn create_uri_from_path(&self, src: &Path, dst: &str) -> Result<Option<Url>> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` is left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?;
|
||||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), &self.base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use std::{
        array,
        collections::HashSet,
        fs::File,
        io::{BufReader, Read},
        path::Path,
    };

    use pretty_assertions::assert_eq;

    use super::*;
    use crate::{
        helpers::url::find_links,
        test_utils::{mail, website},
        types::{FileType, InputContent},
        Base, Uri,
    };

    #[test]
    fn test_create_uri_from_path() {
        let uri = Extractor::new(None)
            .create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding")
            .unwrap()
            .unwrap();

        assert_eq!(uri.as_str(), "file:///test+encoding");
    }

    /// Read a file from the `fixtures` directory next to this crate.
    fn load_fixture(filename: &str) -> String {
        let mut fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .to_path_buf();
        fixture_path.push("fixtures");
        fixture_path.push(filename);

        let mut content = String::new();
        BufReader::new(File::open(fixture_path).expect("Unable to open fixture file"))
            .read_to_string(&mut content)
            .expect("Unable to read fixture file contents");

        content
    }

    /// Run the extractor over `input` and keep only the URI of each request.
    fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
        let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
        let content = InputContent::from_string(input, file_type);

        // unwrap is fine here as this helper function is only used in tests
        let requests = Extractor::new(base).extract(&content).unwrap();
        requests.into_iter().map(|request| request.uri).collect()
    }

    #[test]
    fn test_file_type() {
        // FIXME: Assume plaintext in case a path has no extension
        // assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
        let cases = array::IntoIter::new([
            ("test.md", FileType::Markdown),
            ("test.markdown", FileType::Markdown),
            ("test.html", FileType::Html),
            ("test.txt", FileType::Plaintext),
            ("test.something", FileType::Plaintext),
            ("/absolute/path/to/test.something", FileType::Plaintext),
        ]);

        for (path, expected) in cases {
            assert_eq!(FileType::from(path), expected);
        }
    }

    #[test]
    fn test_extract_link_at_end_of_line() {
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let expected = vec![StrTendril::from(input.trim_end())];
        let mut extractor = Extractor::new(None);

        assert_eq!(expected, extractor.extract_markdown(input));
        assert_eq!(expected, extractor.extract_plaintext(input));
        assert_eq!(expected, extractor.extract_html(input).unwrap());
    }

    #[test]
    fn test_extract_markdown_links() {
        let markdown = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
        let links = extract_uris(
            markdown,
            FileType::Markdown,
            Some("https://github.com/hello-rust/lychee/"),
        );

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://endler.dev"),
            website("https://github.com/hello-rust/lychee/relative_link"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html_links() {
        let html = r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">
<a href="blob/master/README.md">README</a>
</div>
</html>"#;
        let links = extract_uris(html, FileType::Html, Some("https://github.com/lycheeverse/"));

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://github.com/lycheeverse/lychee/"),
            website("https://github.com/lycheeverse/blob/master/README.md"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html_srcset() {
        let html = r#"
<img
src="/static/image.png"
srcset="
/static/image300.png 300w,
/static/image600.png 600w,
"
/>
"#;
        let links = extract_uris(html, FileType::Html, Some("https://example.com/"));

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.com/static/image.png"),
            website("https://example.com/static/image300.png"),
            website("https://example.com/static/image600.png"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_skip_markdown_anchors() {
        let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_internal_urls() {
        let links = extract_uris("This is [a test](./internal).", FileType::Markdown, None);

        assert!(links.is_empty());
    }

    #[test]
    fn test_markdown_internal_url() {
        let input = "This is [an internal url](@/internal.md) \
This is [an internal url](@/internal.markdown) \
This is [an internal url](@/internal.markdown#example) \
This is [an internal url](@/internal.md#example)";

        let links = extract_uris(input, FileType::Markdown, Some("https://localhost.com/"));

        let expected: HashSet<Uri> = array::IntoIter::new([
            website("https://localhost.com/@/internal.md"),
            website("https://localhost.com/@/internal.markdown"),
            website("https://localhost.com/@/internal.md#example"),
            website("https://localhost.com/@/internal.markdown#example"),
        ])
        .collect();

        assert_eq!(links, expected);
    }

    #[test]
    fn test_skip_markdown_email() {
        let links = extract_uris(
            "Get in touch - [Contact Us](mailto:test@test.com)",
            FileType::Markdown,
            None,
        );
        let expected: HashSet<Uri> = array::IntoIter::new([mail("test@test.com")]).collect();

        assert_eq!(links, expected);
    }

    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org";
        let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext, None);

        let expected: HashSet<Uri> = array::IntoIter::new([
            website("https://endler.dev"),
            website("https://hello-rust.show/foo/bar?lol=1"),
            mail("test@example.org"),
        ])
        .collect();

        assert_eq!(links, expected);
    }

    #[test]
    fn test_md_escape() {
        let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
        let links: Vec<_> = find_links(input).collect();

        // NOTE(review): the boolean produced by `matches!` is discarded, so this
        // test currently only checks that `find_links` does not panic. Confirm
        // the intended expectation before wrapping this in `assert!`.
        matches!(&links[..], [link] if link.as_str() == expected);
    }

    #[test]
    fn test_extract_html5_not_valid_xml() {
        let fixture = load_fixture("TEST_HTML5.html");
        let links = extract_uris(&fixture, FileType::Html, None);

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.org/head/home"),
            website("https://example.org/css/style_full_url.css"),
            // the body links wouldn't be present if the file was parsed strictly as XML
            website("https://example.org/body/a"),
            website("https://example.org/body/div_empty_a"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_not_valid_xml_relative_links() {
        let fixture = load_fixture("TEST_HTML5.html");
        let links = extract_uris(&fixture, FileType::Html, Some("https://example.org"));

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.org/head/home"),
            website("https://example.org/images/icon.png"),
            website("https://example.org/css/style_relative_url.css"),
            website("https://example.org/css/style_full_url.css"),
            website("https://example.org/js/script.js"),
            // the body links wouldn't be present if the file was parsed strictly as XML
            website("https://example.org/body/a"),
            website("https://example.org/body/div_empty_a"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_relative_url_with_base_extracted_from_input() {
        let contents = r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">Github</a>
<a href="/about">About</a>
</div>
</html>"#;

        let input_content = InputContent {
            input: Input::RemoteUrl(Box::new(
                Url::parse("https://example.org/some-post").unwrap(),
            )),
            file_type: FileType::Html,
            content: contents.to_string(),
        };

        let requests = Extractor::new(None).extract(&input_content).unwrap();
        let urls: HashSet<_> = requests
            .iter()
            .map(|request| request.uri.url.as_str().to_string())
            .collect();

        let expected_urls: HashSet<_> = array::IntoIter::new([
            String::from("https://github.com/lycheeverse/lychee/"),
            String::from("https://example.org/about"),
        ])
        .collect();

        assert_eq!(urls, expected_urls);
    }

    #[test]
    fn test_extract_html5_lowercase_doctype() {
        // this has been problematic with previous XML based parser
        let fixture = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html");
        let links = extract_uris(&fixture, FileType::Html, None);

        let expected_links: HashSet<Uri> =
            array::IntoIter::new([website("https://example.org/body/a")]).collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_minified() {
        // minified HTML with some quirky elements such as href attribute values specified without quotes
        let fixture = load_fixture("TEST_HTML5_MINIFIED.html");
        let links = extract_uris(&fixture, FileType::Html, None);

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.org/"),
            website("https://example.org/favicon.ico"),
            website("https://fonts.externalsite.com"),
            website("https://example.org/docs/"),
            website("https://example.org/forum"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_malformed() {
        // malformed links shouldn't stop the parser from further parsing
        let fixture = load_fixture("TEST_HTML5_MALFORMED_LINKS.html");
        let links = extract_uris(&fixture, FileType::Html, None);

        let expected_links: HashSet<Uri> =
            array::IntoIter::new([website("https://example.org/valid")]).collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_custom_elements() {
        // the element name shouldn't matter for attributes like href, src, cite etc
        let fixture = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html");
        let links = extract_uris(&fixture, FileType::Html, None);

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.org/some-weird-element"),
            website("https://example.org/even-weirder-src"),
            website("https://example.org/even-weirder-href"),
            website("https://example.org/citations"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_urls_with_at_sign_properly() {
        // note that these used to parse as emails
        let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
        let links = extract_uris(&input, FileType::Plaintext, None);

        let expected_links: HashSet<Uri> = array::IntoIter::new([
            website("https://example.com/@test/test"),
            website("http://otherdomain.com/test/@test"),
        ])
        .collect();

        assert_eq!(links, expected_links);
    }
}
|
||||
145
lychee-lib/src/extract/html.rs
Normal file
145
lychee-lib/src/extract/html.rs
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
use html5ever::{
|
||||
buffer_queue::BufferQueue,
|
||||
tendril::StrTendril,
|
||||
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
|
||||
};
|
||||
|
||||
use super::plaintext::extract_plaintext;
|
||||
use crate::types::raw_uri::RawUri;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct LinkExtractor {
|
||||
links: Vec<RawUri>,
|
||||
}
|
||||
|
||||
impl TokenSink for LinkExtractor {
|
||||
type Handle = ();
|
||||
|
||||
#[allow(clippy::match_same_arms)]
|
||||
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
|
||||
match token {
|
||||
Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
|
||||
Token::TagToken(tag) => {
|
||||
let Tag {
|
||||
kind: _kind,
|
||||
name,
|
||||
self_closing: _self_closing,
|
||||
attrs,
|
||||
} = tag;
|
||||
|
||||
for attr in attrs {
|
||||
let urls = extract_urls_from_elem_attr(
|
||||
attr.name.local.as_ref(),
|
||||
name.as_ref(),
|
||||
attr.value.as_ref(),
|
||||
);
|
||||
|
||||
if urls.is_empty() {
|
||||
extract_plaintext(&attr.value);
|
||||
} else {
|
||||
self.links.extend(
|
||||
urls.into_iter()
|
||||
.map(|url| RawUri {
|
||||
text: url,
|
||||
element: Some(name.to_string()),
|
||||
attribute: Some(attr.name.local.to_string()),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Token::ParseError(_err) => {
|
||||
// Silently ignore parse errors
|
||||
}
|
||||
Token::CommentToken(_raw) => (),
|
||||
Token::NullCharacterToken => (),
|
||||
Token::DoctypeToken(_doctype) => (),
|
||||
Token::EOFToken => (),
|
||||
}
|
||||
TokenSinkResult::Continue
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given html attribute.
///
/// `attr_name` and `elem_name` select the rule, `attr_value` is the raw
/// attribute value. Returns the value itself for plain link attributes, the
/// first whitespace-separated token of every comma-separated candidate for
/// `srcset`, and an empty vector for every other element/attribute pair.
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr(
    attr_name: &str,
    elem_name: &str,
    attr_value: &str,
) -> Vec<String> {
    // For a comprehensive list of elements that might contain URLs/URIs
    // see https://www.w3.org/TR/REC-html40/index/attributes.html
    // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
    match (elem_name, attr_name) {
        // Common element/attribute combinations for links
        (_, "href" | "src" | "cite" | "usemap")
        // Less common (but still valid!) combinations
        | ("applet", "codebase")
        | ("body", "background")
        | ("button", "formaction")
        | ("command", "icon")
        | ("form", "action")
        | ("frame", "longdesc")
        | ("head", "profile")
        | ("html", "manifest")
        | ("iframe", "longdesc")
        | ("img", "longdesc")
        | ("input", "formaction")
        | ("object", "classid")
        | ("object", "codebase")
        | ("object", "data")
        | ("video", "poster") => vec![attr_value.to_owned()],
        (_, "srcset") => attr_value
            .trim()
            .split(',')
            // The URL is the first token of a candidate; the rest is its
            // descriptor. `split_ascii_whitespace` never yields empty tokens,
            // so blank candidates (e.g. after a trailing comma) are skipped.
            .filter_map(|image_candidate| image_candidate.split_ascii_whitespace().next())
            .map(str::to_owned)
            .collect(),
        _ => Vec::new(),
    }
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
|
||||
let mut tokenizer = Tokenizer::new(
|
||||
LinkExtractor { links: Vec::new() },
|
||||
TokenizerOpts::default(),
|
||||
);
|
||||
|
||||
let mut input = BufferQueue::new();
|
||||
input.push_back(StrTendril::from(buf));
|
||||
|
||||
let _handle = tokenizer.feed(&mut input);
|
||||
tokenizer.end();
|
||||
|
||||
tokenizer.sink.links
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare URL followed by a newline is extracted without the newline.
    #[test]
    fn test_extract_link_at_end_of_line() {
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let expected = vec![input.trim_end().to_string()];

        let mut uris: Vec<String> = Vec::new();
        for raw_uri in extract_html(input) {
            uris.push(raw_uri.text);
        }

        assert_eq!(expected, uris);
    }
}
|
||||
35
lychee-lib/src/extract/markdown.rs
Normal file
35
lychee-lib/src/extract/markdown.rs
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
|
||||
use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri};
|
||||
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
pub(crate) fn extract_markdown(input: &str) -> Vec<RawUri> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(Tag::Link(_, uri, _)) => {
|
||||
vec![RawUri {
|
||||
text: uri.to_string(),
|
||||
// Emulate `<a href="...">` tag here to be compatible with
|
||||
// HTML links. We might consider using the actual Markdown
|
||||
// `LinkType` for better granularity in the future
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}]
|
||||
}
|
||||
MDEvent::Start(Tag::Image(_, uri, _)) => {
|
||||
vec![RawUri {
|
||||
text: uri.to_string(),
|
||||
// Emulate `<img src="...">` tag here to be compatible with
|
||||
// HTML links. We might consider using the actual Markdown
|
||||
// `LinkType` for better granularity in the future
|
||||
element: Some("img".to_string()),
|
||||
attribute: Some("src".to_string()),
|
||||
}]
|
||||
}
|
||||
MDEvent::Text(txt) => extract_plaintext(&txt),
|
||||
MDEvent::Html(html) => extract_plaintext(&html.to_string()),
|
||||
_ => vec![],
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
245
lychee-lib/src/extract/mod.rs
Normal file
245
lychee-lib/src/extract/mod.rs
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
use crate::types::{raw_uri::RawUri, FileType, InputContent};
|
||||
|
||||
mod html;
|
||||
mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
use html::extract_html;
|
||||
use markdown::extract_markdown;
|
||||
use plaintext::extract_plaintext;
|
||||
|
||||
/// A handler for extracting links from various input formats like Markdown and
|
||||
/// HTML. Allocations should be avoided if possible as this is a
|
||||
/// performance-critical section of the library.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Extractor;
|
||||
|
||||
impl Extractor {
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
#[must_use]
|
||||
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
|
||||
match input_content.file_type {
|
||||
FileType::Markdown => extract_markdown(&input_content.content),
|
||||
FileType::Html => extract_html(&input_content.content),
|
||||
FileType::Plaintext => extract_plaintext(&input_content.content),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use pretty_assertions::assert_eq;
|
||||
use reqwest::Url;
|
||||
use std::{array, collections::HashSet, convert::TryFrom, path::Path};
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
helpers::url::find_links,
|
||||
test_utils::{load_fixture, mail, website},
|
||||
types::{FileType, InputContent, InputSource},
|
||||
Uri,
|
||||
};
|
||||
|
||||
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
|
||||
let input_content = InputContent::from_string(input, file_type);
|
||||
Extractor::extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// File types are derived from the path's extension, with plaintext as the
/// result for unknown or missing extensions.
#[test]
fn test_file_type() {
    assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);

    let cases = array::IntoIter::new([
        ("test.md", FileType::Markdown),
        ("test.markdown", FileType::Markdown),
        ("test.html", FileType::Html),
        ("test.txt", FileType::Plaintext),
        ("test.something", FileType::Plaintext),
        ("/absolute/path/to/test.something", FileType::Plaintext),
    ]);
    for (path, expected) in cases {
        assert_eq!(FileType::from(path), expected);
    }
}
|
||||
|
||||
/// A pure fragment link does not convert into a `Uri`.
#[test]
fn test_skip_markdown_anchors() {
    let markdown = "This is [a test](#lol).";

    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
|
||||
|
||||
/// A `./`-relative link does not convert into a `Uri`.
#[test]
fn test_skip_markdown_internal_urls() {
    let markdown = "This is [a test](./internal).";

    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
|
||||
|
||||
/// A `mailto:` link target is extracted as a mail address.
#[test]
fn test_skip_markdown_email() {
    let links = extract_uris(
        "Get in touch - [Contact Us](mailto:test@test.com)",
        FileType::Markdown,
    );
    let expected: HashSet<Uri> = array::IntoIter::new([mail("test@test.com")]).collect();

    assert_eq!(links, expected);
}
|
||||
|
||||
/// A root-relative link does not convert into a `Uri`.
#[test]
fn relative_urls() {
    let markdown = "This is [a test](/internal).";

    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
|
||||
|
||||
/// Plaintext input yields both web links and bare mail addresses.
#[test]
fn test_non_markdown_links() {
    let plaintext =
        "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org";
    let links: HashSet<Uri> = extract_uris(plaintext, FileType::Plaintext);

    let expected: HashSet<Uri> = array::IntoIter::new([
        website("https://endler.dev"),
        website("https://hello-rust.show/foo/bar?lol=1"),
        mail("test@example.org"),
    ])
    .collect();

    assert_eq!(links, expected);
}
|
||||
|
||||
#[test]
fn test_md_escape() {
    // NOTE(review): the result of `matches!` is discarded, so this test only
    // checks that `find_links` runs without panicking on the escaped input.
    // `expected` also ends with a trailing `)` that the input does not have.
    // Confirm the intended link before turning this into a real assertion.
    let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
    let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

    let links = find_links(input).collect::<Vec<_>>();

    let _ = matches!(links.as_slice(), [link] if link.as_str() == expected);
}
|
||||
|
||||
#[test]
fn test_extract_html5_not_valid_xml() {
    let expected_links: HashSet<Uri> = vec![
        website("https://example.org/head/home"),
        website("https://example.org/css/style_full_url.css"),
        // the body links wouldn't be present if the file was parsed strictly as XML
        website("https://example.org/body/a"),
        website("https://example.org/body/div_empty_a"),
    ]
    .into_iter()
    .collect();

    let html = load_fixture("TEST_HTML5.html");
    let links = extract_uris(&html, FileType::Html);

    assert_eq!(links, expected_links);
}
|
||||
|
||||
#[test]
fn test_extract_relative_url() {
    let contents = r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">Github</a>
<a href="/about">About</a>
</div>
</html>"#;

    let remote = Url::parse("https://example.org/some-post").unwrap();
    let input_content = InputContent {
        source: InputSource::RemoteUrl(Box::new(remote)),
        file_type: FileType::Html,
        content: contents.to_string(),
    };

    // The extractor only yields raw strings; relative links are NOT resolved here
    let urls: HashSet<String> = Extractor::extract(&input_content)
        .into_iter()
        .map(|raw_uri| raw_uri.text)
        .collect();

    let expected_urls: HashSet<String> = vec![
        "https://github.com/lycheeverse/lychee/".to_string(),
        "/about".to_string(),
    ]
    .into_iter()
    .collect();

    assert_eq!(urls, expected_urls);
}
|
||||
|
||||
#[test]
fn test_extract_html5_lowercase_doctype() {
    // this has been problematic with previous XML based parser
    let html = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html");

    let expected_links: HashSet<Uri> = vec![website("https://example.org/body/a")]
        .into_iter()
        .collect();

    assert_eq!(extract_uris(&html, FileType::Html), expected_links);
}
|
||||
|
||||
#[test]
fn test_extract_html5_minified() {
    // minified HTML with some quirky elements such as href attribute values specified without quotes
    let html = load_fixture("TEST_HTML5_MINIFIED.html");

    let expected_links: HashSet<Uri> = vec![
        website("https://example.org/"),
        website("https://example.org/favicon.ico"),
        website("https://fonts.externalsite.com"),
        website("https://example.org/docs/"),
        website("https://example.org/forum"),
    ]
    .into_iter()
    .collect();

    assert_eq!(extract_uris(&html, FileType::Html), expected_links);
}
|
||||
|
||||
#[test]
fn test_extract_html5_malformed() {
    // malformed links shouldn't stop the parser from further parsing
    let html = load_fixture("TEST_HTML5_MALFORMED_LINKS.html");

    let expected_links: HashSet<Uri> = vec![website("https://example.org/valid")]
        .into_iter()
        .collect();

    assert_eq!(extract_uris(&html, FileType::Html), expected_links);
}
|
||||
|
||||
#[test]
fn test_extract_html5_custom_elements() {
    // the element name shouldn't matter for attributes like href, src, cite etc
    let html = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html");

    let expected_links: HashSet<Uri> = vec![
        website("https://example.org/some-weird-element"),
        website("https://example.org/even-weirder-src"),
        website("https://example.org/even-weirder-href"),
        website("https://example.org/citations"),
    ]
    .into_iter()
    .collect();

    assert_eq!(extract_uris(&html, FileType::Html), expected_links);
}
|
||||
|
||||
#[test]
fn test_extract_urls_with_at_sign_properly() {
    // note that these used to parse as emails
    let input = "https://example.com/@test/test http://otherdomain.com/test/@test";

    let expected_links: HashSet<Uri> = vec![
        website("https://example.com/@test/test"),
        website("http://otherdomain.com/test/@test"),
    ]
    .into_iter()
    .collect();

    assert_eq!(extract_uris(input, FileType::Plaintext), expected_links);
}
|
||||
}
|
||||
22
lychee-lib/src/extract/plaintext.rs
Normal file
22
lychee-lib/src/extract/plaintext.rs
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
use crate::{helpers::url, types::raw_uri::RawUri};
|
||||
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
pub(crate) fn extract_plaintext(input: &str) -> Vec<RawUri> {
|
||||
url::find_links(input)
|
||||
.map(|uri| RawUri::from(uri.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_link_at_end_of_line() {
        // The trailing newline must not become part of the extracted link
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let expected = vec![RawUri::from(input.trim_end())];

        assert_eq!(expected, extract_plaintext(input));
    }
}
|
||||
|
|
@ -36,8 +36,8 @@ pub struct Filter {
|
|||
/// URIs excluded from checking
|
||||
pub excludes: Option<Excludes>,
|
||||
/// Only check URIs with the given schemes (e.g. `https` and `http`)
|
||||
// TODO: includes scheme and excludes scheme
|
||||
// TODO: excludes_mail should be merged to excludes scheme
|
||||
// TODO: includes_scheme and excludes_scheme
|
||||
// TODO: excludes_mail should be an alias for exclude_scheme=mailto
|
||||
pub schemes: HashSet<String>,
|
||||
/// Example: 192.168.0.1
|
||||
pub exclude_private_ips: bool,
|
||||
|
|
|
|||
|
|
@ -1,2 +1,3 @@
|
|||
pub(crate) mod path;
|
||||
pub(crate) mod request;
|
||||
pub(crate) mod url;
|
||||
|
|
|
|||
114
lychee-lib/src/helpers/request.rs
Normal file
114
lychee-lib/src/helpers/request.rs
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
use html5ever::tendril::StrTendril;
|
||||
use log::info;
|
||||
use percent_encoding::percent_decode_str;
|
||||
use reqwest::Url;
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
convert::TryFrom,
|
||||
iter::FromIterator,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
helpers::{path, url},
|
||||
types::{raw_uri::RawUri, InputContent, InputSource},
|
||||
Base, ErrorKind, Request, Result, Uri,
|
||||
};
|
||||
|
||||
const MAX_TRUNCATED_STR_LEN: usize = 100;
|
||||
|
||||
/// Create requests out of the collected URLs.
|
||||
/// Only keeps "valid" URLs. This filters out anchors for example.
|
||||
pub(crate) fn create(
|
||||
uris: Vec<RawUri>,
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let base_input = Base::from_source(&input_content.source);
|
||||
|
||||
let requests: Result<Vec<Option<Request>>> = uris
|
||||
.into_iter()
|
||||
.map(|raw_uri| {
|
||||
let is_anchor = raw_uri.is_anchor();
|
||||
let text = StrTendril::from(raw_uri.text.clone());
|
||||
let element = raw_uri.element.clone();
|
||||
let attribute = raw_uri.attribute.clone();
|
||||
|
||||
// Truncate the source in case it gets too long Ideally we should
|
||||
// avoid the initial String allocation for `source` altogether
|
||||
let source = match &input_content.source {
|
||||
InputSource::String(s) => {
|
||||
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
|
||||
}
|
||||
// Cloning is cheap here
|
||||
c => c.clone(),
|
||||
};
|
||||
|
||||
if let Ok(uri) = Uri::try_from(raw_uri) {
|
||||
Ok(Some(Request::new(uri, source, element, attribute)))
|
||||
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
|
||||
Ok(Some(Request::new(Uri { url }, source, element, attribute)))
|
||||
} else if let InputSource::FsPath(root) = &input_content.source {
|
||||
if is_anchor {
|
||||
// Silently ignore anchor links for now
|
||||
Ok(None)
|
||||
} else if let Some(url) = create_uri_from_path(root, &text, base)? {
|
||||
Ok(Some(Request::new(Uri { url }, source, element, attribute)))
|
||||
} else {
|
||||
// In case we cannot create a URI from a path but we didn't receive an error,
|
||||
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
||||
Ok(None)
|
||||
}
|
||||
} else if let Some(url) = base_input.as_ref().map(|u| u.join(&text)) {
|
||||
if base.is_some() {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(Request::new(
|
||||
Uri { url: url? },
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
info!("Handling of `{}` not implemented yet", text);
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let requests: Vec<Request> = requests?.into_iter().flatten().collect();
|
||||
Ok(HashSet::from_iter(requests))
|
||||
}
|
||||
|
||||
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` gets left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?;
|
||||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_create_uri_from_path() {
        // A `+` in the destination must survive the decode/encode round trip
        let src = PathBuf::from("/README.md");

        let url = create_uri_from_path(&src, "test+encoding", &None)
            .unwrap()
            .unwrap();

        assert_eq!(url.as_str(), "file:///test+encoding");
    }
}
|
||||
|
|
@ -18,44 +18,6 @@ pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
|
|||
path
|
||||
}
|
||||
|
||||
/// Extract all semantically-known links from a given html attribute. Pattern-based extraction from
/// unstructured plaintext is done elsewhere.
///
/// * `attr_name` - name of the attribute (e.g. `href`)
/// * `elem_name` - name of the element carrying the attribute (e.g. `a`)
/// * `attr_value` - raw value of the attribute
///
/// Returns the attribute value itself for single-link attributes, the first
/// token of every comma-separated candidate for `srcset`, and an empty vector
/// for every other attribute/element combination.
pub(crate) fn extract_links_from_elem_attr(
    attr_name: &str,
    elem_name: &str,
    attr_value: &str,
) -> Vec<String> {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    match (attr_name, elem_name) {
        ("href" | "src" | "cite", _) | ("data", "object") | ("onhashchange", "body") => {
            vec![attr_value.to_owned()]
        }
        ("srcset", _) => attr_value
            .trim()
            .split(',')
            // The URL is the first whitespace-separated token of each image
            // candidate; candidates without any token are skipped.
            .filter_map(|candidate| candidate.split_ascii_whitespace().next())
            .map(str::to_owned)
            .collect(),
        _ => Vec::new(),
    }
}
|
||||
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
/// Returns `true` if `url` begins with `#`.
pub(crate) fn is_anchor(url: &str) -> bool {
    url.strip_prefix('#').is_some()
}
|
||||
|
||||
// Use `LinkFinder` to offload the raw link searching in plaintext
|
||||
pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link> {
|
||||
LINK_FINDER.links(input)
|
||||
|
|
@ -65,12 +27,6 @@ pub(crate) fn find_links(input: &str) -> impl Iterator<Item = linkify::Link> {
|
|||
mod test_fs_tree {
|
||||
use super::*;
|
||||
|
||||
#[test]
fn test_is_anchor() {
    // Only a leading `#` makes a URL an anchor
    assert_eq!(is_anchor("#anchor"), true);
    assert_eq!(is_anchor("notan#anchor"), false);
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_get_params_and_fragment() {
|
||||
assert_eq!(remove_get_params_and_fragment("/"), "/");
|
||||
|
|
|
|||
|
|
@ -75,5 +75,8 @@ pub use crate::{
|
|||
client::{check, Client, ClientBuilder},
|
||||
collector::Collector,
|
||||
filter::{Excludes, Filter, Includes},
|
||||
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
|
||||
types::{
|
||||
Base, ErrorKind, FileType, Input, InputContent, InputSource, Request, Response,
|
||||
ResponseBody, Result, Status, Uri,
|
||||
},
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use std::convert::TryFrom;
|
||||
use std::{convert::TryFrom, fs, path::Path};
|
||||
|
||||
use reqwest::Url;
|
||||
|
||||
|
|
@ -45,3 +45,12 @@ pub(crate) fn mail(address: &str) -> Uri {
|
|||
.expect("Expected valid Mail Address")
|
||||
.into()
|
||||
}
|
||||
|
||||
pub(crate) fn load_fixture(filename: &str) -> String {
|
||||
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("fixtures")
|
||||
.join(filename);
|
||||
fs::read_to_string(fixture_path).unwrap()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ use reqwest::Url;
|
|||
use serde::{Deserialize, Serialize};
|
||||
use std::{convert::TryFrom, path::PathBuf};
|
||||
|
||||
use crate::ErrorKind;
|
||||
use crate::{ErrorKind, InputSource};
|
||||
|
||||
/// When encountering links without a full domain in a document,
|
||||
/// the base determines where this resource can be found.
|
||||
|
|
@ -19,7 +19,7 @@ pub enum Base {
|
|||
impl Base {
|
||||
/// Join link with base url
|
||||
#[must_use]
|
||||
pub fn join(&self, link: &str) -> Option<Url> {
|
||||
pub(crate) fn join(&self, link: &str) -> Option<Url> {
|
||||
match self {
|
||||
Self::Remote(url) => url.join(link).ok(),
|
||||
Self::Local(_) => None,
|
||||
|
|
@ -28,12 +28,30 @@ impl Base {
|
|||
|
||||
/// Return the directory if the base is local
|
||||
#[must_use]
|
||||
pub fn dir(&self) -> Option<PathBuf> {
|
||||
pub(crate) fn dir(&self) -> Option<PathBuf> {
|
||||
match self {
|
||||
Self::Remote(_) => None,
|
||||
Self::Local(d) => Some(d.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_source(source: &InputSource) -> Option<Url> {
|
||||
match &source {
|
||||
InputSource::RemoteUrl(url) => {
|
||||
// TODO: This should be refactored.
|
||||
// Cases like https://user:pass@example.com are not handled
|
||||
// We can probably use the original URL and just replace the
|
||||
// path component in the caller of this function
|
||||
if let Some(port) = url.port() {
|
||||
Url::parse(&format!("{}://{}:{}", url.scheme(), url.host_str()?, port)).ok()
|
||||
} else {
|
||||
Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
|
||||
}
|
||||
}
|
||||
// other inputs do not have a URL to extract a base
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for Base {
|
||||
|
|
@ -80,4 +98,24 @@ mod test_base {
|
|||
Base::try_from(dir.as_ref().to_str().unwrap())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
fn test_get_base_from_url() {
    // (input URL, expected base)
    let cases = [
        ("https://example.org", "https://example.org"),
        ("https://example.org?query=something", "https://example.org"),
        ("https://example.org/#anchor", "https://example.org"),
        ("https://example.org/foo/bar", "https://example.org"),
        (
            "https://example.org:1234/foo/bar",
            "https://example.org:1234",
        ),
    ];

    for (input, expected) in cases.iter() {
        let source = InputSource::RemoteUrl(Box::new(Url::parse(input).unwrap()));
        let expected = Url::parse(expected).unwrap();

        assert_eq!(Base::from_source(&source), Some(expected));
    }
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use std::path::Path;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
/// `FileType` defines which file types lychee can handle
|
||||
pub enum FileType {
|
||||
/// File in HTML format
|
||||
|
|
|
|||
|
|
@ -19,10 +19,34 @@ fn valid_extension(p: &Path) -> bool {
|
|||
matches!(FileType::from(p), FileType::Markdown | FileType::Html)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
#[derive(Debug)]
|
||||
/// Encapsulates the content for a given input
|
||||
pub struct InputContent {
|
||||
/// Input source
|
||||
pub source: InputSource,
|
||||
/// File type of given input
|
||||
pub file_type: FileType,
|
||||
/// Raw UTF-8 string content
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
source: InputSource::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize)]
|
||||
#[non_exhaustive]
|
||||
/// An exhaustive list of input sources, which lychee accepts
|
||||
pub enum Input {
|
||||
/// Input types which lychee supports
|
||||
pub enum InputSource {
|
||||
/// URL (of HTTP/HTTPS scheme).
|
||||
RemoteUrl(Box<Url>),
|
||||
/// Unix shell-style glob pattern.
|
||||
|
|
@ -40,49 +64,25 @@ pub enum Input {
|
|||
String(String),
|
||||
}
|
||||
|
||||
impl Serialize for Input {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Input {
|
||||
impl Display for InputSource {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Input::RemoteUrl(url) => url.as_str(),
|
||||
Input::FsGlob { pattern, .. } => pattern,
|
||||
Input::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Input::Stdin => "stdin",
|
||||
Input::String(_) => "raw input string",
|
||||
Self::RemoteUrl(url) => url.as_str(),
|
||||
Self::FsGlob { pattern, .. } => pattern,
|
||||
Self::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Self::Stdin => "stdin",
|
||||
Self::String(s) => s,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Encapsulates the content for a given input
|
||||
pub struct InputContent {
|
||||
/// Input source
|
||||
pub input: Input,
|
||||
/// File type of given input
|
||||
pub file_type: FileType,
|
||||
/// Raw UTF-8 string content
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
input: Input::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
/// Lychee Input with optional file hint for parsing
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub struct Input {
|
||||
/// Origin of input
|
||||
pub source: InputSource,
|
||||
/// Hint to indicate which extractor to use
|
||||
pub file_type_hint: Option<FileType>,
|
||||
}
|
||||
|
||||
impl Input {
|
||||
|
|
@ -90,23 +90,27 @@ impl Input {
|
|||
/// Construct a new `Input` source. In case the input is a `glob` pattern,
|
||||
/// `glob_ignore_case` decides whether matching files against the `glob` is
|
||||
/// case-insensitive or not
|
||||
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
|
||||
if value == STDIN {
|
||||
Self::Stdin
|
||||
pub fn new(value: &str, file_type_hint: Option<FileType>, glob_ignore_case: bool) -> Self {
|
||||
let source = if value == STDIN {
|
||||
InputSource::Stdin
|
||||
} else if let Ok(url) = Url::parse(value) {
|
||||
Self::RemoteUrl(Box::new(url))
|
||||
InputSource::RemoteUrl(Box::new(url))
|
||||
} else {
|
||||
// this seems to be the only way to determine if this is a glob pattern
|
||||
let is_glob = glob::Pattern::escape(value) != value;
|
||||
|
||||
if is_glob {
|
||||
Self::FsGlob {
|
||||
InputSource::FsGlob {
|
||||
pattern: value.to_owned(),
|
||||
ignore_case: glob_ignore_case,
|
||||
}
|
||||
} else {
|
||||
Self::FsPath(value.into())
|
||||
InputSource::FsPath(value.into())
|
||||
}
|
||||
};
|
||||
Self {
|
||||
source,
|
||||
file_type_hint,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -120,16 +124,15 @@ impl Input {
|
|||
/// network request or retrieving the contents from the file system)
|
||||
pub async fn get_contents(
|
||||
self,
|
||||
file_type_hint: Option<FileType>,
|
||||
skip_missing: bool,
|
||||
) -> impl Stream<Item = Result<InputContent>> {
|
||||
try_stream! {
|
||||
match self {
|
||||
Input::RemoteUrl(ref url) => {
|
||||
match self.source {
|
||||
InputSource::RemoteUrl(ref url) => {
|
||||
let contents: InputContent = Self::url_contents(url).await?;
|
||||
yield contents;
|
||||
},
|
||||
Input::FsGlob {
|
||||
InputSource::FsGlob {
|
||||
ref pattern,
|
||||
ignore_case,
|
||||
} => {
|
||||
|
|
@ -138,7 +141,7 @@ impl Input {
|
|||
yield content;
|
||||
}
|
||||
}
|
||||
Input::FsPath(ref path) => {
|
||||
InputSource::FsPath(ref path) => {
|
||||
if path.is_dir() {
|
||||
for entry in WalkDir::new(path).skip_hidden(true)
|
||||
.process_read_dir(|_, _, _, children| {
|
||||
|
|
@ -179,12 +182,12 @@ impl Input {
|
|||
};
|
||||
}
|
||||
},
|
||||
Input::Stdin => {
|
||||
let content = Self::stdin_content(file_type_hint).await?;
|
||||
InputSource::Stdin => {
|
||||
let content = Self::stdin_content(self.file_type_hint).await?;
|
||||
yield content;
|
||||
},
|
||||
Input::String(ref s) => {
|
||||
let content = Self::string_content(s, file_type_hint);
|
||||
InputSource::String(ref s) => {
|
||||
let content = Self::string_content(s, self.file_type_hint);
|
||||
yield content;
|
||||
},
|
||||
}
|
||||
|
|
@ -201,7 +204,7 @@ impl Input {
|
|||
|
||||
let res = reqwest::get(url.clone()).await?;
|
||||
let input_content = InputContent {
|
||||
input: Input::RemoteUrl(Box::new(url.clone())),
|
||||
source: InputSource::RemoteUrl(Box::new(url.clone())),
|
||||
file_type,
|
||||
content: res.text().await?,
|
||||
};
|
||||
|
|
@ -251,8 +254,8 @@ impl Input {
|
|||
.map_err(|e| (path.clone().into(), e))?;
|
||||
let input_content = InputContent {
|
||||
file_type: FileType::from(path.as_ref()),
|
||||
source: InputSource::FsPath(path.into()),
|
||||
content,
|
||||
input: Input::FsPath(path.into()),
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
|
|
@ -264,7 +267,7 @@ impl Input {
|
|||
stdin.read_to_string(&mut content).await?;
|
||||
|
||||
let input_content = InputContent {
|
||||
input: Input::Stdin,
|
||||
source: InputSource::Stdin,
|
||||
file_type: file_type_hint.unwrap_or_default(),
|
||||
content,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ mod base;
|
|||
mod error;
|
||||
mod file;
|
||||
mod input;
|
||||
pub(crate) mod raw_uri;
|
||||
mod request;
|
||||
mod response;
|
||||
mod status;
|
||||
|
|
@ -12,7 +13,7 @@ mod uri;
|
|||
pub use base::Base;
|
||||
pub use error::ErrorKind;
|
||||
pub use file::FileType;
|
||||
pub use input::{Input, InputContent};
|
||||
pub use input::{Input, InputContent, InputSource};
|
||||
pub use request::Request;
|
||||
pub use response::{Response, ResponseBody};
|
||||
pub use status::Status;
|
||||
|
|
|
|||
56
lychee-lib/src/types/raw_uri.rs
Normal file
56
lychee-lib/src/types/raw_uri.rs
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
use std::fmt::Display;
|
||||
|
||||
/// A raw URI that got extracted from a document with a fuzzy parser.
/// Note that this can still be invalid according to stricter URI standards
///
/// All fields are plain strings, so the type also implements `Eq` and `Hash`
/// and can be stored in sets or used as a map key.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct RawUri {
    /// Unparsed URI represented as a `String`. There is no guarantee that it
    /// can be parsed into a URI object
    pub text: String,
    /// Name of the element that contained the URI (e.g. `a` for the <a> tag).
    /// This is a way to classify links to make it easier to offer fine control
    /// over the links that will be checked e.g. by trying to filter out links
    /// that were found in unwanted tags like `<pre>` or `<code>`.
    pub element: Option<String>,
    /// Name of the attribute that contained the URI (e.g. `src`). This is a way
    /// to classify links to make it easier to offer fine control over the links
    /// that will be checked e.g. by trying to filter out links that were found
    /// in unwanted attributes like `srcset` or `manifest`.
    pub attribute: Option<String>,
}
|
||||
|
||||
impl RawUri {
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
|
||||
pub(crate) fn is_anchor(&self) -> bool {
|
||||
self.text.starts_with('#')
|
||||
}
|
||||
}
|
||||
impl Display for RawUri {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} (Attribute: {:?})", self.text, self.attribute)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for RawUri {
|
||||
fn from(text: &str) -> Self {
|
||||
RawUri {
|
||||
text: text.to_string(),
|
||||
element: None,
|
||||
attribute: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_is_anchor() {
        // Only a leading `#` makes a raw URI an anchor
        assert!(RawUri::from("#anchor").is_anchor());
        assert!(!RawUri::from("notan#anchor").is_anchor());
    }
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
use std::{convert::TryFrom, fmt::Display};
|
||||
|
||||
use crate::{ErrorKind, Input, Uri};
|
||||
use crate::{ErrorKind, Uri};
|
||||
|
||||
use super::InputSource;
|
||||
|
||||
/// A request type that can be handled by lychee
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
|
|
@ -9,15 +11,31 @@ pub struct Request {
|
|||
/// checked with lychee
|
||||
pub uri: Uri,
|
||||
/// The resource which contained the given URI
|
||||
pub source: Input,
|
||||
pub source: InputSource,
|
||||
/// Specifies how the URI was rendered inside a document
|
||||
/// (for example `img`, `a`, `pre`, or `code`).
|
||||
/// In case of plaintext input the field is `None`.
|
||||
pub element: Option<String>,
|
||||
/// Specifies the attribute (e.g. `href`) that contained the URI
|
||||
pub attribute: Option<String>,
|
||||
}
|
||||
|
||||
impl Request {
|
||||
/// Instantiate a new `Request` object
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub const fn new(uri: Uri, source: Input) -> Self {
|
||||
Request { uri, source }
|
||||
pub const fn new(
|
||||
uri: Uri,
|
||||
source: InputSource,
|
||||
element: Option<String>,
|
||||
attribute: Option<String>,
|
||||
) -> Self {
|
||||
Request {
|
||||
uri,
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -27,12 +45,25 @@ impl Display for Request {
|
|||
}
|
||||
}
|
||||
|
||||
impl TryFrom<Uri> for Request {
|
||||
type Error = ErrorKind;
|
||||
|
||||
fn try_from(uri: Uri) -> Result<Self, Self::Error> {
|
||||
Ok(Request::new(
|
||||
uri.clone(),
|
||||
InputSource::RemoteUrl(Box::new(uri.url)),
|
||||
None,
|
||||
None,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<String> for Request {
|
||||
type Error = ErrorKind;
|
||||
|
||||
fn try_from(s: String) -> Result<Self, Self::Error> {
|
||||
let uri = Uri::try_from(s.as_str())?;
|
||||
Ok(Request::new(uri, Input::String(s)))
|
||||
Ok(Request::new(uri, InputSource::String(s), None, None))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -41,6 +72,11 @@ impl TryFrom<&str> for Request {
|
|||
|
||||
fn try_from(s: &str) -> Result<Self, Self::Error> {
|
||||
let uri = Uri::try_from(s)?;
|
||||
Ok(Request::new(uri, Input::String(s.to_owned())))
|
||||
Ok(Request::new(
|
||||
uri,
|
||||
InputSource::String(s.to_owned()),
|
||||
None,
|
||||
None,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,17 +2,17 @@ use std::fmt::Display;
|
|||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{Input, Status, Uri};
|
||||
use crate::{InputSource, Status, Uri};
|
||||
|
||||
/// Response type returned by lychee after checking a URI
|
||||
#[derive(Debug)]
|
||||
pub struct Response(pub Input, pub ResponseBody);
|
||||
pub struct Response(pub InputSource, pub ResponseBody);
|
||||
|
||||
impl Response {
|
||||
#[inline]
|
||||
#[must_use]
|
||||
/// Create new response
|
||||
pub const fn new(uri: Uri, status: Status, source: Input) -> Self {
|
||||
pub const fn new(uri: Uri, status: Status, source: InputSource) -> Self {
|
||||
Response(source, ResponseBody { uri, status })
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,10 @@ use url::Url;
|
|||
|
||||
use crate::{ErrorKind, Result};
|
||||
|
||||
/// Lychee's own representation of a URI, which encapsulates all support formats.
|
||||
use super::raw_uri::RawUri;
|
||||
|
||||
/// Lychee's own representation of a URI, which encapsulates all supported
|
||||
/// formats.
|
||||
///
|
||||
/// If the scheme is `mailto`, it's a mail address.
|
||||
/// Otherwise it's treated as a website URL.
|
||||
|
|
@ -138,6 +141,23 @@ impl TryFrom<&str> for Uri {
|
|||
}
|
||||
}
|
||||
|
||||
impl TryFrom<RawUri> for Uri {
|
||||
type Error = ErrorKind;
|
||||
|
||||
fn try_from(raw_uri: RawUri) -> Result<Self> {
|
||||
let s = raw_uri.text;
|
||||
let s = s.trim_start_matches("mailto:");
|
||||
if let Err(mail_err) = parse_email(s) {
|
||||
match Url::parse(s) {
|
||||
Ok(uri) => Ok(uri.into()),
|
||||
Err(url_err) => Err((s.to_owned(), url_err, mail_err).into()),
|
||||
}
|
||||
} else {
|
||||
Ok(Url::parse(&(String::from("mailto:") + s)).unwrap().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Uri {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
|
|
|
|||
Loading…
Reference in a new issue