Use builder pattern and channels (fixes #12) (#33)

This implements a basic builder for the Checker struct as discussed in #12.
It uses derive_builder together with a custom build method to instantiate the more elaborate fields, such as reqwest::Client.
It also adds deadpool as a dependency and uses tokio::sync::mpsc channels to handle a pool of clients that query websites.
This commit is contained in:
Matthias 2020-11-24 21:30:06 +01:00 committed by GitHub
parent 8025a2eedb
commit b0f7a805ef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 705 additions and 508 deletions

253
Cargo.lock generated
View file

@ -146,9 +146,9 @@ dependencies = [
[[package]]
name = "async-channel"
version = "1.4.0"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43de69555a39d52918e2bc33a408d3c0a86c829b212d898f4ca25d21a6387478"
checksum = "59740d83946db6a5af71ae25ddf9562c2b176b2ca42cf99a455f09f4a220d6b9"
dependencies = [
"concurrent-queue",
"event-listener",
@ -165,7 +165,7 @@ dependencies = [
"flate2",
"futures-core",
"memchr",
"pin-project-lite",
"pin-project-lite 0.1.7",
]
[[package]]
@ -195,7 +195,7 @@ dependencies = [
"httparse",
"lazy_static",
"log",
"pin-project-lite",
"pin-project-lite 0.1.7",
]
[[package]]
@ -204,7 +204,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ae22a338d28c75b53702b66f77979062cb29675db376d99e451af4fa79dedb3"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"concurrent-queue",
"futures-lite",
"libc",
@ -273,7 +273,7 @@ dependencies = [
"async-mutex",
"async-task",
"blocking",
"crossbeam-utils",
"crossbeam-utils 0.7.2",
"futures-channel",
"futures-core",
"futures-io",
@ -284,7 +284,7 @@ dependencies = [
"memchr",
"num_cpus",
"once_cell",
"pin-project-lite",
"pin-project-lite 0.1.7",
"pin-utils",
"slab",
"wasm-bindgen-futures",
@ -349,7 +349,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46254cf2fdcdf1badb5934448c1bcbe046a56537b3987d96c51a7afc5d03f293"
dependencies = [
"addr2line",
"cfg-if",
"cfg-if 0.1.10",
"libc",
"miniz_oxide",
"object",
@ -383,7 +383,7 @@ dependencies = [
"anyhow",
"async-mutex",
"bastion-executor 0.4.0",
"crossbeam-queue",
"crossbeam-queue 0.2.3",
"futures",
"futures-timer",
"fxhash",
@ -409,7 +409,7 @@ dependencies = [
"bastion-utils",
"crossbeam-channel",
"crossbeam-epoch",
"crossbeam-utils",
"crossbeam-utils 0.7.2",
"futures-timer",
"lazy_static",
"libc",
@ -429,8 +429,8 @@ dependencies = [
"bastion-utils",
"crossbeam-channel",
"crossbeam-epoch",
"crossbeam-queue",
"crossbeam-utils",
"crossbeam-queue 0.2.3",
"crossbeam-utils 0.7.2",
"futures-timer",
"lazy_static",
"lever",
@ -525,7 +525,7 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38e98299d518ec351ca016363e0cbfc77059dcd08dfa9700d15e405536097a"
dependencies = [
"crossbeam-queue",
"crossbeam-queue 0.2.3",
"stable_deref_trait",
]
@ -575,6 +575,12 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "check-if-email-exists"
version = "0.8.13"
@ -615,7 +621,7 @@ dependencies = [
"ansi_term 0.11.0",
"atty",
"bitflags",
"strsim",
"strsim 0.8.0",
"textwrap",
"unicode-width",
"vec_map",
@ -639,6 +645,17 @@ dependencies = [
"cache-padded",
]
[[package]]
name = "config"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b076e143e1d9538dde65da30f8481c2a6c44040edb8e02b9bf1351edb92ce3"
dependencies = [
"lazy_static",
"nom",
"serde",
]
[[package]]
name = "console"
version = "0.12.0"
@ -707,7 +724,7 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
]
[[package]]
@ -716,8 +733,8 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ee0cc8804d5393478d743b035099520087a5186f3b93fa58cec08fa62407b6"
dependencies = [
"cfg-if",
"crossbeam-utils",
"cfg-if 0.1.10",
"crossbeam-utils 0.7.2",
]
[[package]]
@ -727,8 +744,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"cfg-if 0.1.10",
"crossbeam-utils 0.7.2",
"lazy_static",
"maybe-uninit",
"memoffset",
@ -741,11 +758,21 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570"
dependencies = [
"cfg-if",
"crossbeam-utils",
"cfg-if 0.1.10",
"crossbeam-utils 0.7.2",
"maybe-uninit",
]
[[package]]
name = "crossbeam-queue"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b2a58563f049aa3bae172bc4120f093b5901161c629f280a1f40ba55317d774"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils 0.8.0",
]
[[package]]
name = "crossbeam-utils"
version = "0.7.2"
@ -753,7 +780,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8"
dependencies = [
"autocfg",
"cfg-if",
"cfg-if 0.1.10",
"lazy_static",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5"
dependencies = [
"autocfg",
"cfg-if 1.0.0",
"const_fn",
"lazy_static",
]
@ -767,12 +806,86 @@ dependencies = [
"subtle",
]
[[package]]
name = "darling"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim 0.9.3",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "data-encoding"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4d0e2d24e5ee3b23a01de38eefdcd978907890701f08ffffd4cb457ca4ee8d6"
[[package]]
name = "deadpool"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f1ea999b4bd6f33b668ea6c847823fc95105be8799c0cf3611ac4025871e004"
dependencies = [
"async-trait",
"config",
"crossbeam-queue 0.3.0",
"num_cpus",
"serde",
"tokio 0.3.4",
]
[[package]]
name = "derive_builder"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2658621297f2cf68762a6f7dc0bb7e1ff2cfd6583daef8ee0fed6f7ec468ec0"
dependencies = [
"darling",
"derive_builder_core",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "derive_builder_core"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2791ea3e372c8495c0bc2033991d76b512cd799d07491fbd6890124db9458bef"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "difference"
version = "2.0.0"
@ -833,7 +946,7 @@ version = "0.8.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8ac63f94732332f44fe654443c46f6375d1939684c17b0afb6cb56b0456e171"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
]
[[package]]
@ -907,7 +1020,7 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "766d0e77a2c1502169d4a93ff3b8c15a71fd946cd0126309752104e5f3c46d94"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"crc32fast",
"libc",
"miniz_oxide",
@ -1028,7 +1141,7 @@ dependencies = [
"futures-io",
"memchr",
"parking 2.0.0",
"pin-project-lite",
"pin-project-lite 0.1.7",
"waker-fn",
]
@ -1132,7 +1245,7 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"libc",
"wasi",
]
@ -1185,7 +1298,7 @@ dependencies = [
"http",
"indexmap",
"slab",
"tokio",
"tokio 0.2.22",
"tokio-util",
"tracing",
]
@ -1315,7 +1428,7 @@ dependencies = [
"cookie",
"http",
"infer",
"pin-project-lite",
"pin-project-lite 0.1.7",
"rand",
"serde",
"serde_json",
@ -1379,7 +1492,7 @@ dependencies = [
"pin-project",
"socket2",
"time 0.1.43",
"tokio",
"tokio 0.2.22",
"tower-service",
"tracing",
"want",
@ -1394,7 +1507,7 @@ dependencies = [
"bytes 0.5.6",
"hyper",
"native-tls",
"tokio",
"tokio 0.2.22",
"tokio-tls",
]
@ -1416,6 +1529,12 @@ dependencies = [
"unicase",
]
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.2.0"
@ -1570,7 +1689,7 @@ checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616"
dependencies = [
"arrayvec",
"bitflags",
"cfg-if",
"cfg-if 0.1.10",
"ryu",
"static_assertions",
]
@ -1587,7 +1706,7 @@ version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebf7c202c4da35d579058d845fad7b62d3aff5fcb10c5a57fc175ac632874de0"
dependencies = [
"crossbeam-utils",
"crossbeam-utils 0.7.2",
"pin-utils",
]
@ -1621,7 +1740,7 @@ version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
]
[[package]]
@ -1640,6 +1759,8 @@ dependencies = [
"anyhow",
"assert_cmd",
"check-if-email-exists",
"deadpool",
"derive_builder",
"futures",
"glob",
"headers",
@ -1656,7 +1777,7 @@ dependencies = [
"reqwest",
"serde",
"structopt",
"tokio",
"tokio 0.2.22",
"toml",
"url",
"wiremock",
@ -1744,7 +1865,7 @@ version = "0.6.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"fuchsia-zircon",
"fuchsia-zircon-sys",
"iovec",
@ -1837,7 +1958,7 @@ version = "0.2.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ba7c918ac76704fb42afcbbb43891e72731f3dcca3bef2a19786297baf14af7"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"libc",
"winapi 0.3.9",
]
@ -1953,7 +2074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d575eff3665419f9b83678ff2815858ad9d11567e082f5ac1814baba4e2bcb4"
dependencies = [
"bitflags",
"cfg-if",
"cfg-if 0.1.10",
"foreign-types",
"lazy_static",
"libc",
@ -2008,7 +2129,7 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c361aa727dd08437f2f1447be8b59a33b0edd15e0fcee698f935613d9efbca9b"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"cloudabi",
"instant",
"libc",
@ -2060,6 +2181,12 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282adbf10f2698a7a77f8e983a74b2d18176c19a7fd32a45446139ae7b02b715"
[[package]]
name = "pin-project-lite"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b063f57ec186e6140e2b8b6921e5f1bd89c7356dda5b33acc5401203ca6131c"
[[package]]
name = "pin-utils"
version = "0.1.0"
@ -2078,7 +2205,7 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e09dffb745feffca5be3dea51c02b7b368c4597ab0219a82acaf9799ab3e0d1"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"libc",
"wepoll-sys-stjepang",
"winapi 0.3.9",
@ -2090,7 +2217,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5884790f1ce3553ad55fec37b5aaac5882e0e845a2612df744d6c85c9bf046c"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"universal-hash",
]
@ -2328,11 +2455,11 @@ dependencies = [
"mime_guess",
"native-tls",
"percent-encoding",
"pin-project-lite",
"pin-project-lite 0.1.7",
"serde",
"serde_json",
"serde_urlencoded 0.6.1",
"tokio",
"tokio 0.2.22",
"tokio-socks",
"tokio-tls",
"url",
@ -2546,7 +2673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1"
dependencies = [
"block-buffer 0.9.0",
"cfg-if",
"cfg-if 0.1.10",
"cpuid-bool",
"digest 0.9.0",
"opaque-debug 0.3.0",
@ -2600,7 +2727,7 @@ version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03088793f677dce356f3ccc2edb1b314ad191ab702a5de3faf49304f7e104918"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"libc",
"redox_syscall",
"winapi 0.3.9",
@ -2688,6 +2815,12 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "strsim"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
[[package]]
name = "structopt"
version = "0.3.20"
@ -2735,7 +2868,7 @@ version = "3.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"libc",
"rand",
"redox_syscall",
@ -2880,13 +3013,23 @@ dependencies = [
"mio-named-pipes",
"mio-uds",
"num_cpus",
"pin-project-lite",
"pin-project-lite 0.1.7",
"signal-hook-registry",
"slab",
"tokio-macros",
"winapi 0.3.9",
]
[[package]]
name = "tokio"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dfe2523e6fa84ddf5e688151d4e5fddc51678de9752c6512a24714c23818d61"
dependencies = [
"autocfg",
"pin-project-lite 0.2.0",
]
[[package]]
name = "tokio-macros"
version = "0.2.5"
@ -2908,7 +3051,7 @@ dependencies = [
"either",
"futures",
"thiserror",
"tokio",
"tokio 0.2.22",
]
[[package]]
@ -2918,7 +3061,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a70f4fcd7b3b24fb194f837560168208f669ca8cb70d0c4b862944452396343"
dependencies = [
"native-tls",
"tokio",
"tokio 0.2.22",
]
[[package]]
@ -2931,8 +3074,8 @@ dependencies = [
"futures-core",
"futures-sink",
"log",
"pin-project-lite",
"tokio",
"pin-project-lite 0.1.7",
"tokio 0.2.22",
]
[[package]]
@ -2956,7 +3099,7 @@ version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d79ca061b032d6ce30c660fded31189ca0b9922bf483cd70759f13a2d86786c"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"log",
"tracing-attributes",
"tracing-core",
@ -3046,7 +3189,7 @@ dependencies = [
"rand",
"smallvec",
"thiserror",
"tokio",
"tokio 0.2.22",
"url",
]
@ -3057,7 +3200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f23cdfdc3d8300b3c50c9e84302d3bd6d860fb9529af84ace6cf9665f181b77"
dependencies = [
"backtrace",
"cfg-if",
"cfg-if 0.1.10",
"futures",
"ipconfig",
"lazy_static",
@ -3224,7 +3367,7 @@ version = "0.2.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0563a9a4b071746dd5aedbc3a28c6fe9be4586fb3fbadb67c400d4f53c6b16c"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"serde",
"serde_json",
"wasm-bindgen-macro",
@ -3251,7 +3394,7 @@ version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95f8d235a77f880bcef268d379810ea6c0af2eacfa90b1ad5af731776e0c4699"
dependencies = [
"cfg-if",
"cfg-if 0.1.10",
"js-sys",
"wasm-bindgen",
"web-sys",

View file

@ -28,6 +28,8 @@ serde = { version = "1.0", features = ["derive"] }
pulldown-cmark = "0.8.0"
quick-xml = "0.20.0"
headers = "0.3.2"
derive_builder = "0.9.0"
deadpool = "0.6.0"
[dependencies.reqwest]
features = ["gzip"]

View file

@ -1,12 +1,9 @@
use crate::{
extract::{self, Uri},
types::{Excludes, RequestMethod, Status},
};
use anyhow::{Context, Result};
use crate::types::{Excludes, Response, Status, Uri};
use anyhow::{anyhow, Context, Result};
use check_if_email_exists::{check_email, CheckEmailInput};
use derive_builder::Builder;
use headers::{HeaderMap, HeaderValue};
use hubcaps::{Credentials, Github};
use indicatif::ProgressBar;
use regex::{Regex, RegexSet};
use reqwest::header;
use std::net::IpAddr;
@ -14,46 +11,60 @@ use std::{collections::HashSet, time::Duration};
use tokio::time::delay_for;
use url::Url;
/// A link checker using an API token for Github links
/// otherwise a normal HTTP client.
pub(crate) struct Checker<'a> {
const DEFAULT_MAX_REDIRECTS: usize = 5;
#[derive(Debug, Clone)]
pub struct Client {
reqwest_client: reqwest::Client,
github: Option<Github>,
includes: Option<RegexSet>,
excludes: Excludes,
scheme: Option<String>,
method: RequestMethod,
method: reqwest::Method,
accepted: Option<HashSet<reqwest::StatusCode>>,
verbose: bool,
progress_bar: Option<&'a ProgressBar>,
}
impl<'a> Checker<'a> {
/// Creates a new link checker
// we should consider adding a config struct for this, so that the list
// of arguments is short
#[allow(clippy::too_many_arguments)]
pub fn try_new(
github_token: Option<String>,
includes: Option<RegexSet>,
excludes: Excludes,
max_redirects: usize,
user_agent: String,
allow_insecure: bool,
scheme: Option<String>,
custom_headers: HeaderMap,
method: RequestMethod,
accepted: Option<HashSet<http::StatusCode>>,
timeout: Option<Duration>,
verbose: bool,
progress_bar: Option<&'a ProgressBar>,
) -> Result<Self> {
/// Input fields for building a link checker that uses an API token for
/// Github links and otherwise a normal HTTP client.
///
/// `derive_builder` generates the builder type from this struct under the
/// name `ClientBuilder`. The generated `build` function is skipped
/// (`build_fn(skip)`), so `ClientBuilder::build` is written by hand, and
/// every generated setter accepts any value convertible via `Into`
/// (`setter(into)`).
#[derive(Builder, Debug)]
#[builder(build_fn(skip))]
#[builder(setter(into))]
#[builder(name = "ClientBuilder")]
pub struct ClientBuilderInternal {
// Github API token; `build` only creates a Github client when it is set and non-empty.
github_token: Option<String>,
// Optional include patterns, copied into the resulting `Client`.
includes: Option<RegexSet>,
// Exclusion rules, copied into the resulting `Client` (default value when unset).
excludes: Excludes,
// Redirect limit for the HTTP client; `build` falls back to `DEFAULT_MAX_REDIRECTS`.
max_redirects: usize,
// User-Agent header value; `build` falls back to "lychee/0.3.0".
user_agent: String,
// Whether invalid TLS certificates are accepted; `build` falls back to `false`.
allow_insecure: bool,
// Optional URI scheme; `build` lowercases it before storing it on the `Client`.
scheme: Option<String>,
// Extra headers that `build` adds on top of the default header set.
custom_headers: HeaderMap,
// HTTP method used for requests; `build` falls back to `GET`.
method: reqwest::Method,
// Optional set of status codes, copied into the resulting `Client`.
accepted: Option<HashSet<http::StatusCode>>,
// Optional timeout applied to the underlying `reqwest` client.
timeout: Option<Duration>,
// NOTE(review): not read by the visible `build` implementation — confirm it is still needed.
verbose: bool,
}
impl ClientBuilder {
pub fn build(&mut self) -> Result<Client> {
let mut headers = HeaderMap::new();
// Faking the user agent is necessary for some websites, unfortunately.
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
// let user_agent = self.user_agent.as_ref().unwrap_or(&"lychee/0.3.0".to_string());
let user_agent = match self.user_agent {
Some(ref u) => u.clone(),
None => String::from("lychee/0.3.0"),
};
headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?);
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
headers.extend(custom_headers);
if let Some(custom) = &self.custom_headers {
headers.extend(custom.clone());
}
let allow_insecure = self.allow_insecure.unwrap_or(false);
let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS);
let builder = reqwest::ClientBuilder::new()
.gzip(true)
@ -61,36 +72,43 @@ impl<'a> Checker<'a> {
.danger_accept_invalid_certs(allow_insecure)
.redirect(reqwest::redirect::Policy::limited(max_redirects));
let builder = match timeout {
Some(timeout) => builder.timeout(timeout),
let builder = match self.timeout {
Some(t) => builder
.timeout(t.ok_or_else(|| anyhow!("cannot parse timeout: {:?}", self.timeout))?),
None => builder,
};
let reqwest_client = builder.build()?;
let github = match github_token {
let token: Option<String> = self.github_token.clone().unwrap_or_default();
let github = match token {
Some(token) => {
let github = Github::new(user_agent, Credentials::Token(token))?;
Some(github)
if token.is_empty() {
None
} else {
let github = Github::new(user_agent, Credentials::Token(token))?;
Some(github)
}
}
None => None,
};
let scheme = self.scheme.clone().unwrap_or(None);
let scheme = scheme.map(|s| s.to_lowercase());
Ok(Checker {
Ok(Client {
reqwest_client,
github,
includes,
excludes,
includes: self.includes.clone().unwrap_or(None),
excludes: self.excludes.clone().unwrap_or_default(),
scheme,
method,
accepted,
verbose,
progress_bar,
method: self.method.clone().unwrap_or(reqwest::Method::GET),
accepted: self.accepted.clone().unwrap_or(None),
})
}
}
impl Client {
async fn check_github(&self, owner: String, repo: String) -> Status {
match &self.github {
Some(github) => {
@ -110,10 +128,9 @@ impl<'a> Checker<'a> {
}
async fn check_normal(&self, url: &Url) -> Status {
let request = match self.method {
RequestMethod::GET => self.reqwest_client.get(url.as_str()),
RequestMethod::HEAD => self.reqwest_client.head(url.as_str()),
};
let request = self
.reqwest_client
.request(self.method.clone(), url.as_str());
let res = request.send().await;
match res {
Ok(response) => Status::new(response.status(), self.accepted.clone()),
@ -233,48 +250,14 @@ impl<'a> Checker<'a> {
uri.scheme() != self.scheme
}
fn status_message(&self, status: &Status, uri: &Uri) -> Option<String> {
match status {
Status::Ok(code) => {
if self.verbose {
Some(format!("{} [{}]", uri, code))
} else {
None
}
}
Status::Failed(code) => Some(format!("🚫{} [{}]", uri, code)),
Status::Redirected => {
if self.verbose {
Some(format!("🔀️{}", uri))
} else {
None
}
}
Status::Excluded => {
if self.verbose {
Some(format!("👻{}", uri))
} else {
None
}
}
Status::Error(e) => Some(format!("{} ({})", uri, e)),
Status::Timeout => Some(format!("{}", uri)),
}
}
pub async fn check(&self, uri: &extract::Uri) -> Status {
pub async fn check(&self, uri: Uri) -> Response {
if self.excluded(&uri) {
return Status::Excluded;
return Response::new(uri, Status::Excluded);
}
if let Some(pb) = self.progress_bar {
pb.set_message(&uri.to_string());
}
let ret = match uri {
Uri::Website(url) => self.check_real(url).await,
Uri::Mail(address) => {
let valid = self.valid_mail(address).await;
let status = match uri {
Uri::Website(ref url) => self.check_real(&url).await,
Uri::Mail(ref address) => {
let valid = self.valid_mail(&address).await;
if valid {
// TODO: We should not be using a HTTP status code for mail
Status::Ok(http::StatusCode::OK)
@ -283,18 +266,7 @@ impl<'a> Checker<'a> {
}
}
};
if let Some(pb) = self.progress_bar {
pb.inc(1);
// regular println! interferes with progress bar
if let Some(message) = self.status_message(&ret, uri) {
pb.println(message);
}
} else if let Some(message) = self.status_message(&ret, uri) {
println!("{}", message);
}
ret
Response::new(uri, status)
}
}
@ -326,49 +298,28 @@ mod test {
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
fn get_checker(allow_insecure: bool, custom_headers: HeaderMap) -> Checker<'static> {
let checker = Checker::try_new(
None,
None,
Excludes::default(),
5,
"curl/7.71.1".to_string(),
allow_insecure,
None,
custom_headers,
RequestMethod::GET,
None,
None,
false,
None,
)
.unwrap();
checker
}
fn website_url(s: &str) -> Uri {
Uri::Website(Url::parse(s).expect("Expected valid Website Uri"))
}
#[tokio::test]
async fn test_nonexistent() {
let res = get_checker(false, HeaderMap::new())
.check(&website_url("https://endler.dev/abcd"))
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://endler.dev/abcd"))
.await;
assert!(matches!(res, Status::Failed(_)));
assert!(matches!(res.status, Status::Failed(_)));
}
#[tokio::test]
async fn test_exponential_backoff() {
let start = Instant::now();
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://endler.dev/abcd").unwrap(),
))
.await;
let uri = Uri::Website(Url::parse("https://endler.dev/abcd").unwrap());
let res = ClientBuilder::default().build().unwrap().check(uri).await;
let end = start.elapsed();
assert!(matches!(res, Status::Failed(_)));
assert!(matches!(res.status, Status::Failed(_)));
// on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs)
assert!(end.as_secs() >= 7);
@ -378,7 +329,9 @@ mod test {
#[test]
fn test_is_github() {
assert_eq!(
get_checker(false, HeaderMap::new())
ClientBuilder::default()
.build()
.unwrap()
.extract_github("https://github.com/mre/idiomatic-rust")
.unwrap(),
("mre".into(), "idiomatic-rust".into())
@ -387,61 +340,80 @@ mod test {
#[tokio::test]
async fn test_github() {
assert!(matches!(
get_checker(false, HeaderMap::new())
.check(&website_url("https://github.com/mre/idiomatic-rust"))
.await,
ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://github.com/mre/idiomatic-rust"))
.await
.status,
Status::Ok(_)
));
}
#[tokio::test]
async fn test_github_nonexistent() {
let res = get_checker(false, HeaderMap::new())
.check(&website_url(
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url(
"https://github.com/mre/idiomatic-rust-doesnt-exist-man",
))
.await;
.await
.status;
assert!(matches!(res, Status::Error(_)));
}
#[tokio::test]
async fn test_non_github() {
let res = get_checker(false, HeaderMap::new())
.check(&website_url("https://endler.dev"))
.await;
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://endler.dev"))
.await
.status;
assert!(matches!(res, Status::Ok(_)));
}
#[tokio::test]
async fn test_invalid_ssl() {
let res = get_checker(false, HeaderMap::new())
.check(&website_url("https://expired.badssl.com/"))
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://expired.badssl.com/"))
.await;
assert!(matches!(res, Status::Error(_)));
assert!(matches!(res.status, Status::Error(_)));
// Same, but ignore certificate error
let res = get_checker(true, HeaderMap::new())
.check(&website_url("https://expired.badssl.com/"))
let res = ClientBuilder::default()
.allow_insecure(true)
.build()
.unwrap()
.check(website_url("https://expired.badssl.com/"))
.await;
assert!(matches!(res, Status::Ok(_)));
assert!(matches!(res.status, Status::Ok(_)));
}
#[tokio::test]
async fn test_custom_headers() {
let res = get_checker(false, HeaderMap::new())
.check(&website_url("https://crates.io/keywords/cassandra"))
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://crates.io/keywords/cassandra"))
.await;
assert!(matches!(res, Status::Failed(StatusCode::NOT_FOUND)));
assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND)));
// Try again, but with a custom header.
// For example, crates.io requires a custom accept header.
// See https://github.com/rust-lang/crates.io/issues/788
let mut custom = HeaderMap::new();
custom.insert(header::ACCEPT, "text/html".parse().unwrap());
let res = get_checker(true, custom)
.check(&website_url("https://crates.io/keywords/cassandra"))
let res = ClientBuilder::default()
.custom_headers(custom)
.build()
.unwrap()
.check(website_url("https://crates.io/keywords/cassandra"))
.await;
assert!(matches!(res, Status::Ok(_)));
assert!(matches!(res.status, Status::Ok(_)));
}
#[tokio::test]
@ -460,55 +432,29 @@ mod test {
.mount(&mock_server)
.await;
let checker = Checker::try_new(
None,
None,
Excludes::default(),
5,
"curl/7.71.1".to_string(),
true,
None,
HeaderMap::new(),
RequestMethod::GET,
None,
Some(checker_timeout),
false,
None,
)
.expect("Expected successful instantiation");
let client = ClientBuilder::default()
.timeout(checker_timeout)
.build()
.unwrap();
let resp = checker
.check(&Uri::Website(Url::parse(&mock_server.uri()).unwrap()))
let resp = client
.check(Uri::Website(Url::parse(&mock_server.uri()).unwrap()))
.await;
assert!(matches!(resp, Status::Timeout));
assert!(matches!(resp.status, Status::Timeout));
}
#[tokio::test]
async fn test_include_regex() {
let includes = Some(RegexSet::new(&[r"foo.github.com"]).unwrap());
let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
let client = ClientBuilder::default().includes(includes).build().unwrap();
let checker = Checker::try_new(
None,
includes,
Excludes::default(),
5,
"curl/7.71.1".to_string(),
true,
None,
HeaderMap::new(),
RequestMethod::GET,
None,
None,
false,
None,
)
.unwrap();
assert_eq!(
checker.excluded(&website_url("https://foo.github.com")),
client.excluded(&website_url("https://foo.github.com")),
false
);
assert_eq!(
checker.excluded(&website_url("https://bar.github.com")),
client.excluded(&website_url("https://bar.github.com")),
true
);
}
@ -517,31 +463,21 @@ mod test {
async fn test_exclude_include_regex() {
let mut excludes = Excludes::default();
excludes.regex = Some(RegexSet::new(&[r"github.com"]).unwrap());
let includes = Some(RegexSet::new(&[r"foo.github.com"]).unwrap());
let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
let client = ClientBuilder::default()
.includes(includes)
.excludes(excludes)
.build()
.unwrap();
let checker = Checker::try_new(
None,
includes,
excludes,
5,
"curl/7.71.1".to_string(),
true,
None,
HeaderMap::new(),
RequestMethod::GET,
None,
None,
false,
None,
)
.unwrap();
assert_eq!(
checker.excluded(&website_url("https://foo.github.com")),
client.excluded(&website_url("https://foo.github.com")),
false
);
assert_eq!(checker.excluded(&website_url("https://github.com")), true);
assert_eq!(client.excluded(&website_url("https://github.com")), true);
assert_eq!(
checker.excluded(&website_url("https://bar.github.com")),
client.excluded(&website_url("https://bar.github.com")),
true
);
}
@ -552,30 +488,16 @@ mod test {
excludes.regex =
Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap());
let checker = Checker::try_new(
None,
None,
excludes,
5,
"curl/7.71.1".to_string(),
true,
None,
HeaderMap::new(),
RequestMethod::GET,
None,
None,
false,
None,
)
.unwrap();
assert_eq!(checker.excluded(&website_url("http://github.com")), true);
assert_eq!(checker.excluded(&website_url("http://exclude.org")), true);
let client = ClientBuilder::default().excludes(excludes).build().unwrap();
assert_eq!(client.excluded(&website_url("http://github.com")), true);
assert_eq!(client.excluded(&website_url("http://exclude.org")), true);
assert_eq!(
checker.excluded(&Uri::Mail("mail@example.com".to_string())),
client.excluded(&Uri::Mail("mail@example.com".to_string())),
true
);
assert_eq!(
checker.excluded(&Uri::Mail("foo@bar.dev".to_string())),
client.excluded(&Uri::Mail("foo@bar.dev".to_string())),
false
);
}
@ -610,57 +532,57 @@ mod test {
#[test]
fn test_excludes_no_private_ips_by_default() {
let checker = get_checker(false, HeaderMap::new());
let client = ClientBuilder::default().build().unwrap();
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_A)), false);
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_B)), false);
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_C)), false);
assert_eq!(checker.excluded(&website_url(V4_LINK_LOCAL)), false);
assert_eq!(checker.excluded(&website_url(V4_LOOPBACK)), false);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false);
assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false);
assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false);
assert_eq!(checker.excluded(&website_url(V6_LOOPBACK)), false);
assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false);
}
#[test]
fn test_exclude_private() {
let mut checker = get_checker(false, HeaderMap::new());
checker.excludes.private_ips = true;
let mut client = ClientBuilder::default().build().unwrap();
client.excludes.private_ips = true;
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_A)), true);
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_B)), true);
assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_C)), true);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true);
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true);
}
#[test]
fn test_exclude_link_local() {
let mut checker = get_checker(false, HeaderMap::new());
checker.excludes.link_local_ips = true;
let mut client = ClientBuilder::default().build().unwrap();
client.excludes.link_local_ips = true;
assert_eq!(checker.excluded(&website_url(V4_LINK_LOCAL)), true);
assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true);
}
#[test]
fn test_exclude_loopback() {
let mut checker = get_checker(false, HeaderMap::new());
checker.excludes.loopback_ips = true;
let mut client = ClientBuilder::default().build().unwrap();
client.excludes.loopback_ips = true;
assert_eq!(checker.excluded(&website_url(V4_LOOPBACK)), true);
assert_eq!(checker.excluded(&website_url(V6_LOOPBACK)), true);
assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true);
assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true);
}
#[test]
fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
let mut checker = get_checker(false, HeaderMap::new());
checker.excludes.private_ips = true;
checker.excludes.link_local_ips = true;
let mut client = ClientBuilder::default().build().unwrap();
client.excludes.private_ips = true;
client.excludes.link_local_ips = true;
// if these were pure IPv4, we would exclude
assert_eq!(
checker.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)),
client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)),
false
);
assert_eq!(
checker.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)),
client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)),
false
);
}

33
src/client_pool.rs Normal file
View file

@ -0,0 +1,33 @@
use client::Client;
use deadpool::unmanaged::Pool;
use tokio::sync::mpsc;
use crate::{client, types};
pub struct ClientPool {
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Uri>,
pool: deadpool::unmanaged::Pool<client::Client>,
}
impl ClientPool {
pub fn new(
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Uri>,
clients: Vec<Client>,
) -> Self {
let pool = Pool::from(clients);
ClientPool { tx, rx, pool }
}
pub async fn listen(&mut self) {
while let Some(req) = self.rx.recv().await {
let client = self.pool.get().await;
let mut tx = self.tx.clone();
tokio::spawn(async move {
let resp = client.check(req).await;
tx.send(resp).await.unwrap();
});
}
}
}

View file

@ -1,21 +1,17 @@
use crate::extract::{self, extract_links, FileType};
use crate::extract::{extract_links, FileType};
use crate::types::Uri;
use anyhow::Result;
use extract::Uri;
use glob::glob;
use reqwest::Url;
use std::path::Path;
use std::{collections::HashSet, fs};
use std::{ffi::OsStr, path::Path};
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn resolve_file_type_by_path<P: AsRef<Path>>(p: P) -> FileType {
let path = p.as_ref();
match path.extension() {
Some(ext) => match ext.to_str().unwrap() {
"md" => FileType::Markdown,
"html" | "htm" => FileType::HTML,
_ => FileType::Plaintext,
},
None => FileType::Plaintext,
fn resolve_file_type_by_path<P: AsRef<Path>>(path: P) -> FileType {
    // The extension is compared as UTF-8 text; a missing extension or one
    // that is not valid UTF-8 falls through to the plaintext default instead
    // of panicking.
    match path.as_ref().extension().and_then(OsStr::to_str) {
        Some("md") => FileType::Markdown,
        // Both spellings are HTML; `.htm` was recognised by the previous
        // implementation and must keep working.
        Some("html") | Some("htm") => FileType::HTML,
        _ => FileType::Plaintext,
    }
}
@ -57,7 +53,7 @@ pub(crate) async fn collect_links(
base_url.clone(),
));
}
Err(e) => println!("{:?}", e),
Err(e) => println!("Error handling file pattern {}: {:?}", input, e),
}
}
}

View file

@ -1,17 +1,11 @@
use crate::types::Uri;
use linkify::LinkFinder;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use quick_xml::{events::Event as HTMLEvent, Reader};
use std::net::IpAddr;
use std::collections::HashSet;
use std::path::Path;
use std::{collections::HashSet, fmt::Display};
use url::Url;
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum Uri {
Website(Url),
Mail(String),
}
#[derive(Clone, Debug)]
pub(crate) enum FileType {
HTML,
@ -19,39 +13,6 @@ pub(crate) enum FileType {
Plaintext,
}
impl Uri {
pub fn as_str(&self) -> &str {
match self {
Uri::Website(url) => url.as_str(),
Uri::Mail(address) => address.as_str(),
}
}
pub fn scheme(&self) -> Option<String> {
match self {
Uri::Website(url) => Some(url.scheme().to_string()),
Uri::Mail(_address) => None,
}
}
pub fn host_ip(&self) -> Option<IpAddr> {
match self {
Self::Website(url) => match url.host()? {
url::Host::Ipv4(v4_addr) => Some(v4_addr.into()),
url::Host::Ipv6(v6_addr) => Some(v6_addr.into()),
_ => None,
},
Self::Mail(_) => None,
}
}
}
impl Display for Uri {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.as_str())
}
}
// Use LinkFinder here to offload the actual link searching
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
@ -168,7 +129,7 @@ pub(crate) fn extract_links(
uris.insert(Uri::Mail(link));
} else if !Path::new(&link).exists() {
if let Some(base_url) = &base_url {
if let Ok(new_url) = base_url.clone().join(&link) {
if let Ok(new_url) = base_url.join(&link) {
uris.insert(Uri::Website(new_url));
}
}
@ -176,8 +137,6 @@ pub(crate) fn extract_links(
}
};
}
debug!("Found: {:#?}", uris);
uris
}
@ -185,7 +144,6 @@ pub(crate) fn extract_links(
mod test {
use super::*;
use std::iter::FromIterator;
use std::net::{Ipv4Addr, Ipv6Addr};
#[test]
fn test_extract_markdown_links() {
@ -277,30 +235,4 @@ mod test {
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
}
#[test]
fn test_uri_host_ip_v4() {
let uri =
Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4"));
let ip = uri.host_ip().expect("Expected a valid IPv4");
assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
}
#[test]
fn test_uri_host_ip_v6() {
let uri =
Uri::Website(Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6"));
let ip = uri.host_ip().expect("Expected a valid IPv6");
assert_eq!(
ip,
IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10))
);
}
#[test]
fn test_uri_host_ip_no_ip() {
let uri = Uri::Website(Url::parse("https://some.cryptic/url").expect("Expected valid URI"));
let ip = uri.host_ip();
assert!(ip.is_none());
}
}

View file

@ -2,23 +2,28 @@
extern crate log;
use anyhow::{anyhow, Result};
use futures::future::join_all;
use headers::authorization::Basic;
use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName};
use indicatif::{ProgressBar, ProgressStyle};
use regex::RegexSet;
use std::{collections::HashSet, convert::TryInto, time::Duration};
use std::str::FromStr;
use std::{collections::HashSet, time::Duration};
use structopt::StructOpt;
use tokio::sync::mpsc;
mod checker;
mod client;
mod client_pool;
mod collector;
mod extract;
mod options;
mod stats;
mod types;
use checker::Checker;
use extract::Uri;
use client::ClientBuilder;
use client_pool::ClientPool;
use options::{Config, LycheeOptions};
use stats::ResponseStats;
use types::Response;
use types::{Excludes, Status};
/// A C-like enum that can be cast to `i32` and used as process exit code.
@ -32,27 +37,6 @@ enum ExitCode {
LinkCheckFailure = 2,
}
fn print_summary(found: &HashSet<Uri>, results: &[Status]) {
let found = found.len();
let excluded: usize = results
.iter()
.filter(|l| matches!(l, Status::Excluded))
.count();
let success: usize = results
.iter()
.filter(|l| matches!(l, Status::Ok(_)))
.count();
let errors: usize = found - excluded - success;
println!();
println!("📝Summary");
println!("-------------------");
println!("🔍Found: {}", found);
println!("👻Excluded: {}", excluded);
println!("✅Successful: {}", success);
println!("🚫Errors: {}", errors);
}
fn main() -> Result<()> {
pretty_env_logger::init();
let opts = LycheeOptions::from_args();
@ -79,23 +63,50 @@ fn main() -> Result<()> {
std::process::exit(errorcode);
}
async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
let includes = RegexSet::new(&cfg.include).ok();
let excludes = Excludes::from_options(&cfg);
let mut headers = parse_headers(cfg.headers)?;
/// Reports one finished response to the user.
///
/// When a progress bar is present it is advanced by one step and any status
/// message is printed through the bar; without a bar the message goes to
/// stdout. Nothing is printed when `status_message` yields `None`.
fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbose: bool) {
    let message = status_message(response, verbose);
    match progress_bar {
        Some(bar) => {
            bar.inc(1);
            // regular println! interferes with progress bar
            if let Some(text) = message {
                bar.println(text);
            }
        }
        None => {
            if let Some(text) = message {
                println!("{}", text);
            }
        }
    }
}
if let Some(auth) = cfg.basic_auth {
async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
let mut headers = parse_headers(&cfg.headers)?;
if let Some(auth) = &cfg.basic_auth {
let auth_header = parse_basic_auth(&auth)?;
headers.typed_insert(auth_header);
}
let accepted = match cfg.accept {
Some(accept) => parse_statuscodes(accept)?,
None => None,
};
let timeout = parse_timeout(cfg.timeout)?;
let links = collector::collect_links(inputs, cfg.base_url).await?;
let progress_bar = if cfg.progress {
let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok());
let timeout = parse_timeout(&cfg.timeout)?;
let max_concurrency = cfg.max_concurrency.parse()?;
let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?;
let includes = RegexSet::new(&cfg.include)?;
let excludes = Excludes::from_options(&cfg);
let client = ClientBuilder::default()
.includes(includes)
.excludes(excludes)
.max_redirects(cfg.max_redirects)
.user_agent(cfg.user_agent)
.allow_insecure(cfg.insecure)
.custom_headers(headers)
.method(method)
.timeout(timeout)
.verbose(cfg.verbose)
.github_token(cfg.github_token)
.scheme(cfg.scheme)
.accepted(accepted)
.build()?;
let links = collector::collect_links(inputs, cfg.base_url.clone()).await?;
let pb = if cfg.progress {
Some(
ProgressBar::new(links.len() as u64)
.with_style(
@ -107,43 +118,51 @@ async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
} else {
None
};
let checker = Checker::try_new(
cfg.github_token,
includes,
excludes,
cfg.max_redirects,
cfg.user_agent,
cfg.insecure,
cfg.scheme,
headers,
cfg.method.try_into()?,
accepted,
Some(timeout),
cfg.verbose,
progress_bar.as_ref(),
)?;
let futures: Vec<_> = links.iter().map(|l| checker.check(l)).collect();
let results = join_all(futures).await;
let (mut send_req, recv_req) = mpsc::channel(max_concurrency);
let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency);
// note that prints may interfere progress bar so this must go before summary
if let Some(progress_bar) = progress_bar {
progress_bar.finish_and_clear();
let mut stats = ResponseStats::new();
let bar = pb.clone();
tokio::spawn(async move {
for link in links {
if let Some(pb) = &bar {
pb.set_message(&link.to_string());
};
send_req.send(link).await.unwrap();
}
});
tokio::spawn(async move {
// Start receiving requests
let clients: Vec<_> = (0..max_concurrency).map(|_| client.clone()).collect();
let mut clients = ClientPool::new(send_resp, recv_req, clients);
clients.listen().await;
});
while let Some(response) = recv_resp.recv().await {
show_progress(&pb, &response, cfg.verbose);
stats.add(response);
}
// Note that print statements may interfere with the progress bar, so this
// must go before printing the stats
if let Some(pb) = &pb {
pb.finish_and_clear();
}
if cfg.verbose {
print_summary(&links, &results);
println!("\n{}", stats);
}
let success = results.iter().all(|r| r.is_success() || r.is_excluded());
match success {
match stats.is_success() {
true => Ok(ExitCode::Success as i32),
false => Ok(ExitCode::LinkCheckFailure as i32),
}
}
fn read_header(input: String) -> Result<(String, String)> {
fn read_header(input: &str) -> Result<(String, String)> {
let elements: Vec<_> = input.split('=').collect();
if elements.len() != 2 {
return Err(anyhow!(
@ -154,14 +173,14 @@ fn read_header(input: String) -> Result<(String, String)> {
Ok((elements[0].into(), elements[1].into()))
}
fn parse_timeout(timeout: String) -> Result<Duration> {
Ok(Duration::from_secs(timeout.parse::<u64>()?))
fn parse_timeout<S: AsRef<str>>(timeout: S) -> Result<Duration> {
    // The timeout is given as a whole number of seconds; anything that does
    // not parse as `u64` is reported as an error.
    let seconds: u64 = timeout.as_ref().parse()?;
    Ok(Duration::from_secs(seconds))
}
fn parse_headers(headers: Vec<String>) -> Result<HeaderMap> {
fn parse_headers<T: AsRef<str>>(headers: &[T]) -> Result<HeaderMap> {
let mut out = HeaderMap::new();
for header in headers {
let (key, val) = read_header(header)?;
let (key, val) = read_header(header.as_ref())?;
out.insert(
HeaderName::from_bytes(key.as_bytes())?,
val.parse().unwrap(),
@ -170,13 +189,13 @@ fn parse_headers(headers: Vec<String>) -> Result<HeaderMap> {
Ok(out)
}
fn parse_statuscodes(accept: String) -> Result<Option<HashSet<http::StatusCode>>> {
fn parse_statuscodes<T: AsRef<str>>(accept: T) -> Result<HashSet<http::StatusCode>> {
let mut statuscodes = HashSet::new();
for code in accept.split(',').into_iter() {
for code in accept.as_ref().split(',').into_iter() {
let code: reqwest::StatusCode = reqwest::StatusCode::from_bytes(code.as_bytes())?;
statuscodes.insert(code);
}
Ok(Some(statuscodes))
Ok(statuscodes)
}
fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
@ -190,6 +209,18 @@ fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
Ok(Authorization::basic(params[0], params[1]))
}
fn status_message(response: &Response, verbose: bool) -> Option<String> {
match &response.status {
Status::Ok(code) if verbose => Some(format!("{} [{}]", response.uri, code)),
Status::Redirected if verbose => Some(format!("🔀️ {}", response.uri)),
Status::Excluded if verbose => Some(format!("👻 {}", response.uri)),
Status::Failed(code) => Some(format!("🚫 {} [{}]", response.uri, code)),
Status::Error(e) => Some(format!("{} ({})", response.uri, e)),
Status::Timeout => Some(format!("{}", response.uri)),
_ => None,
}
}
#[cfg(test)]
mod test {
use super::*;
@ -200,25 +231,20 @@ mod test {
fn test_parse_custom_headers() {
let mut custom = HeaderMap::new();
custom.insert(header::ACCEPT, "text/html".parse().unwrap());
assert_eq!(
parse_headers(vec!["accept=text/html".into()]).unwrap(),
custom
);
assert_eq!(parse_headers(&["accept=text/html"]).unwrap(), custom);
}
#[test]
fn test_parse_statuscodes() {
let actual = parse_statuscodes("200,204,301".into()).unwrap();
let expected: Option<HashSet<StatusCode>> = Some(
[
StatusCode::OK,
StatusCode::NO_CONTENT,
StatusCode::MOVED_PERMANENTLY,
]
.iter()
.cloned()
.collect(),
);
let actual = parse_statuscodes("200,204,301").unwrap();
let expected: HashSet<StatusCode> = [
StatusCode::OK,
StatusCode::NO_CONTENT,
StatusCode::MOVED_PERMANENTLY,
]
.iter()
.cloned()
.collect();
assert_eq!(actual, expected);
}

View file

@ -6,6 +6,7 @@ use structopt::StructOpt;
const USER_AGENT: &str = "curl/7.71.1";
const METHOD: &str = "get";
const TIMEOUT: &str = "20";
const MAX_CONCURRENCY: &str = "128";
// Macro for generating default functions to be used by serde
macro_rules! default_function {
@ -30,10 +31,7 @@ macro_rules! fold_in {
}
#[derive(Debug, StructOpt)]
#[structopt(
name = "lychee",
about = "A boring link checker for my projects (and maybe yours)"
)]
#[structopt(name = "lychee", about = "A glorious link checker")]
pub(crate) struct LycheeOptions {
/// Input files
pub inputs: Vec<String>,
@ -47,7 +45,7 @@ pub(crate) struct LycheeOptions {
}
#[derive(Debug, Deserialize, StructOpt)]
pub(crate) struct Config {
pub struct Config {
/// Verbose program output
#[structopt(short, long)]
#[serde(default)]
@ -63,6 +61,11 @@ pub(crate) struct Config {
#[serde(default)]
pub max_redirects: usize,
/// Maximum number of concurrent network requests
#[structopt(long, default_value = MAX_CONCURRENCY)]
#[serde(default)]
pub max_concurrency: String,
/// Number of threads to utilize.
/// Defaults to number of cores available to the system
#[structopt(short = "T", long)]
@ -131,7 +134,8 @@ pub(crate) struct Config {
pub timeout: String,
/// Request method
#[structopt(short = "M", long, default_value = METHOD)]
// Using `-X` as a short param similar to curl
#[structopt(short = "X", long, default_value = METHOD)]
#[serde(default = "method")]
pub method: String,
@ -139,7 +143,7 @@ pub(crate) struct Config {
#[serde(default)]
pub base_url: Option<String>,
#[structopt(long, help = "Basic autentication support. Ex 'username:password'")]
#[structopt(long, help = "Basic authentication support. Ex 'username:password'")]
#[serde(default)]
pub basic_auth: Option<String>,
@ -185,6 +189,7 @@ impl Config {
verbose: false;
progress: false;
max_redirects: 10;
max_concurrency: MAX_CONCURRENCY;
threads: None;
user_agent: USER_AGENT;
insecure: false;

64
src/stats.rs Normal file
View file

@ -0,0 +1,64 @@
use std::{
collections::HashSet,
fmt::{self, Display},
};
use crate::types::Response;
use crate::types::Status::*;
use crate::types::Uri;
pub struct ResponseStats {
total: usize,
successful: usize,
failures: HashSet<Uri>,
timeouts: HashSet<Uri>,
redirects: HashSet<Uri>,
excludes: HashSet<Uri>,
errors: HashSet<Uri>,
}
impl ResponseStats {
pub fn new() -> Self {
ResponseStats {
total: 0,
successful: 0,
failures: HashSet::new(),
timeouts: HashSet::new(),
redirects: HashSet::new(),
excludes: HashSet::new(),
errors: HashSet::new(),
}
}
pub fn add(&mut self, response: Response) {
self.total += 1;
let uri = response.uri;
if !match response.status {
Failed(_) => self.failures.insert(uri),
Timeout => self.timeouts.insert(uri),
Redirected => self.redirects.insert(uri),
Excluded => self.excludes.insert(uri),
Error(_) => self.errors.insert(uri),
_ => false,
} {
self.successful += 1;
}
}
pub fn is_success(&self) -> bool {
self.total == self.successful + self.excludes.len()
}
}
/// Renders the tally as a multi-line summary, one counter per line.
impl Display for ResponseStats {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let timed_out = self.timeouts.len();
        let redirected = self.redirects.len();
        let excluded = self.excludes.len();
        // "Errors" lumps low-level errors and failed HTTP responses together.
        let problems = self.errors.len() + self.failures.len();

        writeln!(f, "📝 Summary")?;
        writeln!(f, "-------------------")?;
        writeln!(f, "🔍 Total: {}", self.total)?;
        writeln!(f, "✅ Successful: {}", self.successful)?;
        writeln!(f, "⏳ Timeouts: {}", timed_out)?;
        writeln!(f, "🔀 Redirected: {}", redirected)?;
        writeln!(f, "👻 Excluded: {}", excluded)?;
        // The last write doubles as the return value; a bare `Ok(())` would
        // not name `Result::Ok` here because of the `Status::*` glob import.
        writeln!(f, "🚫 Errors: {}", problems)
    }
}

View file

@ -1,8 +1,48 @@
use crate::options::Config;
use anyhow::anyhow;
use std::{collections::HashSet, convert::TryFrom};
use regex::RegexSet;
use std::net::IpAddr;
use std::{collections::HashSet, convert::TryFrom, fmt::Display};
use url::Url;
/// A link target: either a website URL or a mail address.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Uri {
    /// A parsed URL.
    Website(Url),
    /// A mail address, kept as a plain string.
    Mail(String),
}

impl Uri {
    /// Returns the textual form of the URL or mail address.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Website(url) => url.as_str(),
            Self::Mail(address) => address.as_str(),
        }
    }

    /// Returns the URL scheme for websites; mail addresses have none.
    pub fn scheme(&self) -> Option<String> {
        if let Self::Website(url) = self {
            Some(url.scheme().to_string())
        } else {
            None
        }
    }

    /// Returns the host as an IP address when the website URL's host is an
    /// IPv4 or IPv6 literal; `None` for any other host, for URLs without a
    /// host, and for mail addresses.
    pub fn host_ip(&self) -> Option<IpAddr> {
        let url = match self {
            Self::Website(url) => url,
            Self::Mail(_) => return None,
        };
        match url.host()? {
            url::Host::Ipv4(addr) => Some(IpAddr::V4(addr)),
            url::Host::Ipv6(addr) => Some(IpAddr::V6(addr)),
            _ => None,
        }
    }
}
/// Displays a `Uri` exactly as `as_str` returns it.
impl Display for Uri {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Specifies how requests to websites will be made
pub(crate) enum RequestMethod {
@ -21,14 +61,32 @@ impl TryFrom<String> for RequestMethod {
}
}
/// Outcome of checking a single URI.
#[derive(Debug)]
pub struct Response {
    /// The URI that was checked.
    pub uri: Uri,
    /// The status recorded for that URI.
    pub status: Status,
}

impl Response {
    /// Pairs a URI with its status.
    pub fn new(uri: Uri, status: Status) -> Self {
        Self { uri, status }
    }
}
/// Response status of the request.
///
/// Only the `Ok` variant is treated as a success by `Status::is_success`.
#[derive(Debug)]
pub enum Status {
/// Request was successful; carries the HTTP status code
Ok(http::StatusCode),
/// Request failed with HTTP error code; carries that status code
Failed(http::StatusCode),
/// Request timed out
Timeout,
/// Got redirected to different resource
Redirected,
/// Resource was excluded from checking
Excluded,
/// Low-level error while loading resource; carries the error text
Error(String),
}
@ -51,10 +109,6 @@ impl Status {
pub fn is_success(&self) -> bool {
matches!(self, Status::Ok(_))
}
pub fn is_excluded(&self) -> bool {
matches!(self, Status::Excluded)
}
}
impl From<reqwest::Error> for Status {
@ -69,7 +123,8 @@ impl From<reqwest::Error> for Status {
/// Exclude configuration for the link checker.
/// You can ignore links based on
pub(crate) struct Excludes {
#[derive(Clone, Debug)]
pub struct Excludes {
pub regex: Option<RegexSet>,
/// Example: 192.168.0.1
pub private_ips: bool,
@ -105,3 +160,35 @@ impl Default for Excludes {
}
}
}
#[cfg(test)]
mod test {
    use super::*;
    use std::net::{Ipv4Addr, Ipv6Addr};

    /// An IPv4 literal host is returned as an IPv4 address.
    #[test]
    fn test_uri_host_ip_v4() {
        let url = Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4");
        let ip = Uri::Website(url)
            .host_ip()
            .expect("Expected a valid IPv4");
        let expected: IpAddr = Ipv4Addr::new(127, 0, 0, 1).into();
        assert_eq!(ip, expected);
    }

    /// A bracketed IPv6 literal host is returned as an IPv6 address.
    #[test]
    fn test_uri_host_ip_v6() {
        let url = Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6");
        let ip = Uri::Website(url)
            .host_ip()
            .expect("Expected a valid IPv6");
        let expected: IpAddr = Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10).into();
        assert_eq!(ip, expected);
    }

    /// A domain-name host yields no IP address.
    #[test]
    fn test_uri_host_ip_no_ip() {
        let url = Url::parse("https://some.cryptic/url").expect("Expected valid URI");
        assert!(Uri::Website(url).host_ip().is_none());
    }
}

View file

@ -22,7 +22,7 @@ mod cli {
.arg(test_all_private_path)
.assert()
.success()
.stdout(contains("Found: 7"))
.stdout(contains("Total: 7"))
.stdout(contains("Excluded: 7"))
.stdout(contains("Successful: 0"))
.stdout(contains("Errors: 0"));
@ -44,26 +44,12 @@ mod cli {
.arg(test_github_path)
.assert()
.success()
.stdout(contains("Found: 1"))
.stdout(contains("Total: 1"))
.stdout(contains("Excluded: 0"))
.stdout(contains("Successful: 1"))
.stdout(contains("Errors: 0"));
}
#[test]
fn test_failure_invalid_method() {
let mut cmd =
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");
cmd.arg("--method=invalid-method")
.assert()
.failure()
.code(1)
.stderr(contains(
"Error: Only `get` and `head` allowed, got invalid-method",
));
}
#[test]
fn test_failure_404_link() {
let mut cmd =
@ -90,6 +76,7 @@ mod cli {
.join("TEST_GITHUB_404.md");
cmd.arg(test_github_404_path)
.env_clear()
.assert()
.failure()
.code(2)