From b0f7a805efec7867c2f4599653aa027b71c5b024 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 24 Nov 2020 21:30:06 +0100 Subject: [PATCH] Use builder pattern and channels (fixes #12) (#33) This implements a basic builder for the Checker struct as discussed in #12. It is using derive_builder and uses a custom build method to instantiate the more elaborate fields like reqwest::Client. It also adds deadpool and tokio::mpsc as dependencies to handle a pool of clients to query websites. --- Cargo.lock | 253 +++++++++++++++----- Cargo.toml | 2 + src/{checker.rs => client.rs} | 430 ++++++++++++++-------------------- src/client_pool.rs | 33 +++ src/collector.rs | 22 +- src/extract.rs | 74 +----- src/main.rs | 196 +++++++++------- src/options.rs | 19 +- src/stats.rs | 64 +++++ src/types.rs | 101 +++++++- tests/cli.rs | 19 +- 11 files changed, 705 insertions(+), 508 deletions(-) rename src/{checker.rs => client.rs} (57%) create mode 100644 src/client_pool.rs create mode 100644 src/stats.rs diff --git a/Cargo.lock b/Cargo.lock index bcdabbe..c1c8260 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "async-channel" -version = "1.4.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43de69555a39d52918e2bc33a408d3c0a86c829b212d898f4ca25d21a6387478" +checksum = "59740d83946db6a5af71ae25ddf9562c2b176b2ca42cf99a455f09f4a220d6b9" dependencies = [ "concurrent-queue", "event-listener", @@ -165,7 +165,7 @@ dependencies = [ "flate2", "futures-core", "memchr", - "pin-project-lite", + "pin-project-lite 0.1.7", ] [[package]] @@ -195,7 +195,7 @@ dependencies = [ "httparse", "lazy_static", "log", - "pin-project-lite", + "pin-project-lite 0.1.7", ] [[package]] @@ -204,7 +204,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ae22a338d28c75b53702b66f77979062cb29675db376d99e451af4fa79dedb3" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "concurrent-queue", "futures-lite", "libc", @@ -273,7 +273,7 @@ dependencies = [ "async-mutex", "async-task", "blocking", - "crossbeam-utils", + "crossbeam-utils 0.7.2", "futures-channel", "futures-core", "futures-io", @@ -284,7 +284,7 @@ dependencies = [ "memchr", "num_cpus", "once_cell", - "pin-project-lite", + "pin-project-lite 0.1.7", "pin-utils", "slab", "wasm-bindgen-futures", @@ -349,7 +349,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46254cf2fdcdf1badb5934448c1bcbe046a56537b3987d96c51a7afc5d03f293" dependencies = [ "addr2line", - "cfg-if", + "cfg-if 0.1.10", "libc", "miniz_oxide", "object", @@ -383,7 +383,7 @@ dependencies = [ "anyhow", "async-mutex", "bastion-executor 0.4.0", - "crossbeam-queue", + "crossbeam-queue 0.2.3", "futures", "futures-timer", "fxhash", @@ -409,7 +409,7 @@ dependencies = [ "bastion-utils", "crossbeam-channel", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.7.2", "futures-timer", "lazy_static", "libc", @@ -429,8 +429,8 @@ dependencies = [ "bastion-utils", "crossbeam-channel", "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", + "crossbeam-queue 0.2.3", + "crossbeam-utils 0.7.2", "futures-timer", "lazy_static", "lever", @@ -525,7 +525,7 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38e98299d518ec351ca016363e0cbfc77059dcd08dfa9700d15e405536097a" dependencies = [ - "crossbeam-queue", + "crossbeam-queue 0.2.3", "stable_deref_trait", ] @@ -575,6 +575,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "check-if-email-exists" version = "0.8.13" @@ -615,7 +621,7 @@ dependencies = [ "ansi_term 0.11.0", "atty", "bitflags", - "strsim", + "strsim 0.8.0", "textwrap", "unicode-width", "vec_map", @@ -639,6 +645,17 @@ dependencies = [ "cache-padded", ] +[[package]] +name = "config" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b076e143e1d9538dde65da30f8481c2a6c44040edb8e02b9bf1351edb92ce3" +dependencies = [ + "lazy_static", + "nom", + "serde", +] + [[package]] name = "console" version = "0.12.0" @@ -707,7 +724,7 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", ] [[package]] @@ -716,8 +733,8 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ee0cc8804d5393478d743b035099520087a5186f3b93fa58cec08fa62407b6" dependencies = [ - "cfg-if", - "crossbeam-utils", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", ] [[package]] @@ -727,8 +744,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" dependencies = [ "autocfg", - "cfg-if", - "crossbeam-utils", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", "lazy_static", "maybe-uninit", "memoffset", @@ -741,11 +758,21 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" dependencies = [ - "cfg-if", - "crossbeam-utils", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", "maybe-uninit", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b2a58563f049aa3bae172bc4120f093b5901161c629f280a1f40ba55317d774" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.0", +] + [[package]] name = "crossbeam-utils" version = "0.7.2" @@ -753,7 +780,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" dependencies = [ "autocfg", - "cfg-if", + "cfg-if 0.1.10", + "lazy_static", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5" +dependencies = [ + "autocfg", + "cfg-if 1.0.0", + "const_fn", "lazy_static", ] @@ -767,12 +806,86 @@ dependencies = [ "subtle", ] +[[package]] +name = "darling" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.9.3", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "data-encoding" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4d0e2d24e5ee3b23a01de38eefdcd978907890701f08ffffd4cb457ca4ee8d6" +[[package]] +name = "deadpool" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f1ea999b4bd6f33b668ea6c847823fc95105be8799c0cf3611ac4025871e004" +dependencies = [ + "async-trait", + "config", + "crossbeam-queue 0.3.0", + "num_cpus", + "serde", + "tokio 0.3.4", +] + +[[package]] +name = "derive_builder" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2658621297f2cf68762a6f7dc0bb7e1ff2cfd6583daef8ee0fed6f7ec468ec0" +dependencies = [ + "darling", + "derive_builder_core", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_core" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2791ea3e372c8495c0bc2033991d76b512cd799d07491fbd6890124db9458bef" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "difference" version = "2.0.0" @@ -833,7 +946,7 @@ version = "0.8.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8ac63f94732332f44fe654443c46f6375d1939684c17b0afb6cb56b0456e171" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", ] [[package]] @@ -907,7 +1020,7 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "766d0e77a2c1502169d4a93ff3b8c15a71fd946cd0126309752104e5f3c46d94" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "crc32fast", "libc", "miniz_oxide", @@ -1028,7 +1141,7 @@ dependencies = [ "futures-io", "memchr", "parking 2.0.0", - "pin-project-lite", + "pin-project-lite 0.1.7", "waker-fn", ] @@ -1132,7 +1245,7 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "wasi", ] @@ -1185,7 +1298,7 @@ dependencies = [ "http", "indexmap", "slab", - "tokio", + "tokio 0.2.22", "tokio-util", "tracing", ] @@ -1315,7 +1428,7 @@ dependencies = [ "cookie", "http", "infer", - "pin-project-lite", + "pin-project-lite 0.1.7", "rand", "serde", "serde_json", @@ -1379,7 +1492,7 @@ dependencies = [ "pin-project", "socket2", "time 0.1.43", - "tokio", + "tokio 0.2.22", "tower-service", "tracing", "want", @@ -1394,7 +1507,7 @@ dependencies = [ "bytes 0.5.6", "hyper", "native-tls", - "tokio", + "tokio 0.2.22", "tokio-tls", ] @@ -1416,6 +1529,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.2.0" @@ -1570,7 +1689,7 @@ checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616" dependencies = [ "arrayvec", "bitflags", - "cfg-if", + "cfg-if 0.1.10", "ryu", "static_assertions", ] @@ -1587,7 +1706,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebf7c202c4da35d579058d845fad7b62d3aff5fcb10c5a57fc175ac632874de0" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.7.2", "pin-utils", ] @@ -1621,7 +1740,7 @@ version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", ] [[package]] @@ -1640,6 +1759,8 @@ dependencies = [ "anyhow", "assert_cmd", "check-if-email-exists", + "deadpool", + "derive_builder", "futures", "glob", "headers", @@ -1656,7 +1777,7 @@ dependencies = [ "reqwest", "serde", "structopt", - "tokio", + "tokio 0.2.22", "toml", "url", "wiremock", @@ -1744,7 +1865,7 @@ version = "0.6.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "fuchsia-zircon", "fuchsia-zircon-sys", "iovec", @@ -1837,7 +1958,7 @@ version = "0.2.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ba7c918ac76704fb42afcbbb43891e72731f3dcca3bef2a19786297baf14af7" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "winapi 0.3.9", ] @@ -1953,7 +2074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d575eff3665419f9b83678ff2815858ad9d11567e082f5ac1814baba4e2bcb4" dependencies = [ "bitflags", - "cfg-if", + "cfg-if 0.1.10", "foreign-types", "lazy_static", "libc", @@ -2008,7 +2129,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c361aa727dd08437f2f1447be8b59a33b0edd15e0fcee698f935613d9efbca9b" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "cloudabi", "instant", "libc", @@ -2060,6 +2181,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282adbf10f2698a7a77f8e983a74b2d18176c19a7fd32a45446139ae7b02b715" +[[package]] +name = "pin-project-lite" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b063f57ec186e6140e2b8b6921e5f1bd89c7356dda5b33acc5401203ca6131c" + [[package]] name = "pin-utils" version = "0.1.0" @@ -2078,7 +2205,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e09dffb745feffca5be3dea51c02b7b368c4597ab0219a82acaf9799ab3e0d1" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "wepoll-sys-stjepang", "winapi 0.3.9", @@ -2090,7 +2217,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a5884790f1ce3553ad55fec37b5aaac5882e0e845a2612df744d6c85c9bf046c" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "universal-hash", ] @@ -2328,11 +2455,11 @@ dependencies = [ "mime_guess", "native-tls", "percent-encoding", - "pin-project-lite", + "pin-project-lite 0.1.7", "serde", "serde_json", "serde_urlencoded 0.6.1", - "tokio", + "tokio 0.2.22", "tokio-socks", "tokio-tls", "url", @@ -2546,7 +2673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1" dependencies = [ "block-buffer 0.9.0", - "cfg-if", + "cfg-if 0.1.10", "cpuid-bool", "digest 0.9.0", "opaque-debug 0.3.0", @@ -2600,7 +2727,7 @@ version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03088793f677dce356f3ccc2edb1b314ad191ab702a5de3faf49304f7e104918" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "redox_syscall", "winapi 0.3.9", @@ -2688,6 +2815,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c" + [[package]] name = "structopt" version = "0.3.20" @@ -2735,7 +2868,7 @@ version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "rand", "redox_syscall", @@ -2880,13 +3013,23 @@ dependencies = [ "mio-named-pipes", "mio-uds", "num_cpus", - "pin-project-lite", + "pin-project-lite 0.1.7", "signal-hook-registry", "slab", "tokio-macros", "winapi 0.3.9", ] +[[package]] +name = "tokio" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfe2523e6fa84ddf5e688151d4e5fddc51678de9752c6512a24714c23818d61" +dependencies = [ + "autocfg", + "pin-project-lite 0.2.0", +] + [[package]] name = "tokio-macros" version = "0.2.5" @@ -2908,7 +3051,7 @@ dependencies = [ "either", "futures", "thiserror", - "tokio", + "tokio 0.2.22", ] [[package]] @@ -2918,7 +3061,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a70f4fcd7b3b24fb194f837560168208f669ca8cb70d0c4b862944452396343" dependencies = [ "native-tls", - "tokio", + "tokio 0.2.22", ] [[package]] @@ -2931,8 +3074,8 @@ dependencies = [ "futures-core", "futures-sink", "log", - "pin-project-lite", - "tokio", + "pin-project-lite 0.1.7", + "tokio 0.2.22", ] [[package]] @@ -2956,7 +3099,7 @@ version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d79ca061b032d6ce30c660fded31189ca0b9922bf483cd70759f13a2d86786c" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "log", "tracing-attributes", "tracing-core", @@ -3046,7 +3189,7 @@ dependencies = [ "rand", "smallvec", "thiserror", - "tokio", + "tokio 0.2.22", "url", ] @@ -3057,7 +3200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f23cdfdc3d8300b3c50c9e84302d3bd6d860fb9529af84ace6cf9665f181b77" dependencies = [ "backtrace", - "cfg-if", + "cfg-if 0.1.10", "futures", "ipconfig", "lazy_static", @@ -3224,7 +3367,7 @@ version = "0.2.67" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0563a9a4b071746dd5aedbc3a28c6fe9be4586fb3fbadb67c400d4f53c6b16c" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "serde", "serde_json", "wasm-bindgen-macro", @@ -3251,7 +3394,7 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95f8d235a77f880bcef268d379810ea6c0af2eacfa90b1ad5af731776e0c4699" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "js-sys", "wasm-bindgen", "web-sys", diff --git a/Cargo.toml b/Cargo.toml index cd6d220..e5625cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ serde = { version = "1.0", features = ["derive"] } pulldown-cmark = "0.8.0" quick-xml = "0.20.0" headers = "0.3.2" +derive_builder = "0.9.0" +deadpool = "0.6.0" [dependencies.reqwest] features = ["gzip"] diff --git a/src/checker.rs b/src/client.rs similarity index 57% rename from src/checker.rs rename to src/client.rs index b4bd212..156209d 100644 --- a/src/checker.rs +++ b/src/client.rs @@ -1,12 +1,9 @@ -use crate::{ - extract::{self, Uri}, - types::{Excludes, RequestMethod, Status}, -}; -use anyhow::{Context, Result}; +use crate::types::{Excludes, Response, Status, Uri}; +use anyhow::{anyhow, Context, Result}; use check_if_email_exists::{check_email, CheckEmailInput}; +use derive_builder::Builder; use headers::{HeaderMap, HeaderValue}; use hubcaps::{Credentials, Github}; -use indicatif::ProgressBar; use regex::{Regex, RegexSet}; use reqwest::header; use std::net::IpAddr; @@ -14,46 +11,60 @@ use std::{collections::HashSet, time::Duration}; use tokio::time::delay_for; use url::Url; -/// A link checker using an API token for Github links -/// otherwise a normal HTTP client. -pub(crate) struct Checker<'a> { +const DEFAULT_MAX_REDIRECTS: usize = 5; + +#[derive(Debug, Clone)] +pub struct Client { reqwest_client: reqwest::Client, github: Option, includes: Option, excludes: Excludes, scheme: Option, - method: RequestMethod, + method: reqwest::Method, accepted: Option>, - verbose: bool, - progress_bar: Option<&'a ProgressBar>, } -impl<'a> Checker<'a> { - /// Creates a new link checker - // we should consider adding a config struct for this, so that the list - // of arguments is short - #[allow(clippy::too_many_arguments)] - pub fn try_new( - github_token: Option, - includes: Option, - excludes: Excludes, - max_redirects: usize, - user_agent: String, - allow_insecure: bool, - scheme: Option, - custom_headers: HeaderMap, - method: RequestMethod, - accepted: Option>, - timeout: Option, - verbose: bool, - progress_bar: Option<&'a ProgressBar>, - ) -> Result { +/// A link checker using an API token for Github links +/// otherwise a normal HTTP client. +#[derive(Builder, Debug)] +#[builder(build_fn(skip))] +#[builder(setter(into))] +#[builder(name = "ClientBuilder")] +pub struct ClientBuilderInternal { + github_token: Option, + includes: Option, + excludes: Excludes, + max_redirects: usize, + user_agent: String, + allow_insecure: bool, + scheme: Option, + custom_headers: HeaderMap, + method: reqwest::Method, + accepted: Option>, + timeout: Option, + verbose: bool, +} + +impl ClientBuilder { + pub fn build(&mut self) -> Result { let mut headers = HeaderMap::new(); + // Faking the user agent is necessary for some websites, unfortunately. // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). + // let user_agent = self.user_agent.as_ref().unwrap_or(&"lychee/0.3.0".to_string()); + let user_agent = match self.user_agent { + Some(ref u) => u.clone(), + None => String::from("lychee/0.3.0"), + }; + headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?); headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?); - headers.extend(custom_headers); + if let Some(custom) = &self.custom_headers { + headers.extend(custom.clone()); + } + + let allow_insecure = self.allow_insecure.unwrap_or(false); + let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS); let builder = reqwest::ClientBuilder::new() .gzip(true) @@ -61,36 +72,43 @@ impl<'a> Checker<'a> { .danger_accept_invalid_certs(allow_insecure) .redirect(reqwest::redirect::Policy::limited(max_redirects)); - let builder = match timeout { - Some(timeout) => builder.timeout(timeout), + let builder = match self.timeout { + Some(t) => builder + .timeout(t.ok_or_else(|| anyhow!("cannot parse timeout: {:?}", self.timeout))?), None => builder, }; let reqwest_client = builder.build()?; - let github = match github_token { + let token: Option = self.github_token.clone().unwrap_or_default(); + let github = match token { Some(token) => { - let github = Github::new(user_agent, Credentials::Token(token))?; - Some(github) + if token.is_empty() { + None + } else { + let github = Github::new(user_agent, Credentials::Token(token))?; + Some(github) + } } None => None, }; + let scheme = self.scheme.clone().unwrap_or(None); let scheme = scheme.map(|s| s.to_lowercase()); - Ok(Checker { + Ok(Client { reqwest_client, github, - includes, - excludes, + includes: self.includes.clone().unwrap_or(None), + excludes: self.excludes.clone().unwrap_or_default(), scheme, - method, - accepted, - verbose, - progress_bar, + method: self.method.clone().unwrap_or(reqwest::Method::GET), + accepted: self.accepted.clone().unwrap_or(None), }) } +} +impl Client { async fn check_github(&self, owner: String, repo: String) -> Status { match &self.github { Some(github) => { @@ -110,10 +128,9 @@ impl<'a> Checker<'a> { } async fn check_normal(&self, url: &Url) -> Status { - let request = match self.method { - RequestMethod::GET => self.reqwest_client.get(url.as_str()), - RequestMethod::HEAD => self.reqwest_client.head(url.as_str()), - }; + let request = self + .reqwest_client + .request(self.method.clone(), url.as_str()); let res = request.send().await; match res { Ok(response) => Status::new(response.status(), self.accepted.clone()), @@ -233,48 +250,14 @@ impl<'a> Checker<'a> { uri.scheme() != self.scheme } - fn status_message(&self, status: &Status, uri: &Uri) -> Option { - match status { - Status::Ok(code) => { - if self.verbose { - Some(format!("✅{} [{}]", uri, code)) - } else { - None - } - } - Status::Failed(code) => Some(format!("🚫{} [{}]", uri, code)), - Status::Redirected => { - if self.verbose { - Some(format!("🔀️{}", uri)) - } else { - None - } - } - Status::Excluded => { - if self.verbose { - Some(format!("👻{}", uri)) - } else { - None - } - } - Status::Error(e) => Some(format!("⚡ {} ({})", uri, e)), - Status::Timeout => Some(format!("⌛{}", uri)), - } - } - - pub async fn check(&self, uri: &extract::Uri) -> Status { + pub async fn check(&self, uri: Uri) -> Response { if self.excluded(&uri) { - return Status::Excluded; + return Response::new(uri, Status::Excluded); } - - if let Some(pb) = self.progress_bar { - pb.set_message(&uri.to_string()); - } - - let ret = match uri { - Uri::Website(url) => self.check_real(url).await, - Uri::Mail(address) => { - let valid = self.valid_mail(address).await; + let status = match uri { + Uri::Website(ref url) => self.check_real(&url).await, + Uri::Mail(ref address) => { + let valid = self.valid_mail(&address).await; if valid { // TODO: We should not be using a HTTP status code for mail Status::Ok(http::StatusCode::OK) @@ -283,18 +266,7 @@ impl<'a> Checker<'a> { } } }; - - if let Some(pb) = self.progress_bar { - pb.inc(1); - // regular println! interferes with progress bar - if let Some(message) = self.status_message(&ret, uri) { - pb.println(message); - } - } else if let Some(message) = self.status_message(&ret, uri) { - println!("{}", message); - } - - ret + Response::new(uri, status) } } @@ -326,49 +298,28 @@ mod test { const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; - fn get_checker(allow_insecure: bool, custom_headers: HeaderMap) -> Checker<'static> { - let checker = Checker::try_new( - None, - None, - Excludes::default(), - 5, - "curl/7.71.1".to_string(), - allow_insecure, - None, - custom_headers, - RequestMethod::GET, - None, - None, - false, - None, - ) - .unwrap(); - checker - } - fn website_url(s: &str) -> Uri { Uri::Website(Url::parse(s).expect("Expected valid Website Uri")) } #[tokio::test] async fn test_nonexistent() { - let res = get_checker(false, HeaderMap::new()) - .check(&website_url("https://endler.dev/abcd")) + let res = ClientBuilder::default() + .build() + .unwrap() + .check(website_url("https://endler.dev/abcd")) .await; - assert!(matches!(res, Status::Failed(_))); + assert!(matches!(res.status, Status::Failed(_))); } #[tokio::test] async fn test_exponential_backoff() { let start = Instant::now(); - let res = get_checker(false, HeaderMap::new()) - .check(&Uri::Website( - Url::parse("https://endler.dev/abcd").unwrap(), - )) - .await; + let uri = Uri::Website(Url::parse("https://endler.dev/abcd").unwrap()); + let res = ClientBuilder::default().build().unwrap().check(uri).await; let end = start.elapsed(); - assert!(matches!(res, Status::Failed(_))); + assert!(matches!(res.status, Status::Failed(_))); // on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs) assert!(end.as_secs() >= 7); @@ -378,7 +329,9 @@ mod test { #[test] fn test_is_github() { assert_eq!( - get_checker(false, HeaderMap::new()) + ClientBuilder::default() + .build() + .unwrap() .extract_github("https://github.com/mre/idiomatic-rust") .unwrap(), ("mre".into(), "idiomatic-rust".into()) @@ -387,61 +340,80 @@ mod test { #[tokio::test] async fn test_github() { assert!(matches!( - get_checker(false, HeaderMap::new()) - .check(&website_url("https://github.com/mre/idiomatic-rust")) - .await, + ClientBuilder::default() + .build() + .unwrap() + .check(website_url("https://github.com/mre/idiomatic-rust")) + .await + .status, Status::Ok(_) )); } #[tokio::test] async fn test_github_nonexistent() { - let res = get_checker(false, HeaderMap::new()) - .check(&website_url( + let res = ClientBuilder::default() + .build() + .unwrap() + .check(website_url( "https://github.com/mre/idiomatic-rust-doesnt-exist-man", )) - .await; + .await + .status; assert!(matches!(res, Status::Error(_))); } #[tokio::test] async fn test_non_github() { - let res = get_checker(false, HeaderMap::new()) - .check(&website_url("https://endler.dev")) - .await; + let res = ClientBuilder::default() + .build() + .unwrap() + .check(website_url("https://endler.dev")) + .await + .status; assert!(matches!(res, Status::Ok(_))); } #[tokio::test] async fn test_invalid_ssl() { - let res = get_checker(false, HeaderMap::new()) - .check(&website_url("https://expired.badssl.com/")) + let res = ClientBuilder::default() + .build() + .unwrap() + .check(website_url("https://expired.badssl.com/")) .await; - assert!(matches!(res, Status::Error(_))); + assert!(matches!(res.status, Status::Error(_))); // Same, but ignore certificate error - let res = get_checker(true, HeaderMap::new()) - .check(&website_url("https://expired.badssl.com/")) + let res = ClientBuilder::default() + .allow_insecure(true) + .build() + .unwrap() + .check(website_url("https://expired.badssl.com/")) .await; - assert!(matches!(res, Status::Ok(_))); + assert!(matches!(res.status, Status::Ok(_))); } #[tokio::test] async fn test_custom_headers() { - let res = get_checker(false, HeaderMap::new()) - .check(&website_url("https://crates.io/keywords/cassandra")) + let res = ClientBuilder::default() + .build() + .unwrap() + .check(website_url("https://crates.io/keywords/cassandra")) .await; - assert!(matches!(res, Status::Failed(StatusCode::NOT_FOUND))); + assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND))); // Try again, but with a custom header. // For example, crates.io requires a custom accept header. // See https://github.com/rust-lang/crates.io/issues/788 let mut custom = HeaderMap::new(); custom.insert(header::ACCEPT, "text/html".parse().unwrap()); - let res = get_checker(true, custom) - .check(&website_url("https://crates.io/keywords/cassandra")) + let res = ClientBuilder::default() + .custom_headers(custom) + .build() + .unwrap() + .check(website_url("https://crates.io/keywords/cassandra")) .await; - assert!(matches!(res, Status::Ok(_))); + assert!(matches!(res.status, Status::Ok(_))); } #[tokio::test] @@ -460,55 +432,29 @@ mod test { .mount(&mock_server) .await; - let checker = Checker::try_new( - None, - None, - Excludes::default(), - 5, - "curl/7.71.1".to_string(), - true, - None, - HeaderMap::new(), - RequestMethod::GET, - None, - Some(checker_timeout), - false, - None, - ) - .expect("Expected successful instantiation"); + let client = ClientBuilder::default() + .timeout(checker_timeout) + .build() + .unwrap(); - let resp = checker - .check(&Uri::Website(Url::parse(&mock_server.uri()).unwrap())) + let resp = client + .check(Uri::Website(Url::parse(&mock_server.uri()).unwrap())) .await; - assert!(matches!(resp, Status::Timeout)); + assert!(matches!(resp.status, Status::Timeout)); } #[tokio::test] async fn test_include_regex() { - let includes = Some(RegexSet::new(&[r"foo.github.com"]).unwrap()); + let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); + + let client = ClientBuilder::default().includes(includes).build().unwrap(); - let checker = Checker::try_new( - None, - includes, - Excludes::default(), - 5, - "curl/7.71.1".to_string(), - true, - None, - HeaderMap::new(), - RequestMethod::GET, - None, - None, - false, - None, - ) - .unwrap(); assert_eq!( - checker.excluded(&website_url("https://foo.github.com")), + client.excluded(&website_url("https://foo.github.com")), false ); assert_eq!( - checker.excluded(&website_url("https://bar.github.com")), + client.excluded(&website_url("https://bar.github.com")), true ); } @@ -517,31 +463,21 @@ mod test { async fn test_exclude_include_regex() { let mut excludes = Excludes::default(); excludes.regex = Some(RegexSet::new(&[r"github.com"]).unwrap()); - let includes = Some(RegexSet::new(&[r"foo.github.com"]).unwrap()); + let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); + + let client = ClientBuilder::default() + .includes(includes) + .excludes(excludes) + .build() + .unwrap(); - let checker = Checker::try_new( - None, - includes, - excludes, - 5, - "curl/7.71.1".to_string(), - true, - None, - HeaderMap::new(), - RequestMethod::GET, - None, - None, - false, - None, - ) - .unwrap(); assert_eq!( - checker.excluded(&website_url("https://foo.github.com")), + client.excluded(&website_url("https://foo.github.com")), false ); - assert_eq!(checker.excluded(&website_url("https://github.com")), true); + assert_eq!(client.excluded(&website_url("https://github.com")), true); assert_eq!( - checker.excluded(&website_url("https://bar.github.com")), + client.excluded(&website_url("https://bar.github.com")), true ); } @@ -552,30 +488,16 @@ mod test { excludes.regex = Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap()); - let checker = Checker::try_new( - None, - None, - excludes, - 5, - "curl/7.71.1".to_string(), - true, - None, - HeaderMap::new(), - RequestMethod::GET, - None, - None, - false, - None, - ) - .unwrap(); - assert_eq!(checker.excluded(&website_url("http://github.com")), true); - assert_eq!(checker.excluded(&website_url("http://exclude.org")), true); + let client = ClientBuilder::default().excludes(excludes).build().unwrap(); + + assert_eq!(client.excluded(&website_url("http://github.com")), true); + assert_eq!(client.excluded(&website_url("http://exclude.org")), true); assert_eq!( - checker.excluded(&Uri::Mail("mail@example.com".to_string())), + client.excluded(&Uri::Mail("mail@example.com".to_string())), true ); assert_eq!( - checker.excluded(&Uri::Mail("foo@bar.dev".to_string())), + client.excluded(&Uri::Mail("foo@bar.dev".to_string())), false ); } @@ -610,57 +532,57 @@ mod test { #[test] fn test_excludes_no_private_ips_by_default() { - let checker = get_checker(false, HeaderMap::new()); + let client = ClientBuilder::default().build().unwrap(); - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_A)), false); - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_B)), false); - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_C)), false); - assert_eq!(checker.excluded(&website_url(V4_LINK_LOCAL)), false); - assert_eq!(checker.excluded(&website_url(V4_LOOPBACK)), false); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false); + assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false); + assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false); - assert_eq!(checker.excluded(&website_url(V6_LOOPBACK)), false); + assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false); } #[test] fn test_exclude_private() { - let mut checker = get_checker(false, HeaderMap::new()); - checker.excludes.private_ips = true; + let mut client = ClientBuilder::default().build().unwrap(); + client.excludes.private_ips = true; - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_A)), true); - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_B)), true); - assert_eq!(checker.excluded(&website_url(V4_PRIVATE_CLASS_C)), true); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true); + assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true); } #[test] fn test_exclude_link_local() { - let mut checker = get_checker(false, HeaderMap::new()); - checker.excludes.link_local_ips = true; + let mut client = ClientBuilder::default().build().unwrap(); + client.excludes.link_local_ips = true; - assert_eq!(checker.excluded(&website_url(V4_LINK_LOCAL)), true); + assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true); } #[test] fn test_exclude_loopback() { - let mut checker = get_checker(false, HeaderMap::new()); - checker.excludes.loopback_ips = true; + let mut client = ClientBuilder::default().build().unwrap(); + client.excludes.loopback_ips = true; - assert_eq!(checker.excluded(&website_url(V4_LOOPBACK)), true); - assert_eq!(checker.excluded(&website_url(V6_LOOPBACK)), true); + assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true); + assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true); } #[test] fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { - let mut checker = get_checker(false, HeaderMap::new()); - checker.excludes.private_ips = true; - checker.excludes.link_local_ips = true; + let mut client = ClientBuilder::default().build().unwrap(); + client.excludes.private_ips = true; + client.excludes.link_local_ips = true; // if these were pure IPv4, we would exclude assert_eq!( - checker.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)), + client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)), false ); assert_eq!( - checker.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)), + client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)), false ); } diff --git a/src/client_pool.rs b/src/client_pool.rs new file mode 100644 index 0000000..4cc691d --- /dev/null +++ b/src/client_pool.rs @@ -0,0 +1,33 @@ +use client::Client; +use deadpool::unmanaged::Pool; +use tokio::sync::mpsc; + +use crate::{client, types}; + +pub struct ClientPool { + tx: mpsc::Sender, + rx: mpsc::Receiver, + pool: deadpool::unmanaged::Pool, +} + +impl ClientPool { + pub fn new( + tx: mpsc::Sender, + rx: mpsc::Receiver, + clients: Vec, + ) -> Self { + let pool = Pool::from(clients); + ClientPool { tx, rx, pool } + } + + pub async fn listen(&mut self) { + while let Some(req) = self.rx.recv().await { + let client = self.pool.get().await; + let mut tx = self.tx.clone(); + tokio::spawn(async move { + let resp = client.check(req).await; + tx.send(resp).await.unwrap(); + }); + } + } +} diff --git a/src/collector.rs b/src/collector.rs index f398481..0542c6f 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -1,21 +1,17 @@ -use crate::extract::{self, extract_links, FileType}; +use crate::extract::{extract_links, FileType}; +use crate::types::Uri; use anyhow::Result; -use extract::Uri; use glob::glob; use reqwest::Url; -use std::path::Path; use std::{collections::HashSet, fs}; +use std::{ffi::OsStr, path::Path}; /// Detect if the given path points to a Markdown, HTML, or plaintext file. -fn resolve_file_type_by_path>(p: P) -> FileType { - let path = p.as_ref(); - match path.extension() { - Some(ext) => match ext.to_str().unwrap() { - "md" => FileType::Markdown, - "html" | "htm" => FileType::HTML, - _ => FileType::Plaintext, - }, - None => FileType::Plaintext, +fn resolve_file_type_by_path>(path: P) -> FileType { + match path.as_ref().extension().and_then(OsStr::to_str) { + Some("md") => FileType::Markdown, + Some("html") => FileType::HTML, + _ => FileType::Plaintext, } } @@ -57,7 +53,7 @@ pub(crate) async fn collect_links( base_url.clone(), )); } - Err(e) => println!("{:?}", e), + Err(e) => println!("Error handling file pattern {}: {:?}", input, e), } } } diff --git a/src/extract.rs b/src/extract.rs index f8c0dfc..feaa8dd 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,17 +1,11 @@ +use crate::types::Uri; use linkify::LinkFinder; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; use quick_xml::{events::Event as HTMLEvent, Reader}; -use std::net::IpAddr; +use std::collections::HashSet; use std::path::Path; -use std::{collections::HashSet, fmt::Display}; use url::Url; -#[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub(crate) enum Uri { - Website(Url), - Mail(String), -} - #[derive(Clone, Debug)] pub(crate) enum FileType { HTML, @@ -19,39 +13,6 @@ pub(crate) enum FileType { Plaintext, } -impl Uri { - pub fn as_str(&self) -> &str { - match self { - Uri::Website(url) => url.as_str(), - Uri::Mail(address) => address.as_str(), - } - } - - pub fn scheme(&self) -> Option { - match self { - Uri::Website(url) => Some(url.scheme().to_string()), - Uri::Mail(_address) => None, - } - } - - pub fn host_ip(&self) -> Option { - match self { - Self::Website(url) => match url.host()? { - url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), - url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), - _ => None, - }, - Self::Mail(_) => None, - } - } -} - -impl Display for Uri { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - // Use LinkFinder here to offload the actual link searching fn find_links(input: &str) -> Vec { let finder = LinkFinder::new(); @@ -168,7 +129,7 @@ pub(crate) fn extract_links( uris.insert(Uri::Mail(link)); } else if !Path::new(&link).exists() { if let Some(base_url) = &base_url { - if let Ok(new_url) = base_url.clone().join(&link) { + if let Ok(new_url) = base_url.join(&link) { uris.insert(Uri::Website(new_url)); } } @@ -176,8 +137,6 @@ pub(crate) fn extract_links( } }; } - - debug!("Found: {:#?}", uris); uris } @@ -185,7 +144,6 @@ pub(crate) fn extract_links( mod test { use super::*; use std::iter::FromIterator; - use std::net::{Ipv4Addr, Ipv6Addr}; #[test] fn test_extract_markdown_links() { @@ -277,30 +235,4 @@ mod test { assert!(links.len() == 1); assert_eq!(links[0].as_str(), expected); } - - #[test] - fn test_uri_host_ip_v4() { - let uri = - Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4")); - let ip = uri.host_ip().expect("Expected a valid IPv4"); - assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))); - } - - #[test] - fn test_uri_host_ip_v6() { - let uri = - Uri::Website(Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6")); - let ip = uri.host_ip().expect("Expected a valid IPv6"); - assert_eq!( - ip, - IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10)) - ); - } - - #[test] - fn test_uri_host_ip_no_ip() { - let uri = Uri::Website(Url::parse("https://some.cryptic/url").expect("Expected valid URI")); - let ip = uri.host_ip(); - assert!(ip.is_none()); - } } diff --git a/src/main.rs b/src/main.rs index cc969aa..66ac79b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,23 +2,28 @@ extern crate log; use anyhow::{anyhow, Result}; -use futures::future::join_all; use headers::authorization::Basic; use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName}; use indicatif::{ProgressBar, ProgressStyle}; use regex::RegexSet; -use std::{collections::HashSet, convert::TryInto, time::Duration}; +use std::str::FromStr; +use std::{collections::HashSet, time::Duration}; use structopt::StructOpt; +use tokio::sync::mpsc; -mod checker; +mod client; +mod client_pool; mod collector; mod extract; mod options; +mod stats; mod types; -use checker::Checker; -use extract::Uri; +use client::ClientBuilder; +use client_pool::ClientPool; use options::{Config, LycheeOptions}; +use stats::ResponseStats; +use types::Response; use types::{Excludes, Status}; /// A C-like enum that can be cast to `i32` and used as process exit code. @@ -32,27 +37,6 @@ enum ExitCode { LinkCheckFailure = 2, } -fn print_summary(found: &HashSet, results: &[Status]) { - let found = found.len(); - let excluded: usize = results - .iter() - .filter(|l| matches!(l, Status::Excluded)) - .count(); - let success: usize = results - .iter() - .filter(|l| matches!(l, Status::Ok(_))) - .count(); - let errors: usize = found - excluded - success; - - println!(); - println!("📝Summary"); - println!("-------------------"); - println!("🔍Found: {}", found); - println!("👻Excluded: {}", excluded); - println!("✅Successful: {}", success); - println!("🚫Errors: {}", errors); -} - fn main() -> Result<()> { pretty_env_logger::init(); let opts = LycheeOptions::from_args(); @@ -79,23 +63,50 @@ fn main() -> Result<()> { std::process::exit(errorcode); } -async fn run(cfg: Config, inputs: Vec) -> Result { - let includes = RegexSet::new(&cfg.include).ok(); - let excludes = Excludes::from_options(&cfg); - let mut headers = parse_headers(cfg.headers)?; +fn show_progress(progress_bar: &Option, response: &Response, verbose: bool) { + let message = status_message(&response, verbose); + if let Some(pb) = progress_bar { + pb.inc(1); + // regular println! interferes with progress bar + if let Some(message) = message { + pb.println(message); + } + } else if let Some(message) = message { + println!("{}", message); + }; +} - if let Some(auth) = cfg.basic_auth { +async fn run(cfg: Config, inputs: Vec) -> Result { + let mut headers = parse_headers(&cfg.headers)?; + if let Some(auth) = &cfg.basic_auth { let auth_header = parse_basic_auth(&auth)?; headers.typed_insert(auth_header); } - let accepted = match cfg.accept { - Some(accept) => parse_statuscodes(accept)?, - None => None, - }; - let timeout = parse_timeout(cfg.timeout)?; - let links = collector::collect_links(inputs, cfg.base_url).await?; - let progress_bar = if cfg.progress { + let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok()); + let timeout = parse_timeout(&cfg.timeout)?; + let max_concurrency = cfg.max_concurrency.parse()?; + let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?; + let includes = RegexSet::new(&cfg.include)?; + let excludes = Excludes::from_options(&cfg); + + let client = ClientBuilder::default() + .includes(includes) + .excludes(excludes) + .max_redirects(cfg.max_redirects) + .user_agent(cfg.user_agent) + .allow_insecure(cfg.insecure) + .custom_headers(headers) + .method(method) + .timeout(timeout) + .verbose(cfg.verbose) + .github_token(cfg.github_token) + .scheme(cfg.scheme) + .accepted(accepted) + .build()?; + + let links = collector::collect_links(inputs, cfg.base_url.clone()).await?; + let pb = if cfg.progress { Some( ProgressBar::new(links.len() as u64) .with_style( @@ -107,43 +118,51 @@ async fn run(cfg: Config, inputs: Vec) -> Result { } else { None }; - let checker = Checker::try_new( - cfg.github_token, - includes, - excludes, - cfg.max_redirects, - cfg.user_agent, - cfg.insecure, - cfg.scheme, - headers, - cfg.method.try_into()?, - accepted, - Some(timeout), - cfg.verbose, - progress_bar.as_ref(), - )?; - let futures: Vec<_> = links.iter().map(|l| checker.check(l)).collect(); - let results = join_all(futures).await; + let (mut send_req, recv_req) = mpsc::channel(max_concurrency); + let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency); - // note that prints may interfere progress bar so this must go before summary - if let Some(progress_bar) = progress_bar { - progress_bar.finish_and_clear(); + let mut stats = ResponseStats::new(); + + let bar = pb.clone(); + tokio::spawn(async move { + for link in links { + if let Some(pb) = &bar { + pb.set_message(&link.to_string()); + }; + send_req.send(link).await.unwrap(); + } + }); + + tokio::spawn(async move { + // Start receiving requests + let clients: Vec<_> = (0..max_concurrency).map(|_| client.clone()).collect(); + let mut clients = ClientPool::new(send_resp, recv_req, clients); + clients.listen().await; + }); + + while let Some(response) = recv_resp.recv().await { + show_progress(&pb, &response, cfg.verbose); + stats.add(response); + } + + // Note that print statements may interfere with the progress bar, so this + // must go before printing the stats + if let Some(pb) = &pb { + pb.finish_and_clear(); } if cfg.verbose { - print_summary(&links, &results); + println!("\n{}", stats); } - let success = results.iter().all(|r| r.is_success() || r.is_excluded()); - - match success { + match stats.is_success() { true => Ok(ExitCode::Success as i32), false => Ok(ExitCode::LinkCheckFailure as i32), } } -fn read_header(input: String) -> Result<(String, String)> { +fn read_header(input: &str) -> Result<(String, String)> { let elements: Vec<_> = input.split('=').collect(); if elements.len() != 2 { return Err(anyhow!( @@ -154,14 +173,14 @@ fn read_header(input: String) -> Result<(String, String)> { Ok((elements[0].into(), elements[1].into())) } -fn parse_timeout(timeout: String) -> Result { - Ok(Duration::from_secs(timeout.parse::()?)) +fn parse_timeout>(timeout: S) -> Result { + Ok(Duration::from_secs(timeout.as_ref().parse::()?)) } -fn parse_headers(headers: Vec) -> Result { +fn parse_headers>(headers: &[T]) -> Result { let mut out = HeaderMap::new(); for header in headers { - let (key, val) = read_header(header)?; + let (key, val) = read_header(header.as_ref())?; out.insert( HeaderName::from_bytes(key.as_bytes())?, val.parse().unwrap(), @@ -170,13 +189,13 @@ fn parse_headers(headers: Vec) -> Result { Ok(out) } -fn parse_statuscodes(accept: String) -> Result>> { +fn parse_statuscodes>(accept: T) -> Result> { let mut statuscodes = HashSet::new(); - for code in accept.split(',').into_iter() { + for code in accept.as_ref().split(',').into_iter() { let code: reqwest::StatusCode = reqwest::StatusCode::from_bytes(code.as_bytes())?; statuscodes.insert(code); } - Ok(Some(statuscodes)) + Ok(statuscodes) } fn parse_basic_auth(auth: &str) -> Result> { @@ -190,6 +209,18 @@ fn parse_basic_auth(auth: &str) -> Result> { Ok(Authorization::basic(params[0], params[1])) } +fn status_message(response: &Response, verbose: bool) -> Option { + match &response.status { + Status::Ok(code) if verbose => Some(format!("✅ {} [{}]", response.uri, code)), + Status::Redirected if verbose => Some(format!("🔀️ {}", response.uri)), + Status::Excluded if verbose => Some(format!("👻 {}", response.uri)), + Status::Failed(code) => Some(format!("🚫 {} [{}]", response.uri, code)), + Status::Error(e) => Some(format!("⚡ {} ({})", response.uri, e)), + Status::Timeout => Some(format!("⌛ {}", response.uri)), + _ => None, + } +} + #[cfg(test)] mod test { use super::*; @@ -200,25 +231,20 @@ mod test { fn test_parse_custom_headers() { let mut custom = HeaderMap::new(); custom.insert(header::ACCEPT, "text/html".parse().unwrap()); - assert_eq!( - parse_headers(vec!["accept=text/html".into()]).unwrap(), - custom - ); + assert_eq!(parse_headers(&["accept=text/html"]).unwrap(), custom); } #[test] fn test_parse_statuscodes() { - let actual = parse_statuscodes("200,204,301".into()).unwrap(); - let expected: Option> = Some( - [ - StatusCode::OK, - StatusCode::NO_CONTENT, - StatusCode::MOVED_PERMANENTLY, - ] - .iter() - .cloned() - .collect(), - ); + let actual = parse_statuscodes("200,204,301").unwrap(); + let expected: HashSet = [ + StatusCode::OK, + StatusCode::NO_CONTENT, + StatusCode::MOVED_PERMANENTLY, + ] + .iter() + .cloned() + .collect(); assert_eq!(actual, expected); } diff --git a/src/options.rs b/src/options.rs index d6fdf3f..9af337c 100644 --- a/src/options.rs +++ b/src/options.rs @@ -6,6 +6,7 @@ use structopt::StructOpt; const USER_AGENT: &str = "curl/7.71.1"; const METHOD: &str = "get"; const TIMEOUT: &str = "20"; +const MAX_CONCURRENCY: &str = "128"; // Macro for generating default functions to be used by serde macro_rules! default_function { @@ -30,10 +31,7 @@ macro_rules! fold_in { } #[derive(Debug, StructOpt)] -#[structopt( - name = "lychee", - about = "A boring link checker for my projects (and maybe yours)" -)] +#[structopt(name = "lychee", about = "A glorious link checker")] pub(crate) struct LycheeOptions { /// Input files pub inputs: Vec, @@ -47,7 +45,7 @@ pub(crate) struct LycheeOptions { } #[derive(Debug, Deserialize, StructOpt)] -pub(crate) struct Config { +pub struct Config { /// Verbose program output #[structopt(short, long)] #[serde(default)] @@ -63,6 +61,11 @@ pub(crate) struct Config { #[serde(default)] pub max_redirects: usize, + /// Maximum number of concurrent network requests + #[structopt(long, default_value = MAX_CONCURRENCY)] + #[serde(default)] + pub max_concurrency: String, + /// Number of threads to utilize. /// Defaults to number of cores available to the system #[structopt(short = "T", long)] @@ -131,7 +134,8 @@ pub(crate) struct Config { pub timeout: String, /// Request method - #[structopt(short = "M", long, default_value = METHOD)] + // Using `-X` as a short param similar to curl + #[structopt(short = "X", long, default_value = METHOD)] #[serde(default = "method")] pub method: String, @@ -139,7 +143,7 @@ pub(crate) struct Config { #[serde(default)] pub base_url: Option, - #[structopt(long, help = "Basic autentication support. Ex 'username:password'")] + #[structopt(long, help = "Basic authentication support. Ex 'username:password'")] #[serde(default)] pub basic_auth: Option, @@ -185,6 +189,7 @@ impl Config { verbose: false; progress: false; max_redirects: 10; + max_concurrency: MAX_CONCURRENCY; threads: None; user_agent: USER_AGENT; insecure: false; diff --git a/src/stats.rs b/src/stats.rs new file mode 100644 index 0000000..8deed56 --- /dev/null +++ b/src/stats.rs @@ -0,0 +1,64 @@ +use std::{ + collections::HashSet, + fmt::{self, Display}, +}; + +use crate::types::Response; +use crate::types::Status::*; +use crate::types::Uri; + +pub struct ResponseStats { + total: usize, + successful: usize, + failures: HashSet, + timeouts: HashSet, + redirects: HashSet, + excludes: HashSet, + errors: HashSet, +} + +impl ResponseStats { + pub fn new() -> Self { + ResponseStats { + total: 0, + successful: 0, + failures: HashSet::new(), + timeouts: HashSet::new(), + redirects: HashSet::new(), + excludes: HashSet::new(), + errors: HashSet::new(), + } + } + + pub fn add(&mut self, response: Response) { + self.total += 1; + let uri = response.uri; + if !match response.status { + Failed(_) => self.failures.insert(uri), + Timeout => self.timeouts.insert(uri), + Redirected => self.redirects.insert(uri), + Excluded => self.excludes.insert(uri), + Error(_) => self.errors.insert(uri), + _ => false, + } { + self.successful += 1; + } + } + + pub fn is_success(&self) -> bool { + self.total == self.successful + self.excludes.len() + } +} + +impl Display for ResponseStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "📝 Summary")?; + writeln!(f, "-------------------")?; + writeln!(f, "🔍 Total: {}", self.total)?; + writeln!(f, "✅ Successful: {}", self.successful)?; + writeln!(f, "⏳ Timeouts: {}", self.timeouts.len())?; + writeln!(f, "🔀 Redirected: {}", self.redirects.len())?; + writeln!(f, "👻 Excluded: {}", self.excludes.len())?; + writeln!(f, "🚫 Errors: {}", self.errors.len() + self.failures.len()) + } +} diff --git a/src/types.rs b/src/types.rs index 27730e6..868811a 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,8 +1,48 @@ use crate::options::Config; use anyhow::anyhow; -use std::{collections::HashSet, convert::TryFrom}; - use regex::RegexSet; +use std::net::IpAddr; +use std::{collections::HashSet, convert::TryFrom, fmt::Display}; +use url::Url; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum Uri { + Website(Url), + Mail(String), +} + +impl Uri { + pub fn as_str(&self) -> &str { + match self { + Uri::Website(url) => url.as_str(), + Uri::Mail(address) => address.as_str(), + } + } + + pub fn scheme(&self) -> Option { + match self { + Uri::Website(url) => Some(url.scheme().to_string()), + Uri::Mail(_address) => None, + } + } + + pub fn host_ip(&self) -> Option { + match self { + Self::Website(url) => match url.host()? { + url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), + url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), + _ => None, + }, + Self::Mail(_) => None, + } + } +} + +impl Display for Uri { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} /// Specifies how requests to websites will be made pub(crate) enum RequestMethod { @@ -21,14 +61,32 @@ impl TryFrom for RequestMethod { } } +#[derive(Debug)] +pub struct Response { + pub uri: Uri, + pub status: Status, +} + +impl Response { + pub fn new(uri: Uri, status: Status) -> Self { + Response { uri, status } + } +} + /// Response status of the request #[derive(Debug)] pub enum Status { + /// Request was successful Ok(http::StatusCode), + /// Request failed with HTTP error code Failed(http::StatusCode), + /// Request timed out Timeout, + /// Got redirected to different resource Redirected, + /// Resource was excluded from checking Excluded, + /// Low-level error while loading resource Error(String), } @@ -51,10 +109,6 @@ impl Status { pub fn is_success(&self) -> bool { matches!(self, Status::Ok(_)) } - - pub fn is_excluded(&self) -> bool { - matches!(self, Status::Excluded) - } } impl From for Status { @@ -69,7 +123,8 @@ impl From for Status { /// Exclude configuration for the link checker. /// You can ignore links based on -pub(crate) struct Excludes { +#[derive(Clone, Debug)] +pub struct Excludes { pub regex: Option, /// Example: 192.168.0.1 pub private_ips: bool, @@ -105,3 +160,35 @@ impl Default for Excludes { } } } + +#[cfg(test)] +mod test { + use super::*; + use std::net::{Ipv4Addr, Ipv6Addr}; + + #[test] + fn test_uri_host_ip_v4() { + let uri = + Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4")); + let ip = uri.host_ip().expect("Expected a valid IPv4"); + assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))); + } + + #[test] + fn test_uri_host_ip_v6() { + let uri = + Uri::Website(Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6")); + let ip = uri.host_ip().expect("Expected a valid IPv6"); + assert_eq!( + ip, + IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10)) + ); + } + + #[test] + fn test_uri_host_ip_no_ip() { + let uri = Uri::Website(Url::parse("https://some.cryptic/url").expect("Expected valid URI")); + let ip = uri.host_ip(); + assert!(ip.is_none()); + } +} diff --git a/tests/cli.rs b/tests/cli.rs index 8a05c17..1a5f100 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -22,7 +22,7 @@ mod cli { .arg(test_all_private_path) .assert() .success() - .stdout(contains("Found: 7")) + .stdout(contains("Total: 7")) .stdout(contains("Excluded: 7")) .stdout(contains("Successful: 0")) .stdout(contains("Errors: 0")); @@ -44,26 +44,12 @@ mod cli { .arg(test_github_path) .assert() .success() - .stdout(contains("Found: 1")) + .stdout(contains("Total: 1")) .stdout(contains("Excluded: 0")) .stdout(contains("Successful: 1")) .stdout(contains("Errors: 0")); } - #[test] - fn test_failure_invalid_method() { - let mut cmd = - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name"); - - cmd.arg("--method=invalid-method") - .assert() - .failure() - .code(1) - .stderr(contains( - "Error: Only `get` and `head` allowed, got invalid-method", - )); - } - #[test] fn test_failure_404_link() { let mut cmd = @@ -90,6 +76,7 @@ mod cli { .join("TEST_GITHUB_404.md"); cmd.arg(test_github_404_path) + .env_clear() .assert() .failure() .code(2)