diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1875bf5..508001d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -53,11 +53,16 @@ jobs: uses: actions-rs/cargo@v1 with: command: fetch - - name: cargo publish check + - name: cargo publish check lychee-lib uses: actions-rs/cargo@v1 with: command: publish - args: --dry-run + args: --dry-run --manifest-path lychee-lib/Cargo.toml + - name: cargo publish check lychee + uses: actions-rs/cargo@v1 + with: + command: publish + args: --dry-run --manifest-path lychee-bin/Cargo.toml publish: if: startsWith(github.ref, 'refs/tags/') @@ -72,9 +77,17 @@ jobs: uses: actions-rs/cargo@v1 with: command: fetch - - name: cargo publish + - name: cargo publish lychee-lib uses: actions-rs/cargo@v1 env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} with: command: publish + args: --manifest-path lychee-lib/Cargo.toml + - name: cargo publish lychee + uses: actions-rs/cargo@v1 + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + with: + command: publish + args: --manifest-path lychee-bin/Cargo.toml diff --git a/Cargo.lock b/Cargo.lock index 520d53d..a85d350 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "adler" version = "1.0.2" @@ -533,9 +535,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06d4a9551359071d1890820e3571252b91229e0712e7c36b08940e603c5a8fc" +checksum = "e9d6ddad5866bb2170686ed03f6839d31a76e5407d80b1c334a2c24618543ffa" dependencies = [ "darling_core", "darling_macro", @@ -543,9 +545,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b443e5fb0ddd56e0c9bfa47dc060c5306ee500cb731f2b91432dd65589a77684" +checksum = "a9ced1fd13dc386d5a8315899de465708cf34ee2a6d9394654515214e67bb846" dependencies = [ "fnv", "ident_case", @@ -557,9 +559,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0220073ce504f12a70efc4e7cdaea9e9b1b324872e7ad96a208056d7a638b81" +checksum = "0a7a1445d54b2f9792e3b31a3e715feabbace393f38dc4ffd49d94ee9bc487d5" dependencies = [ "darling_core", "quote", @@ -1119,9 +1121,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" +checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589" [[package]] name = "httpdate" @@ -1284,9 +1286,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.49" +version = "0.3.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc15e39392125075f60c95ba416f5381ff6c3a948ff02ab12464715adf56c821" +checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" dependencies = [ "wasm-bindgen", ] @@ -1341,9 +1343,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.91" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8916b1f6ca17130ec6568feccee27c156ad12037880833a3b842a823236502e7" +checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" [[package]] name = "linked-hash-map" @@ -1362,9 +1364,9 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd96ffd135b2fd7b973ac026d28085defbe8983df057ced3eb4f2130b0831312" +checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" dependencies = [ "scopeguard", ] @@ -1394,40 +1396,55 @@ version = "0.6.0" dependencies = [ "anyhow", "assert_cmd", - "check-if-email-exists", "console", + "headers", + "http", + "indicatif", + "lazy_static", + "lychee-lib", + "openssl-sys", + "pad", + "predicates", + "pretty_assertions", + "regex", + "reqwest", + "ring", + "serde", + "serde_json", + "structopt", + "tempfile", + "tokio", + "toml", + "uuid", + "wiremock", +] + +[[package]] +name = "lychee-lib" +version = "0.6.0" +dependencies = [ + "check-if-email-exists", "deadpool", "derive_builder", "doc-comment", "fast_chemail", - "futures", "glob", - "headers", "html5ever", "http", "hubcaps", - "indicatif", - "lazy_static", "linkify", - "markup5ever", "markup5ever_rcdom", "openssl-sys", - "pad", - "predicates", "pretty_assertions", "pulldown-cmark", "regex", "reqwest", "ring", "serde", - "serde_json", "shellexpand", - "structopt", "tempfile", "tokio", - "toml", "url", - "uuid", "wiremock", ] @@ -1938,9 +1955,9 @@ checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" [[package]] name = "proc-macro2" -version = "1.0.24" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" +checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec" dependencies = [ "unicode-xid", ] @@ -2065,9 +2082,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" dependencies = [ "bitflags", ] @@ -2304,9 +2321,9 @@ dependencies = [ [[package]] name = "signal-hook" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aa894ef3fade0ee7243422f4fbbd6c2b48e6de767e621d37ef65f2310f53cea" +checksum = "ef33d6d0cd06e0840fba9985aab098c147e67e05cee14d412d3345ed14ff30ac" dependencies = [ "libc", "signal-hook-registry", @@ -2452,9 +2469,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.65" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a1d708c221c5a612956ef9f75b37e454e88d1f7b899fbd3a18d4252012d663" +checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb" dependencies = [ "proc-macro2", "quote", @@ -2547,9 +2564,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317cca572a0e89c3ce0ca1f1bdc9369547fe318a683418e42ac8f59d14701023" +checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" dependencies = [ "tinyvec_macros", ] @@ -2615,9 +2632,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f" +checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" dependencies = [ "bytes", "futures-core", @@ -2735,9 +2752,9 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" dependencies = [ "matches", ] @@ -2875,9 +2892,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.72" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" +checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" dependencies = [ "cfg-if", "serde", @@ -2887,9 +2904,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.72" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046ceba58ff062da072c7cb4ba5b22a37f00a302483f7e2a6cdc18fedbdc1fd3" +checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" dependencies = [ "bumpalo", "lazy_static", @@ -2902,9 +2919,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.22" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468" +checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" dependencies = [ "cfg-if", "js-sys", @@ -2914,9 +2931,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.72" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9aa01d36cda046f797c57959ff5f3c615c9cc63997a8d545831ec7976819b" +checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2924,9 +2941,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.72" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96eb45c1b2ee33545a813a92dbb53856418bf7eb54ab34f7f7ff1448a5b3735d" +checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" dependencies = [ "proc-macro2", "quote", @@ -2937,15 +2954,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.72" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7148f4696fb4960a346eaa60bbfb42a1ac4ebba21f750f75fc1375b098d5ffa" +checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" [[package]] name = "web-sys" -version = "0.3.49" +version = "0.3.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fe19d70f5dacc03f6e46777213facae5ac3801575d56ca6cbd4c93dcd12310" +checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 919e1d6..863013f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,75 +1,9 @@ -[package] -authors = ["Matthias Endler "] -description = "A glorious link checker" -documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md" -edition = "2018" -homepage = "https://github.com/lycheeverse/lychee" -keywords = [ - "link", - "checker", - "cli", - "link-checker", - "validator", +[workspace] +members = [ + "lychee-bin", + "lychee-lib", ] -license = "Apache-2.0/MIT" -name = "lychee" -repository = "https://github.com/lycheeverse/lychee" -version = "0.6.0" - -[dependencies] -anyhow = "1.0.38" -futures = "0.3.14" -glob = "0.3.0" -http = "0.2.4" -hubcaps = "0.6.2" -linkify = "0.6.0" -regex = "1.4.5" -url = "2.2.1" -check-if-email-exists = "0.8.21" -indicatif = "0.15.0" -structopt = "0.3.21" -toml = "0.5.8" -serde = { version = "1.0.124", features = ["derive"] } -pulldown-cmark = "0.8.0" -html5ever = "0.25.1" -markup5ever = "0.10.0" -markup5ever_rcdom = "0.1.0" -headers = "0.3.4" -derive_builder = "0.10.0" -deadpool = "0.7.0" -shellexpand = "2.1.0" -lazy_static = "1.4.0" -wiremock = "0.5.1" -openssl-sys = "0.9.61" -serde_json = "1.0.64" -# Make build work on Apple Silicon. -# See https://github.com/briansmith/ring/issues/1163 -# This is necessary for the homebrew build -# https://github.com/Homebrew/homebrew-core/pull/70216 -ring = "0.16.20" -pad = "0.1.6" -console = "0.14.1" -fast_chemail = "0.9.6" - -[dependencies.reqwest] -features = ["gzip"] -version = "0.11.3" - -[dependencies.tokio] -features = ["full"] -version = "1.5.0" [patch.crates-io] # Switch back to version on crates.io after 0.6.3+ is released hubcaps = { git="https://github.com/softprops/hubcaps.git" } - -[dev-dependencies] -assert_cmd = "1.0.3" -predicates = "1.0.7" -uuid = { version = "0.8.2", features = ["v4"] } -tempfile = "3.2.0" -doc-comment = "0.3.3" -pretty_assertions = "0.7.1" - -[features] -vendored-openssl = ["openssl-sys/vendored"] diff --git a/README.md b/README.md index a794b60..c9b8995 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ ![Rust](https://github.com/hello-rust/lychee/workflows/Rust/badge.svg) [![docs.rs](https://docs.rs/lychee/badge.svg)](https://docs.rs/lychee) -⚡ A fast, async, resource-friendly link checker written in Rust. -Finds broken hyperlinks and mail addresses inside Markdown, HTML, reStructuredText, or any other text file or website! +⚡ A fast, async, resource-friendly link checker written in Rust.\\ +Finds broken hyperlinks and mail addresses inside Markdown, HTML, reStructuredText, or any other text file or website! Available as a CLI utility and as a GitHub Action: [lycheeverse/lychee-action](https://github.com/lycheeverse/lychee-action). @@ -208,11 +208,11 @@ You can use lychee as a library for your own projects. Here is a "hello world" example: ```rust -use std::error::Error; +use lychee_lib::Result; #[tokio::main] -async fn main() -> Result<(), Box> { - let response = lychee::check("https://github.com/lycheeverse/lychee").await?; +async fn main() -> Result<()> { + let response = lychee_lib::check("https://github.com/lycheeverse/lychee").await?; println!("{}", response); Ok(()) } @@ -221,22 +221,21 @@ async fn main() -> Result<(), Box> { This is equivalent to the following snippet, in which we build our own client: ```rust -use lychee::{ClientBuilder, Status}; -use std::error::Error; +use lychee_lib::{ClientBuilder, Result, Status}; #[tokio::main] -async fn main() -> Result<(), Box> { +async fn main() -> Result<()> { let client = ClientBuilder::default().build()?; let response = client.check("https://github.com/lycheeverse/lychee").await?; - assert!(matches!(response.status, Status::Ok(_))); + assert!(response.status().is_success()); Ok(()) } ``` The client builder is very customizable: -```rust,ignore -let client = lychee::ClientBuilder::default() +```rust, ignore +let client = lychee_lib::ClientBuilder::default() .includes(includes) .excludes(excludes) .max_redirects(cfg.max_redirects) diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml new file mode 100644 index 0000000..f84bac5 --- /dev/null +++ b/lychee-bin/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "lychee" +authors = ["Matthias Endler "] +description = "A glorious link checker" +documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md" +edition = "2018" +homepage = "https://github.com/lycheeverse/lychee" +keywords = [ + "link", + "checker", + "cli", + "link-checker", + "validator", +] +license = "Apache-2.0/MIT" +repository = "https://github.com/lycheeverse/lychee" +version = "0.6.0" + +[dependencies] +lychee-lib = { path = "../lychee-lib", version = "0.6.0" } +anyhow = "1.0.40" +console = "0.14.1" +headers = "0.3.4" +http = "0.2.4" +indicatif = "0.15.0" +lazy_static = "1.4.0" +openssl-sys = "0.9.61" +pad = "0.1.6" +regex = "1.4.5" +reqwest = { version = "0.11.3", features = ["gzip"] } +# Make build work on Apple Silicon. +# See https://github.com/briansmith/ring/issues/1163 +# This is necessary for the homebrew build +# https://github.com/Homebrew/homebrew-core/pull/70216 +ring = "0.16.20" +serde = { version = "1.0.125", features = ["derive"] } +serde_json = "1.0.64" +structopt = "0.3.21" +tokio = { version = "1.5.0", features = ["full"] } +toml = "0.5.8" + +[dev-dependencies] +assert_cmd = "1.0.3" +predicates = "1.0.7" +pretty_assertions = "0.7.1" +tempfile = "3.2.0" +uuid = { version = "0.8.2", features = ["v4"] } +wiremock = "0.5.2" + +[features] +vendored-openssl = ["openssl-sys/vendored"] diff --git a/lychee-bin/LICENSE-APACHE b/lychee-bin/LICENSE-APACHE new file mode 100644 index 0000000..f51e79e --- /dev/null +++ b/lychee-bin/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2020 The lychee maintainers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/lychee-bin/LICENSE-MIT b/lychee-bin/LICENSE-MIT new file mode 100644 index 0000000..31aa793 --- /dev/null +++ b/lychee-bin/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/src/bin/lychee/main.rs b/lychee-bin/src/main.rs similarity index 78% rename from src/bin/lychee/main.rs rename to lychee-bin/src/main.rs index 443dede..27f2ceb 100644 --- a/src/bin/lychee/main.rs +++ b/lychee-bin/src/main.rs @@ -1,23 +1,43 @@ +#![warn(clippy::all, clippy::pedantic)] +#![warn( + absolute_paths_not_starting_with_crate, + invalid_html_tags, + missing_copy_implementations, + missing_debug_implementations, + semicolon_in_expressions_from_macros, + unreachable_pub, + unused_extern_crates, + variant_size_differences, + clippy::missing_const_for_fn +)] +#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)] + +// required for apple silicon +use ring as _; + +use std::{collections::HashSet, fs, str::FromStr, time::Duration}; + use anyhow::{anyhow, Context, Result}; -use headers::authorization::Basic; -use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName}; +use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName}; +use http::StatusCode; use indicatif::{ProgressBar, ProgressStyle}; -use options::Format; +use lychee_lib::{ + collector::{collect_links, Input}, + ClientBuilder, ClientPool, Response, +}; +use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; -use stats::color_response; -use std::{collections::HashSet, time::Duration}; -use std::{fs, str::FromStr}; +use ring as _; // required for apple silicon use structopt::StructOpt; use tokio::sync::mpsc; mod options; mod stats; -use crate::options::{Config, LycheeOptions}; -use crate::stats::ResponseStats; - -use lychee::collector::{self, Input}; -use lychee::{ClientBuilder, ClientPool, Response}; +use crate::{ + options::{Config, Format, LycheeOptions}, + stats::{color_response, ResponseStats}, +}; /// A C-like enum that can be cast to `i32` and used as process exit code. enum ExitCode { @@ -64,7 +84,7 @@ fn run_main() -> Result { } fn show_progress(progress_bar: &Option, response: &Response, verbose: bool) { - let out = color_response(response); + let out = color_response(&response.1); if let Some(pb) = progress_bar { pb.inc(1); pb.set_message(&out); @@ -72,7 +92,7 @@ fn show_progress(progress_bar: &Option, response: &Response, verbos pb.println(out); } } else { - if (response.status.is_success() || response.status.is_excluded()) && !verbose { + if (response.status().is_success() || response.status().is_excluded()) && !verbose { return; } println!("{}", out); @@ -117,26 +137,27 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .github_token(cfg.github_token.clone()) .scheme(cfg.scheme.clone()) .accepted(accepted) - .build()?; + .build() + .map_err(|e| anyhow!(e))?; - let links = collector::collect_links( + let links = collect_links( &inputs, cfg.base_url.clone(), cfg.skip_missing, max_concurrency, ) - .await?; + .await + .map_err(|e| anyhow!(e))?; - let pb = match cfg.no_progress { - true => None, - false => { - let bar = ProgressBar::new(links.len() as u64) - .with_style(ProgressStyle::default_bar().template( + let pb = if cfg.no_progress { + None + } else { + let bar = + ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template( "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}", )); - bar.enable_steady_tick(100); - Some(bar) - } + bar.enable_steady_tick(100); + Some(bar) }; let (send_req, recv_req) = mpsc::channel(max_concurrency); @@ -154,9 +175,9 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } }); + // Start receiving requests tokio::spawn(async move { - // Start receiving requests - let clients: Vec<_> = (0..max_concurrency).map(|_| client.clone()).collect(); + let clients = vec![client; max_concurrency]; let mut clients = ClientPool::new(send_resp, recv_req, clients); clients.listen().await; }); @@ -184,9 +205,10 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { println!("{}", stats_formatted); } - match stats.is_success() { - true => Ok(ExitCode::Success as i32), - false => Ok(ExitCode::LinkCheckFailure as i32), + if stats.is_success() { + Ok(ExitCode::Success as i32) + } else { + Ok(ExitCode::LinkCheckFailure as i32) } } @@ -201,7 +223,7 @@ fn read_header(input: &str) -> Result<(String, String)> { Ok((elements[0].into(), elements[1].into())) } -fn parse_timeout(timeout: usize) -> Duration { +const fn parse_timeout(timeout: usize) -> Duration { Duration::from_secs(timeout as u64) } @@ -217,10 +239,10 @@ fn parse_headers>(headers: &[T]) -> Result { Ok(out) } -fn parse_statuscodes>(accept: T) -> Result> { +fn parse_statuscodes>(accept: T) -> Result> { let mut statuscodes = HashSet::new(); - for code in accept.as_ref().split(',').into_iter() { - let code: reqwest::StatusCode = reqwest::StatusCode::from_bytes(code.as_bytes())?; + for code in accept.as_ref().split(',') { + let code: StatusCode = StatusCode::from_bytes(code.as_bytes())?; statuscodes.insert(code); } Ok(statuscodes) @@ -239,12 +261,15 @@ fn parse_basic_auth(auth: &str) -> Result> { #[cfg(test)] mod test { - use super::*; - use pretty_assertions::assert_eq; + use std::{array, collections::HashSet}; + use headers::{HeaderMap, HeaderMapExt}; use http::StatusCode; + use pretty_assertions::assert_eq; use reqwest::header; + use super::{parse_basic_auth, parse_headers, parse_statuscodes}; + #[test] fn test_parse_custom_headers() { let mut custom = HeaderMap::new(); @@ -255,14 +280,13 @@ mod test { #[test] fn test_parse_statuscodes() { let actual = parse_statuscodes("200,204,301").unwrap(); - let expected: HashSet = [ + let expected = array::IntoIter::new([ StatusCode::OK, StatusCode::NO_CONTENT, StatusCode::MOVED_PERMANENTLY, - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); + assert_eq!(actual, expected); } @@ -277,6 +301,7 @@ mod test { let mut actual = HeaderMap::new(); let auth_header = parse_basic_auth("aladin:abretesesamo").unwrap(); actual.typed_insert(auth_header); + assert_eq!(expected, actual); } } diff --git a/src/bin/lychee/options.rs b/lychee-bin/src/options.rs similarity index 85% rename from src/bin/lychee/options.rs rename to lychee-bin/src/options.rs index 8376541..63e2ca5 100644 --- a/src/bin/lychee/options.rs +++ b/lychee-bin/src/options.rs @@ -1,20 +1,27 @@ -use lychee::collector::Input; +use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr}; use anyhow::{anyhow, Error, Result}; use lazy_static::lazy_static; +use lychee_lib::collector::Input; use serde::Deserialize; -use std::str::FromStr; -use std::{fs, io::ErrorKind, path::PathBuf}; use structopt::{clap::crate_version, StructOpt}; -pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!()); const METHOD: &str = "get"; const TIMEOUT: usize = 20; const MAX_CONCURRENCY: usize = 128; const MAX_REDIRECTS: usize = 10; +const USER_AGENT: &str = concat!("lychee/", crate_version!()); + +// this exists because structopt requires `&str` type values for defaults +// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde) +lazy_static! { + static ref TIMEOUT_STR: String = TIMEOUT.to_string(); + static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string(); + static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string(); +} #[derive(Debug, Deserialize)] -pub enum Format { +pub(crate) enum Format { String, Json, } @@ -36,18 +43,11 @@ impl Default for Format { } } -// this exists because structopt requires `&str` type values for defaults -// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde) -lazy_static! { - static ref TIMEOUT_STR: String = TIMEOUT.to_string(); - static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string(); - static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string(); -} - // Macro for generating default functions to be used by serde macro_rules! default_function { ( $( $name:ident : $T:ty = $e:expr; )* ) => { $( + #[allow(clippy::missing_const_for_fn)] fn $name() -> $T { $e } @@ -90,10 +90,10 @@ pub(crate) struct LycheeOptions { /// Configuration file to use #[structopt(short, long = "config", default_value = "./lychee.toml")] - pub config_file: String, + pub(crate) config_file: String, #[structopt(flatten)] - pub config: Config, + pub(crate) config: Config, } impl LycheeOptions { @@ -109,142 +109,143 @@ impl LycheeOptions { } } +#[allow(clippy::struct_excessive_bools)] #[derive(Debug, Deserialize, StructOpt)] -pub struct Config { +pub(crate) struct Config { /// Verbose program output #[structopt(short, long)] #[serde(default)] - pub verbose: bool, + pub(crate) verbose: bool, /// Do not show progress bar. /// This is recommended for non-interactive shells (e.g. for continuous /// integration) #[structopt(short, long)] #[serde(default)] - pub no_progress: bool, + pub(crate) no_progress: bool, /// Maximum number of allowed redirects #[structopt(short, long, default_value = &MAX_REDIRECTS_STR)] #[serde(default = "max_redirects")] - pub max_redirects: usize, + pub(crate) max_redirects: usize, /// Maximum number of concurrent network requests #[structopt(long, default_value = &MAX_CONCURRENCY_STR)] #[serde(default = "max_concurrency")] - pub max_concurrency: usize, + pub(crate) max_concurrency: usize, /// Number of threads to utilize. /// Defaults to number of cores available to the system #[structopt(short = "T", long)] #[serde(default)] - pub threads: Option, + pub(crate) threads: Option, /// User agent #[structopt(short, long, default_value = USER_AGENT)] #[serde(default = "user_agent")] - pub user_agent: String, + pub(crate) user_agent: String, /// Proceed for server connections considered insecure (invalid TLS) #[structopt(short, long)] #[serde(default)] - pub insecure: bool, + pub(crate) insecure: bool, /// Only test links with the given scheme (e.g. https) #[structopt(short, long)] #[serde(default)] - pub scheme: Option, + pub(crate) scheme: Option, /// URLs to check (supports regex). Has preference over all excludes. #[structopt(long)] #[serde(default)] - pub include: Vec, + pub(crate) include: Vec, /// Exclude URLs from checking (supports regex) #[structopt(long)] #[serde(default)] - pub exclude: Vec, + pub(crate) exclude: Vec, /// Exclude all private IPs from checking. /// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback` #[structopt(short = "E", long)] #[serde(default)] - pub exclude_all_private: bool, + pub(crate) exclude_all_private: bool, /// Exclude private IP address ranges from checking #[structopt(long)] #[serde(default)] - pub exclude_private: bool, + pub(crate) exclude_private: bool, /// Exclude link-local IP address range from checking #[structopt(long)] #[serde(default)] - pub exclude_link_local: bool, + pub(crate) exclude_link_local: bool, /// Exclude loopback IP address range from checking #[structopt(long)] #[serde(default)] - pub exclude_loopback: bool, + pub(crate) exclude_loopback: bool, /// Exclude all mail addresses from checking #[structopt(long)] #[serde(default)] - pub exclude_mail: bool, + pub(crate) exclude_mail: bool, /// Custom request headers #[structopt(short, long)] #[serde(default)] - pub headers: Vec, + pub(crate) headers: Vec, /// Comma-separated list of accepted status codes for valid links #[structopt(short, long)] #[serde(default)] - pub accept: Option, + pub(crate) accept: Option, /// Website timeout from connect to response finished #[structopt(short, long, default_value = &TIMEOUT_STR)] #[serde(default = "timeout")] - pub timeout: usize, + pub(crate) timeout: usize, /// Request method // Using `-X` as a short param similar to curl #[structopt(short = "X", long, default_value = METHOD)] #[serde(default = "method")] - pub method: String, + pub(crate) method: String, /// Base URL to check relative URLs #[structopt(short, long)] #[serde(default)] - pub base_url: Option, + pub(crate) base_url: Option, /// Basic authentication support. E.g. `username:password` #[structopt(long)] #[serde(default)] - pub basic_auth: Option, + pub(crate) basic_auth: Option, /// GitHub API token to use when checking github.com links, to avoid rate limiting #[structopt(long, env = "GITHUB_TOKEN")] #[serde(default)] - pub github_token: Option, + pub(crate) github_token: Option, /// Skip missing input files (default is to error if they don't exist) #[structopt(long)] #[serde(default)] - pub skip_missing: bool, + pub(crate) skip_missing: bool, /// Ignore case when expanding filesystem path glob inputs #[structopt(long)] #[serde(default)] - pub glob_ignore_case: bool, + pub(crate) glob_ignore_case: bool, /// Output file of status report #[structopt(short, long, parse(from_os_str))] #[serde(default)] - pub output: Option, + pub(crate) output: Option, /// Output file format of status report (json, string) #[structopt(short, long, default_value = "string")] #[serde(default)] - pub format: Format, + pub(crate) format: Format, } impl Config { diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs new file mode 100644 index 0000000..04dc82e --- /dev/null +++ b/lychee-bin/src/stats.rs @@ -0,0 +1,194 @@ +use std::{ + collections::{HashMap, HashSet}, + fmt::{self, Display}, +}; + +use console::style; +use lychee_lib::{Input, Response, ResponseBody, Status}; +use pad::{Alignment, PadStr}; +use serde::Serialize; + +// Maximum padding for each entry in the final statistics output +const MAX_PADDING: usize = 20; + +pub(crate) fn color_response(response: &ResponseBody) -> String { + let out = match response.status { + Status::Ok(_) => style(response).green().bright(), + Status::Redirected(_) => style(response), + Status::Excluded => style(response).dim(), + Status::Timeout(_) => style(response).yellow().bright(), + Status::Error(_) => style(response).red().bright(), + }; + out.to_string() +} + +#[derive(Default, Serialize)] +pub(crate) struct ResponseStats { + total: usize, + successful: usize, + failures: usize, + timeouts: usize, + redirects: usize, + excludes: usize, + errors: usize, + fail_map: HashMap>, +} + +impl ResponseStats { + #[inline] + pub(crate) fn new() -> Self { + Self::default() + } + + pub(crate) fn add(&mut self, response: Response) { + let Response(source, ResponseBody { ref status, .. }) = response; + + self.total += 1; + + match status { + Status::Ok(_) => self.successful += 1, + Status::Error(_) => self.failures += 1, + Status::Timeout(_) => self.timeouts += 1, + Status::Redirected(_) => self.redirects += 1, + Status::Excluded => self.excludes += 1, + } + + if matches!( + status, + Status::Error(_) | Status::Timeout(_) | Status::Redirected(_) + ) { + let fail = self.fail_map.entry(source).or_default(); + fail.insert(response.1); + }; + } + + #[inline] + pub(crate) const fn is_success(&self) -> bool { + self.total == self.successful + self.excludes + } + + #[inline] + pub(crate) const fn is_empty(&self) -> bool { + self.total == 0 + } +} + +fn write_stat(f: &mut fmt::Formatter, title: &str, stat: usize, newline: bool) -> fmt::Result { + let fill = title.chars().count(); + f.write_str(title)?; + f.write_str( + &stat + .to_string() + .pad(MAX_PADDING - fill, '.', Alignment::Right, false), + )?; + + if newline { + f.write_str("\n")?; + } + + Ok(()) +} + +impl Display for ResponseStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let separator = "-".repeat(MAX_PADDING + 1); + + writeln!(f, "\u{1f4dd} Summary")?; // 📝 + writeln!(f, "{}", separator)?; + write_stat(f, "\u{1f50d} Total", self.total, true)?; // 🔍 + write_stat(f, "\u{2705} Successful", self.successful, true)?; // ✅ + write_stat(f, "\u{23f3} Timeouts", self.timeouts, true)?; // ⏳ + write_stat(f, "\u{1f500} Redirected", self.redirects, true)?; // 🔀 + write_stat(f, "\u{1f47b} Excluded", self.excludes, true)?; // 👻 + write_stat(f, "\u{1f6ab} Errors", self.errors + self.failures, false)?; // 🚫 + + for (input, responses) in &self.fail_map { + // Using leading newlines over trailing ones (e.g. `writeln!`) + // lets us avoid extra newlines without any additional logic. + write!(f, "\n\nErrors in {}", input)?; + for response in responses { + write!(f, "\n{}", color_response(response))? + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod test { + use std::collections::{HashMap, HashSet}; + + use http::StatusCode; + use lychee_lib::{ClientBuilder, Input, Response, ResponseBody, Status, Uri}; + use pretty_assertions::assert_eq; + use reqwest::Url; + use wiremock::{matchers::path, Mock, MockServer, ResponseTemplate}; + + use super::ResponseStats; + + fn website(url: &str) -> Uri { + Uri::from(Url::parse(url).expect("Expected valid Website URI")) + } + + async fn get_mock_status_response(status_code: S) -> Response + where + S: Into, + { + let mock_server = MockServer::start().await; + let template = ResponseTemplate::new(status_code.into()); + + Mock::given(path("/")) + .respond_with(template) + .mount(&mock_server) + .await; + + ClientBuilder::default() + .build() + .unwrap() + .check(mock_server.uri()) + .await + .unwrap() + } + + #[test] + fn test_stats_is_empty() { + let mut stats = ResponseStats::new(); + assert!(stats.is_empty()); + + stats.add(Response( + Input::Stdin, + ResponseBody { + uri: website("http://example.org/ok"), + status: Status::Ok(StatusCode::OK), + }, + )); + + assert!(!stats.is_empty()); + } + + #[tokio::test] + async fn test_stats() { + let stata = [ + StatusCode::OK, + StatusCode::PERMANENT_REDIRECT, + StatusCode::BAD_GATEWAY, + ]; + + let mut stats = ResponseStats::new(); + for status in &stata { + stats.add(get_mock_status_response(status).await); + } + + let mut expected_map: HashMap> = HashMap::new(); + for status in &stata { + if status.is_server_error() || status.is_client_error() || status.is_redirection() { + let Response(input, response_body) = get_mock_status_response(status).await; + let entry = expected_map.entry(input).or_default(); + entry.insert(response_body); + } + } + + assert_eq!(stats.fail_map, expected_map); + } +} diff --git a/tests/cli.rs b/lychee-bin/tests/cli.rs similarity index 56% rename from tests/cli.rs rename to lychee-bin/tests/cli.rs index c4ec718..d69f53a 100644 --- a/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1,106 +1,161 @@ #[cfg(test)] mod cli { - use pretty_assertions::assert_eq; + use std::{ + fs::{self, File}, + io::Write, + path::{Path, PathBuf}, + }; - use anyhow::Result; use assert_cmd::Command; - use lychee::test_utils; + use http::StatusCode; + use lychee_lib::Result; use predicates::str::contains; - use std::fs::{self, File}; - use std::io::Write; - use std::path::{Path, PathBuf}; + use pretty_assertions::assert_eq; use uuid::Uuid; + macro_rules! mock_server { + ($status:expr $(, $func:tt ($($arg:expr),*))*) => {{ + let mock_server = wiremock::MockServer::start().await; + let template = wiremock::ResponseTemplate::new(http::StatusCode::from($status)); + let template = template$(.$func($($arg),*))*; + wiremock::Mock::given(wiremock::matchers::method("GET")).respond_with(template).mount(&mock_server).await; + mock_server + }}; + } + fn main_command() -> Command { // this gets the "main" binary name (e.g. `lychee`) Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") } fn fixtures_path() -> PathBuf { - Path::new(module_path!()).parent().unwrap().join("fixtures") + Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .join("fixtures") + } + + #[derive(Default)] + struct MockResponseStats { + total: usize, + successful: usize, + failures: usize, + timeouts: usize, + redirects: usize, + excludes: usize, + errors: usize, + } + + impl MockResponseStats { + fn to_json_str(&self) -> String { + format!( + r#"{{ + "total": {}, + "successful": {}, + "failures": {}, + "timeouts": {}, + "redirects": {}, + "excludes": {}, + "errors": {}, + "fail_map": {{}} +}}"#, + self.total, + self.successful, + self.failures, + self.timeouts, + self.redirects, + self.excludes, + self.errors + ) + } + } + + macro_rules! test_json_output { + ($test_file:expr, $expected:expr $(, $arg:expr)*) => {{ + let mut cmd = main_command(); + let test_path = fixtures_path().join($test_file); + let outfile = format!("{}.json", uuid::Uuid::new_v4()); + + let expected = $expected.to_json_str(); + + cmd$(.arg($arg))*.arg("--output").arg(&outfile).arg("--format").arg("json").arg(test_path).assert().success(); + + let output = std::fs::read_to_string(&outfile)?; + assert_eq!(output, expected); + std::fs::remove_file(outfile)?; + Ok(()) + }}; } #[test] - fn test_exclude_all_private() { - let mut cmd = main_command(); - - let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); - - // assert that the command runs OK, and that it excluded all the links - cmd.arg("--exclude-all-private") - .arg("--verbose") - .arg(test_all_private_path) - .assert() - .success() - .stdout(contains("Total............7")) - .stdout(contains("Excluded.........7")) - .stdout(contains("Successful.......0")) - .stdout(contains("Errors...........0")); + fn test_exclude_all_private() -> Result<()> { + test_json_output!( + "TEST_ALL_PRIVATE.md", + MockResponseStats { + total: 7, + excludes: 7, + ..MockResponseStats::default() + }, + "--exclude-all-private", + "--verbose" + ) } #[test] - fn test_exclude_email() { - let mut cmd = main_command(); - - let test_path = fixtures_path().join("TEST_EMAIL.md"); - - // assert that the command runs OK, and that it excluded all the links - cmd.arg("--exclude-mail") - .arg(test_path) - .assert() - .success() - .stdout(contains("Total............6")) - .stdout(contains("Excluded.........4")) - .stdout(contains("Successful.......2")) - .stdout(contains("Errors...........0")); + fn test_exclude_email() -> Result<()> { + test_json_output!( + "TEST_EMAIL.md", + MockResponseStats { + total: 6, + excludes: 4, + successful: 2, + ..MockResponseStats::default() + }, + "--exclude-mail" + ) } /// Test that a GitHub link can be checked without specifying the token. #[test] - fn test_check_github_no_token() { - let mut cmd = main_command(); - let test_github_path = fixtures_path().join("TEST_GITHUB.md"); - - cmd.arg("--verbose") - .arg(test_github_path) - .assert() - .success() - .stdout(contains("Total............1")) - .stdout(contains("Excluded.........0")) - .stdout(contains("Successful.......1")) - .stdout(contains("Errors...........0")); + fn test_check_github_no_token() -> Result<()> { + test_json_output!( + "TEST_GITHUB.md", + MockResponseStats { + total: 1, + successful: 1, + ..MockResponseStats::default() + } + ) } #[test] - fn test_quirks() { - let mut cmd = main_command(); - let test_quirks_path = fixtures_path().join("TEST_QUIRKS.txt"); - - cmd.arg("--verbose") - .arg(test_quirks_path) - .assert() - .success() - .stdout(contains("Total............2")) - .stdout(contains("Excluded.........0")) - .stdout(contains("Successful.......2")) - .stdout(contains("Errors...........0")); + fn test_quirks() -> Result<()> { + test_json_output!( + "TEST_QUIRKS.txt", + MockResponseStats { + total: 2, + successful: 2, + ..MockResponseStats::default() + } + ) } #[tokio::test] - async fn test_failure_404_link() { - let mut cmd = main_command(); - let mock_server = test_utils::get_mock_server(http::StatusCode::NOT_FOUND).await; - let dir = tempfile::tempdir().expect("Failed to create tempdir"); + async fn test_failure_404_link() -> Result<()> { + let mock_server = mock_server!(StatusCode::NOT_FOUND); + let dir = tempfile::tempdir()?; let file_path = dir.path().join("test.txt"); - let mut file = File::create(&file_path).expect("Failed to create tempfile"); - - writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); + let mut file = File::create(&file_path)?; + writeln!(file, "{}", mock_server.uri())?; + let mut cmd = main_command(); cmd.arg(file_path) .write_stdin(mock_server.uri()) .assert() .failure() .code(2); + + Ok(()) } #[test] @@ -121,7 +176,7 @@ mod cli { #[tokio::test] async fn test_stdin_input() { let mut cmd = main_command(); - let mock_server = test_utils::get_mock_server(http::StatusCode::OK).await; + let mock_server = mock_server!(StatusCode::OK); cmd.arg("-") .write_stdin(mock_server.uri()) @@ -132,8 +187,7 @@ mod cli { #[tokio::test] async fn test_stdin_input_failure() { let mut cmd = main_command(); - let mock_server = - test_utils::get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; + let mock_server = mock_server!(StatusCode::INTERNAL_SERVER_ERROR); cmd.arg("-") .write_stdin(mock_server.uri()) @@ -145,8 +199,8 @@ mod cli { #[tokio::test] async fn test_stdin_input_multiple() { let mut cmd = main_command(); - let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await; - let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await; + let mock_server_a = mock_server!(StatusCode::OK); + let mock_server_b = mock_server!(StatusCode::OK); // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. @@ -168,7 +222,7 @@ mod cli { .failure() .code(1) .stderr(contains(format!( - "Error: Failed to read file: `{}`", + "Error: Failed to read file: `{}`, reason: No such file or directory (os error 2)", filename ))); } @@ -187,8 +241,8 @@ mod cli { let mut cmd = main_command(); let dir = tempfile::tempdir()?; - let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await; - let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await; + let mock_server_a = mock_server!(StatusCode::OK); + let mock_server_b = mock_server!(StatusCode::OK); let mut file_a = File::create(dir.path().join("a.md"))?; let mut file_b = File::create(dir.path().join("b.md"))?; @@ -210,8 +264,8 @@ mod cli { let mut cmd = main_command(); let dir = tempfile::tempdir()?; - let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await; - let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await; + let mock_server_a = mock_server!(StatusCode::OK); + let mock_server_b = mock_server!(StatusCode::OK); let mut file_a = File::create(dir.path().join("README.md"))?; let mut file_b = File::create(dir.path().join("readme.md"))?; @@ -236,7 +290,7 @@ mod cli { let subdir_level_1 = tempfile::tempdir_in(&dir)?; let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; - let mock_server = test_utils::get_mock_server(http::StatusCode::OK).await; + let mock_server = mock_server!(StatusCode::OK); let mut file = File::create(subdir_level_2.path().join("test.md"))?; writeln!(file, "{}", mock_server.uri().as_str())?; @@ -266,7 +320,7 @@ mod cli { .assert() .success(); - let expected = r##"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"##; + let expected = r#"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; let output = fs::read_to_string(&outfile)?; assert_eq!(output.split_whitespace().collect::(), expected); fs::remove_file(outfile)?; diff --git a/tests/usage.rs b/lychee-bin/tests/usage.rs similarity index 90% rename from tests/usage.rs rename to lychee-bin/tests/usage.rs index f1eb32b..8f66d63 100644 --- a/tests/usage.rs +++ b/lychee-bin/tests/usage.rs @@ -1,11 +1,13 @@ #[cfg(test)] mod readme { - use pretty_assertions::assert_eq; + use std::{ + fs::File, + io::{BufReader, Read}, + path::Path, + }; use assert_cmd::Command; - use std::fs::File; - use std::io::{BufReader, Read}; - use std::path::Path; + use pretty_assertions::assert_eq; fn main_command() -> Command { // this gets the "main" binary name (e.g. `lychee`) @@ -13,7 +15,7 @@ mod readme { } fn load_readme_text() -> String { - let readme_path = Path::new(module_path!()) + let readme_path = Path::new(env!("CARGO_MANIFEST_DIR")) .parent() .unwrap() .join("README.md"); @@ -38,7 +40,7 @@ mod readme { fn test_readme_usage_up_to_date() { let mut cmd = main_command(); - let result = cmd.arg("--help").assert().success(); + let result = cmd.env_clear().arg("--help").assert().success(); let help_output = std::str::from_utf8(&result.get_output().stdout) .expect("Invalid utf8 output for `--help`"); let readme = load_readme_text(); diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml new file mode 100644 index 0000000..a35623d --- /dev/null +++ b/lychee-lib/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "lychee-lib" +authors = ["Matthias Endler "] +description = "A glorious link checker" +documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md" +edition = "2018" +homepage = "https://github.com/lycheeverse/lychee" +keywords = [ + "link", + "checker", + "cli", + "link-checker", + "validator", +] +license = "Apache-2.0/MIT" +repository = "https://github.com/lycheeverse/lychee" +version = "0.6.0" + +[dependencies] +check-if-email-exists = "0.8.21" +deadpool = "0.7.0" +derive_builder = "0.10.0" +fast_chemail = "0.9.6" +glob = "0.3.0" +html5ever = "0.25.1" +http = "0.2.4" +hubcaps = "0.6.2" +linkify = "0.6.0" +markup5ever_rcdom = "0.1.0" +openssl-sys = "0.9.61" +pulldown-cmark = "0.8.0" +regex = "1.4.5" +reqwest = { version = "0.11.3", features = ["gzip"] } +# Make build work on Apple Silicon. +# See https://github.com/briansmith/ring/issues/1163 +# This is necessary for the homebrew build +# https://github.com/Homebrew/homebrew-core/pull/70216 +ring = "0.16.20" +serde = { version = "1.0.125", features = ["derive"] } +shellexpand = "2.1.0" +tokio = { version = "1.5.0", features = ["full"] } +url = { version = "2.2.1", features = ["serde"] } + +[dev-dependencies] +doc-comment = "0.3.3" +pretty_assertions = "0.7.1" +tempfile = "3.2.0" +wiremock = "0.5.2" + +[features] +vendored-openssl = ["openssl-sys/vendored"] diff --git a/lychee-lib/LICENSE-APACHE b/lychee-lib/LICENSE-APACHE new file mode 100644 index 0000000..f51e79e --- /dev/null +++ b/lychee-lib/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2020 The lychee maintainers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/lychee-lib/LICENSE-MIT b/lychee-lib/LICENSE-MIT new file mode 100644 index 0000000..31aa793 --- /dev/null +++ b/lychee-lib/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs new file mode 100644 index 0000000..b3661bf --- /dev/null +++ b/lychee-lib/src/client.rs @@ -0,0 +1,398 @@ +#![allow( + clippy::module_name_repetitions, + clippy::struct_excessive_bools, + clippy::default_trait_access +)] +use std::{collections::HashSet, convert::TryFrom, time::Duration}; + +use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; +use derive_builder::Builder; +use http::{ + header::{HeaderMap, HeaderValue}, + StatusCode, +}; +use hubcaps::{Credentials, Github}; +use regex::RegexSet; +use reqwest::header; +use tokio::time::sleep; + +use crate::{ + filter::{Excludes, Filter, Includes}, + quirks::Quirks, + uri::Uri, + ErrorKind, Request, Response, Result, Status, +}; + +const DEFAULT_MAX_REDIRECTS: usize = 5; +const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION")); + +#[derive(Debug, Clone)] +pub struct Client { + /// Underlying reqwest client instance that handles the HTTP requests. + reqwest_client: reqwest::Client, + /// Github client. + github_client: Option, + /// Filtered domain handling. + filter: Filter, + /// Default request HTTP method to use. + method: reqwest::Method, + /// The set of accepted HTTP status codes for valid URIs. + accepted: Option>, + /// Override behavior for certain known issues with URIs. + quirks: Quirks, +} + +/// A link checker using an API token for Github links +/// otherwise a normal HTTP client. +#[allow(unreachable_pub)] +#[derive(Builder, Debug)] +#[builder(build_fn(skip))] +#[builder(setter(into))] +#[builder(name = "ClientBuilder")] +pub struct ClientBuilderInternal { + /// Set an optional Github token. + /// This allows for more requests before + /// getting rate-limited. + github_token: Option, + /// Check links matching this set of regular expressions + includes: Option, + /// Exclude links matching this set of regular expressions + excludes: Option, + /// Exclude all private network addresses + exclude_all_private: bool, + /// Exclude private IP addresses + exclude_private_ips: bool, + /// Exclude link-local IPs + exclude_link_local_ips: bool, + /// Exclude loopback IP addresses (e.g. 127.0.0.1) + exclude_loopback_ips: bool, + /// Don't check mail addresses + exclude_mail: bool, + /// Maximum number of redirects before returning error + max_redirects: usize, + /// User agent used for checking links + user_agent: String, + /// Ignore SSL errors + allow_insecure: bool, + /// Allowed URI scheme (e.g. https, http). + /// This excludes all links from checking, which + /// don't specify that scheme in the URL. + scheme: Option, + /// Map of headers to send to each resource. + /// This allows working around validation issues + /// on some websites. + custom_headers: HeaderMap, + /// Request method (e.g. `GET` or `HEAD`) + method: reqwest::Method, + /// Set of accepted return codes / status codes + accepted: Option>, + /// Response timeout per request + timeout: Option, +} + +impl ClientBuilder { + fn build_excludes(&self) -> Excludes { + // exclude_all_private option turns on all "private" excludes, + // including private IPs, link-local IPs and loopback IPs + let exclude_all_private = matches!(self.exclude_all_private, Some(true)); + let enable_exclude = |opt| exclude_all_private || matches!(opt, Some(true)); + + Excludes { + regex: self.excludes.clone().unwrap_or_default(), + private_ips: enable_exclude(self.exclude_private_ips), + link_local_ips: enable_exclude(self.exclude_link_local_ips), + loopback_ips: enable_exclude(self.exclude_loopback_ips), + mail: self.exclude_mail.unwrap_or_default(), + } + } + + fn build_includes(&self) -> Includes { + let regex = self.includes.clone().flatten(); + Includes { regex } + } + + /// The build method instantiates the client. + #[allow(clippy::missing_errors_doc)] + pub fn build(&self) -> Result { + // Faking the user agent is necessary for some websites, unfortunately. + // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). + let user_agent = self + .user_agent + .clone() + .unwrap_or_else(|| DEFAULT_USER_AGENT.to_owned()); + + let mut headers = self.custom_headers.clone().unwrap_or_default(); + headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?); + headers.insert( + header::TRANSFER_ENCODING, + HeaderValue::from_static("chunked"), + ); + + let allow_insecure = self.allow_insecure.unwrap_or(false); + let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS); + + let builder = reqwest::ClientBuilder::new() + .gzip(true) + .default_headers(headers) + .danger_accept_invalid_certs(allow_insecure) + .redirect(reqwest::redirect::Policy::limited(max_redirects)); + + let timeout = self.timeout.flatten(); + + let reqwest_client = (match timeout { + Some(t) => builder.timeout(t), + None => builder, + }) + .build()?; + + let github_token = match self.github_token.clone().flatten() { + Some(token) if !token.is_empty() => { + Some(Github::new(user_agent, Credentials::Token(token))?) + } + _ => None, + }; + + let includes = self.build_includes(); + let excludes = self.build_excludes(); + let scheme = self.scheme.clone().flatten().map(|s| s.to_lowercase()); + + let filter = Filter::new(Some(includes), Some(excludes), scheme); + + let quirks = Quirks::default(); + + Ok(Client { + reqwest_client, + github_client: github_token, + filter, + quirks, + method: self.method.clone().unwrap_or(reqwest::Method::GET), + accepted: self.accepted.clone().unwrap_or_default(), + }) + } +} + +impl Client { + pub async fn check(&self, request: T) -> Result + where + Request: TryFrom, + ErrorKind: From, + { + let Request { uri, source } = Request::try_from(request)?; + let status = if self.filter.is_excluded(&uri) { + Status::Excluded + } else if uri.scheme() == "mailto" { + self.check_mail(&uri).await + } else { + self.check_website(&uri).await + }; + + Ok(Response::new(uri, status, source)) + } + + pub async fn check_website(&self, uri: &Uri) -> Status { + let mut retries: i64 = 3; + let mut wait: u64 = 1; + + let mut status = self.check_default(uri).await; + while retries > 0 { + if status.is_success() { + return status; + } + retries -= 1; + sleep(Duration::from_secs(wait)).await; + wait *= 2; + status = self.check_default(uri).await; + } + // Pull out the heavy weapons in case of a failed normal request. + // This could be a Github URL and we run into the rate limiter. + if let Some((owner, repo)) = uri.extract_github() { + return self.check_github(owner, repo).await; + } + + status + } + + async fn check_github(&self, owner: &str, repo: &str) -> Status { + match &self.github_client { + Some(github) => github + .repo(owner, repo) + .get() + .await + .map_or_else(|e| e.into(), |_| Status::Ok(StatusCode::OK)), + None => ErrorKind::MissingGitHubToken.into(), + } + } + + async fn check_default(&self, uri: &Uri) -> Status { + let request = match self + .reqwest_client + .request(self.method.clone(), uri.as_str()) + .build() + { + Ok(r) => r, + Err(e) => return e.into(), + }; + + let request = self.quirks.apply(request); + + match self.reqwest_client.execute(request).await { + Ok(ref response) => Status::new(response, self.accepted.clone()), + Err(e) => e.into(), + } + } + + pub async fn check_mail(&self, uri: &Uri) -> Status { + let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); + let result = &(check_email(&input).await)[0]; + + if let Reachable::Invalid = result.is_reachable { + ErrorKind::UnreachableEmailAddress(uri.clone()).into() + } else { + Status::Ok(StatusCode::OK) + } + } +} + +/// A convenience function to check a single URI +/// This is the most simple link check and avoids having to create a client manually. +/// For more complex scenarios, look into using the [`ClientBuilder`] instead. +#[allow(clippy::missing_errors_doc)] +pub async fn check(request: T) -> Result +where + Request: TryFrom, + ErrorKind: From, +{ + let client = ClientBuilder::default().build()?; + Ok(client.check(request).await?) +} + +#[cfg(test)] +mod test { + use std::time::{Duration, Instant}; + + use http::{header::HeaderMap, StatusCode}; + use reqwest::header; + + use super::ClientBuilder; + use crate::{mock_server, test_utils::get_mock_client_response}; + + #[tokio::test] + async fn test_nonexistent() { + let mock_server = mock_server!(StatusCode::NOT_FOUND); + let res = get_mock_client_response(mock_server.uri()).await; + + assert!(res.status().is_failure()); + } + + #[tokio::test] + async fn test_nonexistent_with_path() { + let res = get_mock_client_response("http://127.0.0.1/invalid").await; + + assert!(res.status().is_failure()); + } + + #[tokio::test] + async fn test_exponential_backoff() { + let mock_server = mock_server!(StatusCode::NOT_FOUND); + + let start = Instant::now(); + let res = get_mock_client_response(mock_server.uri()).await; + let end = start.elapsed(); + + assert!(res.status().is_failure()); + + // on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs) + assert!(end.as_secs() >= 7); + assert!(end.as_secs() <= 8); + } + + #[tokio::test] + async fn test_github() { + let res = get_mock_client_response("https://github.com/lycheeverse/lychee").await; + + assert!(res.status().is_success()); + } + + #[tokio::test] + async fn test_github_nonexistent() { + let res = get_mock_client_response("https://github.com/lycheeverse/not-lychee").await; + + assert!(res.status().is_failure()); + } + + #[tokio::test] + async fn test_youtube() { + // This is applying a quirk. See the quirks module. + let res = get_mock_client_response("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").await; + assert!(res.status().is_success()); + + let res = get_mock_client_response("https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").await; + assert!(res.status().is_failure()); + } + + #[tokio::test] + async fn test_non_github() { + let mock_server = mock_server!(StatusCode::OK); + let res = get_mock_client_response(mock_server.uri()).await; + + assert!(res.status().is_success()); + } + + #[tokio::test] + async fn test_invalid_ssl() { + let res = get_mock_client_response("https://expired.badssl.com/").await; + + assert!(res.status().is_failure()); + + // Same, but ignore certificate error + let res = ClientBuilder::default() + .allow_insecure(true) + .build() + .unwrap() + .check("https://expired.badssl.com/") + .await + .unwrap(); + assert!(res.status().is_success()); + } + + #[tokio::test] + async fn test_custom_headers() { + let res = get_mock_client_response("https://crates.io/crates/lychee/").await; + + assert!(res.status().is_failure()); + + // Try again, but with a custom header. + // For example, crates.io requires a custom accept header. + // See https://github.com/rust-lang/crates.io/issues/788 + let mut custom = HeaderMap::new(); + custom.insert(header::ACCEPT, "text/html".parse().unwrap()); + let res = ClientBuilder::default() + .custom_headers(custom) + .build() + .unwrap() + .check("https://crates.io/crates/lychee") + .await + .unwrap(); + assert!(res.status().is_success()); + } + + #[tokio::test] + async fn test_timeout() { + // Note: this checks response timeout, not connect timeout. + // To check connect timeout, we'd have to do something more involved, + // see: https://github.com/LukeMathWalker/wiremock-rs/issues/19 + let mock_delay = Duration::from_millis(20); + let checker_timeout = Duration::from_millis(10); + assert!(mock_delay > checker_timeout); + + let mock_server = mock_server!(StatusCode::OK, set_delay(mock_delay)); + + let client = ClientBuilder::default() + .timeout(checker_timeout) + .build() + .unwrap(); + + let res = client.check(mock_server.uri()).await.unwrap(); + assert!(res.status().is_timeout()); + } +} diff --git a/src/client_pool.rs b/lychee-lib/src/client_pool.rs similarity index 73% rename from src/client_pool.rs rename to lychee-lib/src/client_pool.rs index 56e58b3..438e0ab 100644 --- a/src/client_pool.rs +++ b/lychee-lib/src/client_pool.rs @@ -4,6 +4,7 @@ use tokio::sync::mpsc; use crate::{client, types}; +#[allow(missing_debug_implementations)] pub struct ClientPool { tx: mpsc::Sender, rx: mpsc::Receiver, @@ -11,6 +12,7 @@ pub struct ClientPool { } impl ClientPool { + #[must_use] pub fn new( tx: mpsc::Sender, rx: mpsc::Receiver, @@ -20,12 +22,15 @@ impl ClientPool { ClientPool { tx, rx, pool } } + #[allow(clippy::missing_panics_doc)] pub async fn listen(&mut self) { while let Some(req) = self.rx.recv().await { let client = self.pool.get().await; let tx = self.tx.clone(); tokio::spawn(async move { - let resp = client.check(req).await.expect("Invalid URI"); + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = client.check(req).await.unwrap(); tx.send(resp) .await .expect("Cannot send response to channel"); diff --git a/src/collector.rs b/lychee-lib/src/collector.rs similarity index 65% rename from src/collector.rs rename to lychee-lib/src/collector.rs index cf6d9a6..fb59be3 100644 --- a/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,32 +1,42 @@ -use crate::{ - extract::{extract_links, FileType}, - Request, +use std::{ + collections::HashSet, + fmt::Display, + path::{Path, PathBuf}, }; -use anyhow::{anyhow, Context, Result}; + use glob::glob_with; use reqwest::Url; use serde::Serialize; use shellexpand::tilde; -use std::path::Path; -use std::path::PathBuf; -use std::{collections::HashSet, fmt::Display}; -use tokio::fs::read_to_string; -use tokio::io::{stdin, AsyncReadExt}; +use tokio::{ + fs::read_to_string, + io::{stdin, AsyncReadExt}, +}; + +use crate::{ + extract::{extract_links, FileType}, + Request, Result, +}; const STDIN: &str = "-"; - +/// Links which need to be validated. #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[non_exhaustive] pub enum Input { - RemoteUrl(Url), + /// URL (of HTTP/HTTPS scheme). + RemoteUrl(Box), + /// Unix shell style glob pattern. FsGlob { pattern: String, ignore_case: bool }, + /// File path. FsPath(PathBuf), + /// Standard Input. Stdin, + /// Raw string input. String(String), } impl Serialize for Input { - fn serialize(&self, serializer: S) -> Result + fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, { @@ -36,26 +46,13 @@ impl Serialize for Input { impl Display for Input { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Input::RemoteUrl(url) => { - write!(f, "{}", url) - } - Input::FsGlob { - pattern, - ignore_case: _, - } => { - write!(f, "{}", pattern) - } - Input::FsPath(path) => { - write!(f, "{}", path.to_str().unwrap_or_default()) - } - Input::Stdin => { - write!(f, "stdin") - } - Input::String(_) => { - write!(f, "raw input string") - } - } + f.write_str(match self { + Input::RemoteUrl(url) => url.as_str(), + Input::FsGlob { pattern, .. } => pattern, + Input::FsPath(path) => path.to_str().unwrap_or_default(), + Input::Stdin => "stdin", + Input::String(_) => "raw input string", + }) } } @@ -67,6 +64,7 @@ pub struct InputContent { } impl InputContent { + #[must_use] pub fn from_string(s: &str, file_type: FileType) -> Self { // TODO: consider using Cow (to avoid one .clone() for String types) Self { @@ -78,58 +76,50 @@ impl InputContent { } impl Input { + #[must_use] pub fn new(value: &str, glob_ignore_case: bool) -> Self { if value == STDIN { Self::Stdin + } else if let Ok(url) = Url::parse(&value) { + Self::RemoteUrl(Box::new(url)) } else { - match Url::parse(&value) { - Ok(url) => Self::RemoteUrl(url), - Err(_) => { - // this seems to be the only way to determine if this is a glob pattern - let is_glob = glob::Pattern::escape(value) != value; + // this seems to be the only way to determine if this is a glob pattern + let is_glob = glob::Pattern::escape(value) != value; - if is_glob { - Self::FsGlob { - pattern: value.to_owned(), - ignore_case: glob_ignore_case, - } - } else { - Self::FsPath(value.into()) - } + if is_glob { + Self::FsGlob { + pattern: value.to_owned(), + ignore_case: glob_ignore_case, } + } else { + Self::FsPath(value.into()) } } } + #[allow(clippy::missing_panics_doc, clippy::missing_errors_doc)] pub async fn get_contents( &self, file_type_hint: Option, skip_missing: bool, ) -> Result> { - use Input::*; - - match self { + match *self { // TODO: should skip_missing also affect URLs? - RemoteUrl(url) => Ok(vec![Self::url_contents(url).await?]), - FsGlob { - pattern, + Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]), + Input::FsGlob { + ref pattern, ignore_case, - } => Ok(Self::glob_contents(pattern, *ignore_case).await?), - FsPath(path) => { - let content = Self::path_content(&path).await.with_context(|| { - format!( - "Failed to read file: `{}`", - path.to_str().unwrap_or("") - ) - }); + } => Ok(Self::glob_contents(pattern, ignore_case).await?), + Input::FsPath(ref path) => { + let content = Self::path_content(path).await; match content { Ok(input_content) => Ok(vec![input_content]), Err(_) if skip_missing => Ok(vec![]), - Err(arg) => Err(anyhow!(arg)), + Err(e) => Err(e), } } - Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), - String(s) => Ok(vec![Self::string_content(s, file_type_hint)]), + Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), + Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]), } } @@ -142,11 +132,10 @@ impl Input { }; let res = reqwest::get(url.clone()).await?; - let content = res.text().await?; let input_content = InputContent { - input: Input::RemoteUrl(url.clone()), + input: Input::RemoteUrl(Box::new(url.clone())), file_type, - content, + content: res.text().await?, }; Ok(input_content) @@ -172,10 +161,13 @@ impl Input { Ok(contents) } - async fn path_content + AsRef>(path: P) -> Result { + async fn path_content + AsRef + Clone>(path: P) -> Result { + let content = read_to_string(&path) + .await + .map_err(|e| (path.clone().into(), e))?; let input_content = InputContent { file_type: FileType::from(path.as_ref()), - content: read_to_string(&path).await?, + content, input: Input::FsPath(path.into()), }; @@ -203,15 +195,17 @@ impl Input { /// Fetch all unique links from a slice of inputs /// All relative URLs get prefixed with `base_url` if given. +#[allow(clippy::missing_errors_doc)] pub async fn collect_links( inputs: &[Input], base_url: Option, skip_missing_inputs: bool, max_concurrency: usize, ) -> Result> { - let base_url = match base_url { - Some(url) => Some(Url::parse(&url)?), - _ => None, + let base_url = if let Some(url) = base_url { + Some(Url::parse(&url).map_err(|e| (url, e))?) + } else { + None }; let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(max_concurrency); @@ -236,7 +230,7 @@ pub async fn collect_links( for input_content in result? { let base_url = base_url.clone(); let handle = - tokio::task::spawn_blocking(move || extract_links(&input_content, base_url)); + tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url)); extract_links_handles.push(handle); } } @@ -257,23 +251,32 @@ pub async fn collect_links( #[cfg(test)] mod test { - use super::*; - use pretty_assertions::assert_eq; + use std::{fs::File, io::Write}; + use http::StatusCode; + use pretty_assertions::assert_eq; + use reqwest::Url; + + use super::{collect_links, Input}; use crate::{ - test_utils::{get_mock_server_with_content, website}, - Uri, + extract::FileType, + mock_server, + test_utils::{mail, website}, + Result, Uri, }; - use std::fs::File; - use std::io::Write; - use std::str::FromStr; + + const TEST_STRING: &str = "http://test-string.com"; + const TEST_URL: &str = "https://test-url.org"; + const TEST_FILE: &str = "https://test-file.io"; + const TEST_GLOB_1: &str = "https://test-glob-1.io"; + const TEST_GLOB_2_MAIL: &str = "test@glob-2.io"; #[tokio::test] #[ignore] async fn test_file_without_extension_is_plaintext() -> Result<()> { - let dir = tempfile::tempdir()?; + let temp_dir = tempfile::tempdir()?; // Treat as plaintext file (no extension) - let file_path = dir.path().join("README"); + let file_path = temp_dir.path().join("README"); let _file = File::create(&file_path)?; let input = Input::new(&file_path.as_path().display().to_string(), true); let contents = input.get_contents(None, true).await?; @@ -295,16 +298,12 @@ mod test { #[tokio::test] async fn test_collect_links() -> Result<()> { - const TEST_STRING: &str = "http://test-string.com"; - const TEST_URL: &str = "https://test-url.org"; - const TEST_FILE: &str = "https://test-file.io"; - const TEST_GLOB_1: &str = "https://test-glob-1.io"; - const TEST_GLOB_2_MAIL: &str = "test@glob-2.io"; + let temp_dir = tempfile::tempdir()?; + let temp_dir_path = temp_dir.path(); - let dir = tempfile::tempdir()?; - let file_path = dir.path().join("f"); - let file_glob_1_path = dir.path().join("glob-1"); - let file_glob_2_path = dir.path().join("glob-2"); + let file_path = temp_dir_path.join("f"); + let file_glob_1_path = temp_dir_path.join("glob-1"); + let file_glob_2_path = temp_dir_path.join("glob-2"); let mut file = File::create(&file_path)?; let mut file_glob_1 = File::create(file_glob_1_path)?; @@ -314,14 +313,16 @@ mod test { writeln!(file_glob_1, "{}", TEST_GLOB_1)?; writeln!(file_glob_2, "{}", TEST_GLOB_2_MAIL)?; - let mock_server = get_mock_server_with_content(http::StatusCode::OK, Some(TEST_URL)).await; + let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL)); let inputs = vec![ - Input::String(TEST_STRING.to_string()), - Input::RemoteUrl(Url::from_str(&mock_server.uri())?), + Input::String(TEST_STRING.to_owned()), + Input::RemoteUrl(Box::new( + Url::parse(&mock_server.uri()).map_err(|e| (mock_server.uri(), e))?, + )), Input::FsPath(file_path), Input::FsGlob { - pattern: dir.path().join("glob*").to_str().unwrap().to_string(), + pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(), ignore_case: true, }, ]; @@ -329,12 +330,12 @@ mod test { let responses = collect_links(&inputs, None, false, 8).await?; let mut links = responses.into_iter().map(|r| r.uri).collect::>(); - let mut expected_links: Vec = vec![ + let mut expected_links = vec![ website(TEST_STRING), website(TEST_URL), website(TEST_FILE), website(TEST_GLOB_1), - Uri::Mail(TEST_GLOB_2_MAIL.to_string()), + mail(TEST_GLOB_2_MAIL), ]; links.sort(); diff --git a/src/extract.rs b/lychee-lib/src/extract.rs similarity index 52% rename from src/extract.rs rename to lychee-lib/src/extract.rs index ac07050..ff96a56 100644 --- a/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -1,15 +1,17 @@ -use crate::uri::Uri; -use crate::{collector::InputContent, Request}; -use html5ever::parse_document; -use html5ever::tendril::{StrTendril, TendrilSink}; +use std::{collections::HashSet, convert::TryFrom, path::Path}; + +use html5ever::{ + parse_document, + tendril::{StrTendril, TendrilSink}, +}; use linkify::LinkFinder; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; -use std::path::Path; -use std::{collections::HashSet, convert::TryFrom}; use url::Url; -#[derive(Clone, Debug, PartialEq, Eq)] +use crate::{collector::InputContent, Request, Uri}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum FileType { Html, Markdown, @@ -26,20 +28,17 @@ impl> From

for FileType { /// Detect if the given path points to a Markdown, HTML, or plaintext file. fn from(p: P) -> FileType { let path = p.as_ref(); - match path.extension() { - Some(ext) => match ext { - _ if (ext == "md" || ext == "markdown") => FileType::Markdown, - _ if (ext == "htm" || ext == "html") => FileType::Html, - _ => FileType::Plaintext, - }, - // Assume HTML in case of no extension. - // Note: this is only reasonable for URLs; not paths on disk. - // For example, `README` without an extension is more likely to be a plaintext file. - // A better solution would be to also implement `From for FileType`. - // Unfortunately that's not possible without refactoring, as - // `AsRef` could be implemented for `Url` in the future, which is why - // `From for FileType` is not allowed. - None => FileType::Html, + // Assume HTML in case of no extension. + // Note: this is only reasonable for URLs; not paths on disk. + // For example, `README` without an extension is more likely to be a plaintext file. + // A better solution would be to also implement `From for FileType`. + // Unfortunately that's not possible without refactoring, as + // `AsRef` could be implemented for `Url` in the future, which is why + // `From for FileType` is not allowed. + match path.extension().and_then(std::ffi::OsStr::to_str) { + Some("md") | Some("markdown") => FileType::Markdown, + Some("htm") | Some("html") | None => FileType::Html, + Some(_) => FileType::Plaintext, } } } @@ -55,10 +54,9 @@ fn extract_links_from_markdown(input: &str) -> Vec { let parser = Parser::new(input); parser .flat_map(|event| match event { - MDEvent::Start(tag) => match tag { - Tag::Link(_, url, _) | Tag::Image(_, url, _) => vec![url.to_string()], - _ => vec![], - }, + MDEvent::Start(Tag::Link(_, url, _)) | MDEvent::Start(Tag::Image(_, url, _)) => { + vec![url.to_string()] + } MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()), MDEvent::Html(html) => extract_links_from_html(&html.to_string()), _ => vec![], @@ -69,7 +67,7 @@ fn extract_links_from_markdown(input: &str) -> Vec { /// Extract unparsed URL strings from a HTML string. fn extract_links_from_html(input: &str) -> Vec { let tendril = StrTendril::from(input); - let rc_dom = parse_document(RcDom::default(), Default::default()).one(tendril); + let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril); let mut urls = Vec::new(); @@ -84,15 +82,11 @@ fn extract_links_from_html(input: &str) -> Vec { fn walk_html_links(mut urls: &mut Vec, node: &Handle) { match node.data { NodeData::Text { ref contents } => { - for link in extract_links_from_plaintext(&contents.borrow()) { - urls.push(link); - } + urls.append(&mut extract_links_from_plaintext(&contents.borrow())); } NodeData::Comment { ref contents } => { - for link in extract_links_from_plaintext(contents) { - urls.push(link); - } + urls.append(&mut extract_links_from_plaintext(contents)); } NodeData::Element { @@ -106,9 +100,7 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) { if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) { urls.push(attr_value); } else { - for link in extract_links_from_plaintext(&attr_value) { - urls.push(link); - } + urls.append(&mut extract_links_from_plaintext(&attr_value)); } } } @@ -148,7 +140,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { pub(crate) fn extract_links( input_content: &InputContent, - base_url: Option, + base_url: &Option, ) -> HashSet { let links = match input_content.file_type { FileType::Markdown => extract_links_from_markdown(&input_content.content), @@ -160,21 +152,14 @@ pub(crate) fn extract_links( // Silently ignore the parse failures for now. let mut requests: HashSet = HashSet::new(); for link in links { - match Uri::try_from(link.as_str()) { - Ok(uri) => { - requests.insert(Request::new(uri, input_content.input.clone())); - } - Err(_) => { - if !Path::new(&link).exists() { - if let Some(base_url) = &base_url { - if let Ok(new_url) = base_url.join(&link) { - requests.insert(Request::new( - Uri::Website(new_url), - input_content.input.clone(), - )); - } - } - } + if let Ok(uri) = Uri::try_from(link.as_str()) { + requests.insert(Request::new(uri, input_content.input.clone())); + } else if !Path::new(&link).exists() { + if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) { + requests.insert(Request::new( + Uri { url: new_url }, + input_content.input.clone(), + )); } }; } @@ -183,15 +168,29 @@ pub(crate) fn extract_links( #[cfg(test)] mod test { - use crate::test_utils::website; + use std::{ + array, + collections::HashSet, + fs::File, + io::{BufReader, Read}, + path::Path, + }; - use super::*; use pretty_assertions::assert_eq; - use std::fs::File; - use std::io::{BufReader, Read}; + use url::Url; + + use super::{ + extract_links, extract_links_from_html, extract_links_from_markdown, + extract_links_from_plaintext, find_links, FileType, + }; + use crate::{ + collector::InputContent, + test_utils::{mail, website}, + Uri, + }; fn load_fixture(filename: &str) -> String { - let fixture_path = Path::new(module_path!()) + let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")) .parent() .unwrap() .join("fixtures") @@ -208,106 +207,92 @@ mod test { content } + fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet { + extract_links( + &InputContent::from_string(input, file_type), + &base_url.map(|u| Url::parse(u).unwrap()), + ) + .into_iter() + .map(|r| r.uri) + .collect() + } + #[test] fn test_file_type() { // FIXME: Assume plaintext in case a path has no extension // assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext); - - assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown); + assert_eq!(FileType::from("test.md"), FileType::Markdown); + assert_eq!(FileType::from("test.markdown"), FileType::Markdown); + assert_eq!(FileType::from("test.html"), FileType::Html); + assert_eq!(FileType::from("test.txt"), FileType::Plaintext); + assert_eq!(FileType::from("test.something"), FileType::Plaintext); assert_eq!( - FileType::from(Path::new("test.markdown")), - FileType::Markdown - ); - assert_eq!(FileType::from(Path::new("test.html")), FileType::Html); - assert_eq!(FileType::from(Path::new("test.txt")), FileType::Plaintext); - assert_eq!( - FileType::from(Path::new("test.something")), - FileType::Plaintext - ); - assert_eq!( - FileType::from(Path::new("/absolute/path/to/test.something")), + FileType::from("/absolute/path/to/test.something"), FileType::Plaintext ); } #[test] fn test_extract_link_at_end_of_line() { - let link = "http://www.apache.org/licenses/LICENSE-2.0"; - let input = format!("{}\n", link); + let input = "http://www.apache.org/licenses/LICENSE-2.0\n"; + let link = input.trim_end(); - let found = extract_links_from_markdown(&input); - assert_eq!(vec![link], found); - - let found = extract_links_from_plaintext(&input); - assert_eq!(vec![link], found); - - let found = extract_links_from_html(&input); - assert_eq!(vec![link], found); + assert_eq!(vec![link], extract_links_from_markdown(&input)); + assert_eq!(vec![link], extract_links_from_plaintext(&input)); + assert_eq!(vec![link], extract_links_from_html(&input)); } #[test] fn test_extract_markdown_links() { - let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)"; - let links: HashSet = extract_links( - &InputContent::from_string(input, FileType::Markdown), - Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()), - ) - .into_iter() - .map(|r| r.uri) - .collect(); - assert_eq!( - links, - [ - website("https://endler.dev"), - website("https://github.com/hello-rust/lychee/relative_link"), - ] - .iter() - .cloned() - .collect() - ) + let links = extract_uris( + "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)", + FileType::Markdown, + Some("https://github.com/hello-rust/lychee/"), + ); + + let expected_links = array::IntoIter::new([ + website("https://endler.dev"), + website("https://github.com/hello-rust/lychee/relative_link"), + ]) + .collect::>(); + + assert_eq!(links, expected_links) } #[test] fn test_extract_html_links() { - let input = r#" + let links = extract_uris( + r#"

- "#; - - let links: HashSet = extract_links( - &InputContent::from_string(input, FileType::Html), - Some(Url::parse("https://github.com/lycheeverse/").unwrap()), - ) - .into_iter() - .map(|r| r.uri) - .collect(); - - assert_eq!( - links, - [ - website("https://github.com/lycheeverse/lychee/"), - website("https://github.com/lycheeverse/blob/master/README.md"), - ] - .iter() - .cloned() - .collect::>(), + "#, + FileType::Html, + Some("https://github.com/lycheeverse/"), ); + + let expected_links = array::IntoIter::new([ + website("https://github.com/lycheeverse/lychee/"), + website("https://github.com/lycheeverse/blob/master/README.md"), + ]) + .collect::>(); + + assert_eq!(links, expected_links); } #[test] fn test_skip_markdown_anchors() { - let input = "This is [a test](#lol)."; - let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None); - assert_eq!(links, HashSet::new()) + let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None); + + assert!(links.is_empty()) } #[test] fn test_skip_markdown_internal_urls() { - let input = "This is [a test](./internal)."; - let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None); - assert_eq!(links, HashSet::new()) + let links = extract_uris("This is [a test](./internal).", FileType::Markdown, None); + + assert!(links.is_empty()) } #[test] @@ -317,23 +302,16 @@ mod test { This is [an internal url](@/internal.markdown) \ This is [an internal url](@/internal.markdown#example) \ This is [an internal url](@/internal.md#example)"; - let links: HashSet = extract_links( - &InputContent::from_string(input, FileType::Markdown), - Some(Url::parse(base_url).unwrap()), - ) - .into_iter() - .map(|r| r.uri) - .collect(); - let expected = [ + let links = extract_uris(input, FileType::Markdown, Some(base_url)); + + let expected = array::IntoIter::new([ website("https://localhost.com/@/internal.md"), website("https://localhost.com/@/internal.markdown"), website("https://localhost.com/@/internal.md#example"), website("https://localhost.com/@/internal.markdown#example"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected) } @@ -341,15 +319,9 @@ mod test { #[test] fn test_skip_markdown_email() { let input = "Get in touch - [Contact Us](mailto:test@test.com)"; - let links: HashSet = - extract_links(&InputContent::from_string(input, FileType::Markdown), None) - .into_iter() - .map(|r| r.uri) - .collect(); - let expected: HashSet = [Uri::Mail("test@test.com".to_string())] - .iter() - .cloned() - .collect(); + let links = extract_uris(input, FileType::Markdown, None); + let expected = array::IntoIter::new([mail("test@test.com")]).collect::>(); + assert_eq!(links, expected) } @@ -357,55 +329,40 @@ mod test { fn test_non_markdown_links() { let input = "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org"; - let links: HashSet = - extract_links(&InputContent::from_string(input, FileType::Plaintext), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links: HashSet = extract_uris(input, FileType::Plaintext, None); - let expected = [ + let expected = array::IntoIter::new([ website("https://endler.dev"), website("https://hello-rust.show/foo/bar?lol=1"), - Uri::Mail("test@example.org".to_string()), - ] - .iter() - .cloned() - .collect(); + mail("test@example.org"), + ]) + .collect::>(); assert_eq!(links, expected) } #[test] - #[ignore] - // TODO: Does this escaping need to work properly? - // See https://github.com/tcort/markdown-link-check/issues/37 fn test_md_escape() { let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#; let links = find_links(input); let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)"; - assert!(links.len() == 1); - assert_eq!(links[0].as_str(), expected); + + matches!(&links[..], [link] if link.as_str() == expected); } #[test] fn test_extract_html5_not_valid_xml() { let input = load_fixture("TEST_HTML5.html"); - let links: HashSet = - extract_links(&InputContent::from_string(&input, FileType::Html), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, None); - let expected_links = [ + let expected_links = array::IntoIter::new([ website("https://example.org/head/home"), website("https://example.org/css/style_full_url.css"), // the body links wouldn't be present if the file was parsed strictly as XML website("https://example.org/body/a"), website("https://example.org/body/div_empty_a"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected_links); } @@ -413,15 +370,9 @@ mod test { #[test] fn test_extract_html5_not_valid_xml_relative_links() { let input = load_fixture("TEST_HTML5.html"); - let links: HashSet = extract_links( - &InputContent::from_string(&input, FileType::Html), - Some(Url::parse("https://example.org").unwrap()), - ) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, Some("https://example.org")); - let expected_links = [ + let expected_links = array::IntoIter::new([ website("https://example.org/head/home"), website("https://example.org/images/icon.png"), website("https://example.org/css/style_relative_url.css"), @@ -430,10 +381,8 @@ mod test { // the body links wouldn't be present if the file was parsed strictly as XML website("https://example.org/body/a"), website("https://example.org/body/div_empty_a"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected_links); } @@ -442,16 +391,10 @@ mod test { fn test_extract_html5_lowercase_doctype() { // this has been problematic with previous XML based parser let input = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html"); - let links: HashSet = - extract_links(&InputContent::from_string(&input, FileType::Html), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, None); - let expected_links = [website("https://example.org/body/a")] - .iter() - .cloned() - .collect(); + let expected_links = + array::IntoIter::new([website("https://example.org/body/a")]).collect::>(); assert_eq!(links, expected_links); } @@ -460,22 +403,16 @@ mod test { fn test_extract_html5_minified() { // minified HTML with some quirky elements such as href attribute values specified without quotes let input = load_fixture("TEST_HTML5_MINIFIED.html"); - let links: HashSet = - extract_links(&InputContent::from_string(&input, FileType::Html), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, None); - let expected_links = [ + let expected_links = array::IntoIter::new([ website("https://example.org/"), website("https://example.org/favicon.ico"), website("https://fonts.externalsite.com"), website("https://example.org/docs/"), website("https://example.org/forum"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected_links); } @@ -484,18 +421,10 @@ mod test { fn test_extract_html5_malformed() { // malformed links shouldn't stop the parser from further parsing let input = load_fixture("TEST_HTML5_MALFORMED_LINKS.html"); - let links: HashSet = - extract_links(&InputContent::from_string(&input, FileType::Html), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, None); - let expected_links = [Uri::Website( - Url::parse("https://example.org/valid").unwrap(), - )] - .iter() - .cloned() - .collect(); + let expected_links = + array::IntoIter::new([website("https://example.org/valid")]).collect::>(); assert_eq!(links, expected_links); } @@ -504,21 +433,15 @@ mod test { fn test_extract_html5_custom_elements() { // the element name shouldn't matter for attributes like href, src, cite etc let input = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html"); - let links: HashSet = - extract_links(&InputContent::from_string(&input, FileType::Html), None) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Html, None); - let expected_links = [ + let expected_links = array::IntoIter::new([ website("https://example.org/some-weird-element"), website("https://example.org/even-weirder-src"), website("https://example.org/even-weirder-href"), website("https://example.org/citations"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected_links); } @@ -527,21 +450,13 @@ mod test { fn test_extract_urls_with_at_sign_properly() { // note that these used to parse as emails let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string(); - let links: HashSet = extract_links( - &InputContent::from_string(&input, FileType::Plaintext), - None, - ) - .into_iter() - .map(|r| r.uri) - .collect(); + let links = extract_uris(&input, FileType::Plaintext, None); - let expected_links = [ + let expected_links = array::IntoIter::new([ website("https://example.com/@test/test"), website("http://otherdomain.com/test/@test"), - ] - .iter() - .cloned() - .collect(); + ]) + .collect::>(); assert_eq!(links, expected_links); } diff --git a/src/filter/excludes.rs b/lychee-lib/src/filter/excludes.rs similarity index 56% rename from src/filter/excludes.rs rename to lychee-lib/src/filter/excludes.rs index 52aab15..e487860 100644 --- a/src/filter/excludes.rs +++ b/lychee-lib/src/filter/excludes.rs @@ -1,14 +1,14 @@ -use lazy_static::lazy_static; use regex::RegexSet; use std::net::IpAddr; use crate::Uri; /// Pre-defined exclusions for known false-positives -static FALSE_POSITIVE_REGEX: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; +static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; /// Exclude configuration for the link checker. /// You can ignore links based on regex patterns or pre-defined IP ranges. +#[allow(clippy::struct_excessive_bools)] #[derive(Clone, Debug)] pub struct Excludes { /// User-defined set of excluded regex patterns @@ -37,54 +37,40 @@ impl Default for Excludes { } impl Excludes { + #[inline] + #[must_use] pub fn regex(&self, input: &str) -> bool { - if let Some(excludes) = &self.regex { - if excludes.is_match(input) { - return true; - } - } - false + self.regex.as_ref().map_or(false, |re| re.is_match(input)) } - pub fn false_positive(&self, input: &str) -> bool { - lazy_static! { - static ref FALSE_POSITIVES: RegexSet = RegexSet::new(FALSE_POSITIVE_REGEX).unwrap(); - } - FALSE_POSITIVES.is_match(input) + #[must_use] + pub fn is_false_positive(input: &str) -> bool { + input == FALSE_POSITIVE_PAT[0] } + #[must_use] pub fn ip(&self, uri: &Uri) -> bool { - if let Some(ipaddr) = uri.host_ip() { - if self.loopback_ips && ipaddr.is_loopback() { - return true; - } - + match uri.host_ip() { + Some(ip_addr) if self.loopback_ips && ip_addr.is_loopback() => true, // Note: in a pathological case, an IPv6 address can be IPv4-mapped // (IPv4 address embedded in a IPv6). We purposefully // don't deal with it here, and assume if an address is IPv6, // we shouldn't attempt to map it to IPv4. // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2 - if let IpAddr::V4(v4addr) = ipaddr { - if self.private_ips && v4addr.is_private() { - return true; - } - if self.link_local_ips && v4addr.is_link_local() { - return true; - } - } + Some(IpAddr::V4(v4_addr)) if self.private_ips && v4_addr.is_private() => true, + Some(IpAddr::V4(v4_addr)) if self.link_local_ips && v4_addr.is_link_local() => true, + _ => false, } - - false } - pub fn is_mail_excluded(&self) -> bool { + #[inline] + #[must_use] + pub const fn is_mail_excluded(&self) -> bool { self.mail } + #[inline] pub fn is_empty(&self) -> bool { - match &self.regex { - None => true, - Some(regex_set) => regex_set.is_empty(), - } + self.regex.as_ref().map_or(true, RegexSet::is_empty) } } diff --git a/lychee-lib/src/filter/includes.rs b/lychee-lib/src/filter/includes.rs new file mode 100644 index 0000000..8a8ac7c --- /dev/null +++ b/lychee-lib/src/filter/includes.rs @@ -0,0 +1,21 @@ +use regex::RegexSet; + +/// Include configuration for the link checker. +/// You can include links based on regex patterns +#[derive(Clone, Debug, Default)] +pub struct Includes { + pub regex: Option, +} + +impl Includes { + #[inline] + #[must_use] + pub fn regex(&self, input: &str) -> bool { + self.regex.as_ref().map_or(false, |re| re.is_match(input)) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.regex.as_ref().map_or(true, RegexSet::is_empty) + } +} diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs new file mode 100644 index 0000000..a6fcb99 --- /dev/null +++ b/lychee-lib/src/filter/mod.rs @@ -0,0 +1,309 @@ +mod excludes; +mod includes; + +pub use excludes::Excludes; +pub use includes::Includes; + +use crate::uri::Uri; + +/// A generic URI filter +/// Used to decide if a given URI should be checked or skipped +#[derive(Clone, Debug, Default)] +pub struct Filter { + pub(crate) includes: Includes, + pub(crate) excludes: Excludes, + pub(crate) scheme: Option, +} + +impl Filter { + #[must_use] + pub fn new( + includes: Option, + excludes: Option, + scheme: Option, + ) -> Self { + Filter { + includes: includes.unwrap_or_default(), + excludes: excludes.unwrap_or_default(), + scheme, + } + } + + #[must_use] + pub fn is_excluded(&self, uri: &Uri) -> bool { + // Skip mail? + if self.excludes.is_mail_excluded() && uri.scheme() == "mailto" { + return true; + } + // Skip specific IP address? + if self.excludes.ip(&uri) { + return true; + } + + let input = uri.as_str(); + if self.includes.is_empty() { + if self.excludes.is_empty() { + // No regex includes/excludes at all? + // Not excluded unless it's a known false positive + return Excludes::is_false_positive(input); + } + } else if self.includes.regex(input) { + // Included explicitly (Includes take precedence over excludes) + return false; + } + // Exclude well-known false-positives. + // This is done after checking includes to allow for user-overwrites. + if Excludes::is_false_positive(uri.as_str()) { + return true; + } + if self.excludes.is_empty() { + if !self.includes.is_empty() { + // In case we have includes and no excludes, + // skip everything that was not included + return true; + } + } else if self.excludes.regex(input) { + // Excluded explicitly + return true; + } + + // URI scheme excluded? + matches!(self.scheme, Some(ref scheme) if scheme != uri.scheme()) + } +} + +#[cfg(test)] +mod test { + use regex::RegexSet; + use reqwest::Url; + use url::Host; + + use super::{Excludes, Filter, Includes}; + use crate::test_utils::{mail, website}; + + // Note: the standard library as of Rust stable 1.47.0 does not expose + // "link-local" or "private" IPv6 checks. However, one might argue + // that these concepts do exist in IPv6, albeit the naming is different. + // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 + // See: https://en.wikipedia.org/wiki/Private_network#IPv6 + // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local + const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; + const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; + const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; + + const V4_LOOPBACK: &str = "http://127.0.0.1"; + const V6_LOOPBACK: &str = "http://[::1]"; + + const V4_LINK_LOCAL: &str = "http://169.254.0.1"; + + // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) + const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; + const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; + + macro_rules! assert_ip_address { + (v4: $ip:expr, $predicate:tt) => {{ + let res = if let Host::Ipv4(ipv4) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? { + ipv4.$predicate() + } else { + false + }; + std::assert!(res); + }}; + (v6: $ip:expr, $predicate:tt) => { + let res = if let Host::Ipv6(ipv6) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? { + ipv6.$predicate() + } else { + false + }; + std::assert!(res); + }; + } + + #[allow(clippy::shadow_unrelated)] + #[test] + fn test_const_sanity() -> Result<(), ()> { + assert_ip_address!(v4: V4_PRIVATE_CLASS_A, is_private); + assert_ip_address!(v4: V4_PRIVATE_CLASS_B, is_private); + assert_ip_address!(v4: V4_PRIVATE_CLASS_C, is_private); + + assert_ip_address!(v4: V4_LOOPBACK, is_loopback); + assert_ip_address!(v6: V6_LOOPBACK, is_loopback); + + assert_ip_address!(v4: V4_LINK_LOCAL, is_link_local); + + Ok(()) + } + + #[test] + fn test_includes_and_excludes_empty() { + // This is the pre-configured, empty set of excludes for a client + // In this case, only the requests matching the include set will be checked + let filter = Filter::default(); + + assert!(!filter.is_excluded(&website("https://example.org"))); + } + + #[test] + fn test_false_positives() { + let filter = Filter::default(); + + assert!(filter.is_excluded(&website("http://www.w3.org/1999/xhtml"))); + assert!(!filter.is_excluded(&website("https://example.org"))); + } + + #[test] + fn test_overwrite_false_positives() { + let includes = Includes { + regex: Some(RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap()), + }; + let filter = Filter { + includes, + ..Filter::default() + }; + assert!(!filter.is_excluded(&website("http://www.w3.org/1999/xhtml"))); + } + + #[test] + fn test_include_regex() { + let includes = Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }; + let filter = Filter { + includes, + ..Filter::default() + }; + + // Only the requests matching the include set will be checked + assert!(!filter.is_excluded(&website("https://foo.example.org"))); + assert!(filter.is_excluded(&website("https://bar.example.org"))); + assert!(filter.is_excluded(&website("https://example.org"))); + } + + #[test] + fn test_exclude_mail() { + let excludes = Excludes { + mail: true, + ..Excludes::default() + }; + let filter = Filter { + excludes, + ..Filter::default() + }; + + assert!(filter.is_excluded(&mail("mail@example.org"))); + assert!(filter.is_excluded(&mail("foo@bar.dev"))); + assert!(!filter.is_excluded(&website("http://bar.dev"))); + } + + #[test] + fn test_exclude_regex() { + let excludes = Excludes { + regex: Some( + RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), + ), + ..Excludes::default() + }; + let filter = Filter { + excludes, + ..Filter::default() + }; + + assert!(filter.is_excluded(&website("http://github.com"))); + assert!(filter.is_excluded(&website("http://exclude.org"))); + assert!(filter.is_excluded(&mail("mail@example.org"))); + + assert!(!filter.is_excluded(&website("http://bar.dev"))); + assert!(!filter.is_excluded(&mail("foo@bar.dev"))); + } + #[test] + fn test_exclude_include_regex() { + let includes = Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }; + let excludes = Excludes { + regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), + ..Excludes::default() + }; + let filter = Filter { + includes, + excludes, + ..Filter::default() + }; + + // Includes take preference over excludes + assert!(!filter.is_excluded(&website("https://foo.example.org")),); + + assert!(filter.is_excluded(&website("https://example.org"))); + assert!(filter.is_excluded(&website("https://bar.example.org"))); + } + + #[test] + fn test_excludes_no_private_ips_by_default() { + let filter = Filter::default(); + + assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_A))); + assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_B))); + assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_C))); + assert!(!filter.is_excluded(&website(V4_LINK_LOCAL))); + assert!(!filter.is_excluded(&website(V4_LOOPBACK))); + assert!(!filter.is_excluded(&website(V6_LOOPBACK))); + } + + #[test] + fn test_exclude_private_ips() { + let filter = Filter { + excludes: Excludes { + private_ips: true, + ..Excludes::default() + }, + ..Filter::default() + }; + + assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_A))); + assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_B))); + assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_C))); + } + + #[test] + fn test_exclude_link_local() { + let filter = Filter { + excludes: Excludes { + link_local_ips: true, + ..Excludes::default() + }, + ..Filter::default() + }; + + assert!(filter.is_excluded(&website(V4_LINK_LOCAL))); + } + + #[test] + fn test_exclude_loopback() { + let filter = Filter { + excludes: Excludes { + loopback_ips: true, + ..Excludes::default() + }, + ..Filter::default() + }; + + assert!(filter.is_excluded(&website(V4_LOOPBACK))); + assert!(filter.is_excluded(&website(V6_LOOPBACK))); + } + + #[test] + fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { + let filter = Filter { + excludes: Excludes { + private_ips: true, + link_local_ips: true, + ..Excludes::default() + }, + ..Filter::default() + }; + + // if these were pure IPv4, we would exclude + assert!(!filter.is_excluded(&website(V6_MAPPED_V4_PRIVATE_CLASS_A))); + assert!(!filter.is_excluded(&website(V6_MAPPED_V4_LINK_LOCAL))); + } +} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs new file mode 100644 index 0000000..c665bb1 --- /dev/null +++ b/lychee-lib/src/lib.rs @@ -0,0 +1,73 @@ +//! `lychee` is a library for checking links. +//! "Hello world" example: +//! ``` +//! use lychee_lib::Result; +//! +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! let response = lychee_lib::check("https://github.com/lycheeverse/lychee").await?; +//! println!("{}", response); +//! Ok(()) +//! } +//! ``` +//! +//! For more specific use-cases you can build a lychee client yourself, +//! using the `ClientBuilder` which can be used to +//! configure and run your own link checker and grants full flexibility: +//! +//! ``` +//! use lychee_lib::{ClientBuilder, Result, Status}; +//! +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! let client = ClientBuilder::default().build()?; +//! let response = client.check("https://github.com/lycheeverse/lychee").await?; +//! assert!(response.status().is_success()); +//! Ok(()) +//! } +//! ``` +#![warn(clippy::all, clippy::pedantic)] +#![warn( + absolute_paths_not_starting_with_crate, + invalid_html_tags, + missing_copy_implementations, + missing_debug_implementations, + semicolon_in_expressions_from_macros, + unreachable_pub, + unused_crate_dependencies, + unused_extern_crates, + variant_size_differences, + clippy::missing_const_for_fn +)] +#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)] +// #![deny(missing_docs)] + +#[cfg(doctest)] +doc_comment::doctest!("../../README.md"); + +mod client; +mod client_pool; +mod quirks; +mod types; +mod uri; + +pub mod collector; +pub mod extract; +pub mod filter; +#[cfg(test)] +#[macro_use] +pub mod test_utils; + +#[cfg(test)] +use doc_comment as _; // required for doctest +use openssl_sys as _; // required for vendored-openssl feature +use ring as _; // required for apple silicon + +pub use crate::{ + client::{check, ClientBuilder}, + client_pool::ClientPool, + collector::Input, + filter::{Excludes, Filter, Includes}, + types::{ErrorKind, Request, Response, ResponseBody, Result, Status}, + uri::Uri, +}; diff --git a/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs similarity index 63% rename from src/quirks/mod.rs rename to lychee-lib/src/quirks/mod.rs index d19a773..1a38ff0 100644 --- a/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -6,13 +6,13 @@ use reqwest::{Request, Url}; const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)"; #[derive(Debug, Clone)] -pub struct Quirk { - pub pattern: Regex, - pub rewrite: fn(Request) -> Request, +pub(crate) struct Quirk { + pub(crate) pattern: Regex, + pub(crate) rewrite: fn(Request) -> Request, } #[derive(Debug, Clone)] -pub struct Quirks { +pub(crate) struct Quirks { quirks: Vec, } @@ -62,7 +62,7 @@ impl Quirks { /// Apply quirks to a given request. Only the first quirk regex pattern /// matching the URL will be applied. The rest will be discarded for /// simplicity reasons. This limitation might be lifted in the future. - pub fn apply(&self, request: Request) -> Request { + pub(crate) fn apply(&self, request: Request) -> Request { for quirk in &self.quirks { if quirk.pattern.is_match(request.url().as_str()) { return (quirk.rewrite)(request); @@ -75,51 +75,68 @@ impl Quirks { #[cfg(test)] mod tests { - use super::*; + use http::{header, Method}; use pretty_assertions::assert_eq; + use reqwest::{Request, Url}; + + use super::{Quirks, GOOGLEBOT}; + + #[derive(Debug)] + struct MockRequest(Request); + + impl MockRequest { + fn new(method: Method, url: Url) -> Self { + Self(Request::new(method, url)) + } + } + + impl PartialEq for MockRequest { + fn eq(&self, other: &Self) -> bool { + self.0.url() == other.0.url() && self.0.method() == other.0.method() + } + } #[test] fn test_twitter_request() { - let orig = Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap(); - let request = Request::new(Method::GET, orig.clone()); - let quirks = Quirks::default(); - let modified = quirks.apply(request); - assert_eq!(modified.url(), &orig); - assert_eq!(modified.method(), Method::HEAD); + let url = Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap(); + let request = Request::new(Method::GET, url.clone()); + let modified = Quirks::default().apply(request); + assert_eq!( modified.headers().get(header::USER_AGENT).unwrap(), &GOOGLEBOT ); + assert_eq!(MockRequest(modified), MockRequest::new(Method::HEAD, url)); } #[test] fn test_youtube_video_request() { - let orig = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap(); - let request = Request::new(Method::GET, orig); - let quirks = Quirks::default(); - let modified = quirks.apply(request); + let url = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap(); + let request = Request::new(Method::GET, url); + let modified = Quirks::default().apply(request); let expected_url = Url::parse("https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DNlKuICiT470%26list%3DPLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ%26index%3D7").unwrap(); - assert_eq!(modified.url(), &expected_url); - assert_eq!(modified.method(), Method::GET); + + assert_eq!( + MockRequest(modified), + MockRequest::new(Method::GET, expected_url) + ); } #[test] fn test_non_video_youtube_url_untouched() { - let orig = Url::parse("https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA").unwrap(); - let request = Request::new(Method::GET, orig.clone()); - let quirks = Quirks::default(); - let modified = quirks.apply(request); - assert_eq!(modified.url(), &orig); - assert_eq!(modified.method(), Method::GET); + let url = Url::parse("https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA").unwrap(); + let request = Request::new(Method::GET, url.clone()); + let modified = Quirks::default().apply(request); + + assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url)); } #[test] fn test_no_quirk_applied() { - let orig = Url::parse("https://endler.dev").unwrap(); - let request = Request::new(Method::GET, orig.clone()); - let quirks = Quirks::default(); - let modified = quirks.apply(request); - assert_eq!(modified.url(), &orig); - assert_eq!(modified.method(), Method::GET); + let url = Url::parse("https://endler.dev").unwrap(); + let request = Request::new(Method::GET, url.clone()); + let modified = Quirks::default().apply(request); + + assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url)); } } diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs new file mode 100644 index 0000000..def6827 --- /dev/null +++ b/lychee-lib/src/test_utils.rs @@ -0,0 +1,45 @@ +use std::convert::TryFrom; + +use reqwest::Url; + +use crate::{ClientBuilder, ErrorKind, Request, Uri}; + +#[macro_export] +macro_rules! mock_server { + ($status:expr $(, $func:tt ($($arg:expr),*))*) => {{ + let mock_server = wiremock::MockServer::start().await; + let template = wiremock::ResponseTemplate::new(http::StatusCode::from($status)); + let template = template$(.$func($($arg),*))*; + wiremock::Mock::given(wiremock::matchers::method("GET")).respond_with(template).mount(&mock_server).await; + mock_server + }}; +} + +pub(crate) async fn get_mock_client_response(request: T) -> crate::Response +where + Request: TryFrom, + ErrorKind: From, +{ + ClientBuilder::default() + .build() + .unwrap() + .check(request) + .await + .unwrap() +} + +/// Helper method to convert a string into a URI +/// Note: This panics on error, so it should only be used for testing +pub(crate) fn website(url: &str) -> Uri { + Uri::from(Url::parse(url).expect("Expected valid Website URI")) +} + +pub(crate) fn mail(address: &str) -> Uri { + if address.starts_with("mailto:") { + Url::parse(address) + } else { + Url::parse(&(String::from("mailto:") + address)) + } + .expect("Expected valid Mail Address") + .into() +} diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs new file mode 100644 index 0000000..bd97614 --- /dev/null +++ b/lychee-lib/src/types/error.rs @@ -0,0 +1,163 @@ +use std::{any::Any, convert::Infallible, fmt::Display, hash::Hash, path::PathBuf}; + +use http::header::InvalidHeaderValue; +use serde::{Serialize, Serializer}; + +use crate::Uri; + +/// Kinds of status errors. +#[allow(clippy::module_name_repetitions)] +#[derive(Debug)] +#[non_exhaustive] +pub enum ErrorKind { + // TODO: maybe need to be splitted; currently first slot is Some only for reading files + IoError(Option, std::io::Error), + ReqwestError(reqwest::Error), + HubcapsError(hubcaps::Error), + UrlParseError(String, (url::ParseError, Option)), + UnreachableEmailAddress(Uri), + InvalidHeader(InvalidHeaderValue), + InvalidGlobPattern(glob::PatternError), + MissingGitHubToken, +} + +impl PartialEq for ErrorKind { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::IoError(p1, e1), Self::IoError(p2, e2)) => p1 == p2 && e1.kind() == e2.kind(), + (Self::ReqwestError(e1), Self::ReqwestError(e2)) => e1.to_string() == e2.to_string(), + (Self::HubcapsError(e1), Self::HubcapsError(e2)) => e1.to_string() == e2.to_string(), + (Self::UrlParseError(s1, e1), Self::UrlParseError(s2, e2)) => s1 == s2 && e1 == e2, + (Self::UnreachableEmailAddress(u1), Self::UnreachableEmailAddress(u2)) => u1 == u2, + (Self::InvalidGlobPattern(e1), Self::InvalidGlobPattern(e2)) => { + e1.msg == e2.msg && e1.pos == e2.pos + } + (Self::InvalidHeader(_), Self::InvalidHeader(_)) + | (Self::MissingGitHubToken, Self::MissingGitHubToken) => true, + _ => false, + } + } +} + +impl Eq for ErrorKind {} + +impl Hash for ErrorKind { + fn hash(&self, state: &mut H) + where + H: std::hash::Hasher, + { + match self { + Self::IoError(p, e) => (p, e.kind()).hash(state), + Self::ReqwestError(e) => e.to_string().hash(state), + Self::HubcapsError(e) => e.to_string().hash(state), + Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), + Self::UnreachableEmailAddress(u) => u.hash(state), + Self::InvalidHeader(e) => e.to_string().hash(state), + Self::InvalidGlobPattern(e) => e.to_string().hash(state), + Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), + } + } +} + +impl Display for ErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::IoError(Some(p), e) => write!( + f, + "Failed to read file: `{}`, reason: {}", + p.to_str().unwrap_or(""), + e + ), + Self::IoError(None, e) => e.fmt(f), + Self::ReqwestError(e) => e.fmt(f), + Self::HubcapsError(e) => e.fmt(f), + Self::UrlParseError(s, (url_err, Some(mail_err))) => { + write!( + f, + "Cannot parse {} as website url ({}) or mail address ({})", + s, url_err, mail_err + ) + } + Self::UrlParseError(s, (url_err, None)) => { + write!(f, "Cannot parse {} as website url ({})", s, url_err) + } + Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), + Self::InvalidHeader(e) => e.fmt(f), + Self::InvalidGlobPattern(e) => e.fmt(f), + Self::MissingGitHubToken => f.write_str( + "GitHub token not specified. To check GitHub links reliably, \ + use `--github-token` flag / `GITHUB_TOKEN` env var.", + ), + } + } +} + +impl Serialize for ErrorKind { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + serializer.collect_str(self) + } +} + +impl From<(PathBuf, std::io::Error)> for ErrorKind { + fn from(value: (PathBuf, std::io::Error)) -> Self { + Self::IoError(Some(value.0), value.1) + } +} + +impl From for ErrorKind { + fn from(e: std::io::Error) -> Self { + Self::IoError(None, e) + } +} + +impl From for ErrorKind { + fn from(e: tokio::task::JoinError) -> Self { + Self::IoError(None, e.into()) + } +} + +impl From for ErrorKind { + fn from(e: reqwest::Error) -> Self { + Self::ReqwestError(e) + } +} + +impl From for ErrorKind { + fn from(e: hubcaps::Error) -> Self { + Self::HubcapsError(e) + } +} + +impl From<(String, url::ParseError)> for ErrorKind { + fn from(value: (String, url::ParseError)) -> Self { + Self::UrlParseError(value.0, (value.1, None)) + } +} + +impl From<(String, url::ParseError, fast_chemail::ParseError)> for ErrorKind { + fn from(value: (String, url::ParseError, fast_chemail::ParseError)) -> Self { + Self::UrlParseError(value.0, (value.1, Some(value.2))) + } +} + +impl From for ErrorKind { + fn from(e: InvalidHeaderValue) -> Self { + Self::InvalidHeader(e) + } +} + +impl From for ErrorKind { + fn from(e: glob::PatternError) -> Self { + Self::InvalidGlobPattern(e) + } +} + +impl From for ErrorKind { + fn from(_: Infallible) -> Self { + // tautological + unreachable!() + } +} diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs new file mode 100644 index 0000000..db0a166 --- /dev/null +++ b/lychee-lib/src/types/mod.rs @@ -0,0 +1,13 @@ +#![allow(unreachable_pub)] + +mod error; +mod request; +mod response; +mod status; + +pub use error::ErrorKind; +pub use request::Request; +pub use response::{Response, ResponseBody}; +pub use status::Status; + +pub type Result = std::result::Result; diff --git a/lychee-lib/src/types/request.rs b/lychee-lib/src/types/request.rs new file mode 100644 index 0000000..7b9342f --- /dev/null +++ b/lychee-lib/src/types/request.rs @@ -0,0 +1,41 @@ +use std::{convert::TryFrom, fmt::Display}; + +use crate::{ErrorKind, Input, Uri}; + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub struct Request { + pub uri: Uri, + pub source: Input, +} + +impl Request { + #[inline] + #[must_use] + pub const fn new(uri: Uri, source: Input) -> Self { + Request { uri, source } + } +} + +impl Display for Request { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.uri, self.source) + } +} + +impl TryFrom for Request { + type Error = ErrorKind; + + fn try_from(s: String) -> Result { + let uri = Uri::try_from(s.as_str())?; + Ok(Request::new(uri, Input::String(s))) + } +} + +impl TryFrom<&str> for Request { + type Error = ErrorKind; + + fn try_from(s: &str) -> Result { + let uri = Uri::try_from(s)?; + Ok(Request::new(uri, Input::String(s.to_owned()))) + } +} diff --git a/lychee-lib/src/types/response.rs b/lychee-lib/src/types/response.rs new file mode 100644 index 0000000..8f3b64d --- /dev/null +++ b/lychee-lib/src/types/response.rs @@ -0,0 +1,65 @@ +use std::fmt::Display; + +use serde::Serialize; + +use crate::{Input, Status, Uri}; + +#[derive(Debug)] +pub struct Response(pub Input, pub ResponseBody); + +impl Response { + #[inline] + #[must_use] + pub const fn new(uri: Uri, status: Status, source: Input) -> Self { + Response(source, ResponseBody { uri, status }) + } + + #[inline] + #[must_use] + pub const fn status(&self) -> &Status { + &self.1.status + } +} + +impl Display for Response { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(&self.1, f) + } +} + +impl Serialize for Response { + fn serialize(&self, s: S) -> Result + where + S: serde::Serializer, + { + ::serialize(&self.1, s) + } +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, Serialize, Hash, PartialEq, Eq)] +pub struct ResponseBody { + #[serde(flatten)] + pub uri: Uri, + pub status: Status, +} + +impl Display for ResponseBody { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ResponseBody { + ref uri, + ref status, + } = self; + + // TODO: Other errors? + let metadata = match status { + Status::Ok(code) | Status::Redirected(code) => { + format!(" [{}]", code) + } + Status::Timeout(Some(code)) => format!(" [{}]", code), + Status::Error(e) => format!(" ({})", e), + _ => "".to_owned(), + }; + write!(f, "{} {}{}", status.icon(), uri, metadata) + } +} diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs new file mode 100644 index 0000000..40d5699 --- /dev/null +++ b/lychee-lib/src/types/status.rs @@ -0,0 +1,127 @@ +use std::{collections::HashSet, fmt::Display}; + +use http::StatusCode; +use reqwest::Response; +use serde::{Serialize, Serializer}; + +use crate::ErrorKind; + +const ICON_OK: &str = "\u{2714}"; // ✔ +const ICON_REDIRECTED: &str = "\u{21c4}"; // ⇄ +const ICON_EXCLUDED: &str = "\u{003f}"; // ? +const ICON_ERROR: &str = "\u{2717}"; // ✗ +const ICON_TIMEOUT: &str = "\u{29d6}"; // ⧖ + +/// Response status of the request. +#[allow(variant_size_differences)] +#[derive(Debug, Hash, PartialEq, Eq)] +pub enum Status { + /// Request was successful + Ok(StatusCode), + /// Failed request + Error(Box), + /// Request timed out + Timeout(Option), + /// Got redirected to different resource + Redirected(StatusCode), + /// Resource was excluded from checking + Excluded, +} + +impl Display for Status { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Status::Ok(c) => write!(f, "OK ({})", c), + Status::Redirected(c) => write!(f, "Redirect ({})", c), + Status::Excluded => f.write_str("Excluded"), + Status::Error(e) => write!(f, "Failed: {}", e), + Status::Timeout(Some(c)) => write!(f, "Timeout ({})", c), + Status::Timeout(None) => f.write_str("Timeout"), + } + } +} + +impl Serialize for Status { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.collect_str(self) + } +} + +impl Status { + #[allow(clippy::missing_panics_doc)] + #[must_use] + pub fn new(response: &Response, accepted: Option>) -> Self { + let code = response.status(); + + if let Some(true) = accepted.map(|a| a.contains(&code)) { + Self::Ok(code) + } else { + match response.error_for_status_ref() { + Ok(_) if code.is_success() => Self::Ok(code), + Ok(_) if code.is_redirection() => Self::Redirected(code), + Err(e) => e.into(), + Ok(_) => unreachable!(), + } + } + } + + #[inline] + #[must_use] + pub const fn is_success(&self) -> bool { + matches!(self, Status::Ok(_)) + } + + #[inline] + #[must_use] + pub const fn is_failure(&self) -> bool { + matches!(self, Status::Error(_)) + } + + #[inline] + #[must_use] + pub const fn is_excluded(&self) -> bool { + matches!(self, Status::Excluded) + } + + #[inline] + #[must_use] + pub const fn is_timeout(&self) -> bool { + matches!(self, Status::Timeout(_)) + } + + #[must_use] + pub const fn icon(&self) -> &str { + match self { + Status::Ok(_) => ICON_OK, + Status::Redirected(_) => ICON_REDIRECTED, + Status::Excluded => ICON_EXCLUDED, + Status::Error(_) => ICON_ERROR, + Status::Timeout(_) => ICON_TIMEOUT, + } + } +} + +impl From for Status { + fn from(e: ErrorKind) -> Self { + Self::Error(Box::new(e)) + } +} + +impl From for Status { + fn from(e: reqwest::Error) -> Self { + if e.is_timeout() { + Self::Timeout(e.status()) + } else { + Self::Error(Box::new(ErrorKind::ReqwestError(e))) + } + } +} + +impl From for Status { + fn from(e: hubcaps::Error) -> Self { + Self::Error(Box::new(e.into())) + } +} diff --git a/lychee-lib/src/uri.rs b/lychee-lib/src/uri.rs new file mode 100644 index 0000000..92e51ad --- /dev/null +++ b/lychee-lib/src/uri.rs @@ -0,0 +1,214 @@ +use std::{convert::TryFrom, fmt::Display, net::IpAddr}; + +use fast_chemail::parse_email; +use serde::{Deserialize, Serialize}; +use url::Url; + +use crate::{ErrorKind, Result}; + +/// Lychee's own representation of a URI, which encapsulates all support formats. +/// +/// If the scheme is `mailto`, it's a mail address. +/// Otherwise it's treated as a website URL. +#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct Uri { + /// Website URL or mail address + pub(crate) url: Url, +} + +impl Uri { + /// Returns the string representation of the `Uri`. + /// + /// If it's an email address, returns the string with scheme stripped. + /// Otherwise returns the string as-is. + #[inline] + #[must_use] + pub fn as_str(&self) -> &str { + self.url.as_ref().trim_start_matches("mailto:") + } + + #[inline] + #[must_use] + pub fn scheme(&self) -> &str { + self.url.scheme() + } + + #[inline] + #[must_use] + pub fn domain(&self) -> Option<&str> { + self.url.domain() + } + + #[inline] + #[must_use] + pub fn path_segments(&self) -> Option> { + self.url.path_segments() + } + + #[must_use] + pub fn host_ip(&self) -> Option { + match self.url.host()? { + url::Host::Domain(_) => None, + url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), + url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), + } + } + + // TODO: Support GitLab etc. + pub(crate) fn extract_github(&self) -> Option<(&str, &str)> { + debug_assert!( + !matches!(self.scheme(), "mailto"), + "Should only be called on a Website type!" + ); + + // TODO: Support more patterns + if matches!( + self.domain()?, + "github.com" | "www.github.com" | "raw.githubusercontent.com" + ) { + let mut path = self.path_segments()?; + let owner = path.next()?; + let repo = path.next()?; + return Some((owner, repo)); + } + + None + } +} + +impl AsRef for Uri { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl From for Uri { + fn from(url: Url) -> Self { + Self { url } + } +} + +impl TryFrom for Uri { + type Error = ErrorKind; + + fn try_from(s: String) -> Result { + let s = s.trim_start_matches("mailto:"); + if let Err(mail_err) = parse_email(s) { + match Url::parse(s) { + Ok(uri) => Ok(uri.into()), + Err(url_err) => Err((s.to_owned(), url_err, mail_err).into()), + } + } else { + Ok(Url::parse(&(String::from("mailto:") + s)).unwrap().into()) + } + } +} + +impl TryFrom<&str> for Uri { + type Error = ErrorKind; + + fn try_from(s: &str) -> Result { + let s = s.trim_start_matches("mailto:"); + if let Err(mail_err) = parse_email(s) { + match Url::parse(s) { + Ok(uri) => Ok(uri.into()), + Err(url_err) => Err((s.to_owned(), url_err, mail_err).into()), + } + } else { + Ok(Url::parse(&(String::from("mailto:") + s)).unwrap().into()) + } + } +} + +impl Display for Uri { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(test)] +mod test { + use std::{ + convert::TryFrom, + net::{IpAddr, Ipv4Addr, Ipv6Addr}, + }; + + use pretty_assertions::assert_eq; + + use super::Uri; + use crate::test_utils::{mail, website}; + + #[test] + fn test_uri_from_str() { + assert!(Uri::try_from("").is_err()); + assert_eq!( + Uri::try_from("http://example.org"), + Ok(website("http://example.org")) + ); + assert_eq!( + Uri::try_from("http://example.org/@test/testing"), + Ok(website("http://example.org/@test/testing")) + ); + assert_eq!( + Uri::try_from("mail@example.org"), + Ok(mail("mail@example.org")) + ); + assert_eq!( + Uri::try_from("mailto:mail@example.org"), + Ok(mail("mail@example.org")) + ); + } + + #[test] + fn test_uri_host_ip_v4() { + assert_eq!( + website("http://127.0.0.1").host_ip(), + Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))) + ); + } + + #[test] + fn test_uri_host_ip_v6() { + assert_eq!( + website("https://[2020::0010]").host_ip(), + Some(IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10))) + ); + } + + #[test] + fn test_uri_host_ip_no_ip() { + assert!(website("https://some.cryptic/url").host_ip().is_none()); + } + + #[test] + fn test_mail() { + assert_eq!( + website("http://127.0.0.1").host_ip(), + Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))) + ); + } + + #[test] + fn test_is_github() { + assert_eq!( + website("http://github.com/lycheeverse/lychee").extract_github(), + Some(("lycheeverse", "lychee")) + ); + + assert_eq!( + website("http://www.github.com/lycheeverse/lychee").extract_github(), + Some(("lycheeverse", "lychee")) + ); + + assert_eq!( + website("https://github.com/lycheeverse/lychee").extract_github(), + Some(("lycheeverse", "lychee")) + ); + + assert!( + website("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath") + .extract_github() + .is_none() + ); + } +} diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000..2bf5ad0 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +stable diff --git a/src/bin/lychee/stats.rs b/src/bin/lychee/stats.rs deleted file mode 100644 index 2e04804..0000000 --- a/src/bin/lychee/stats.rs +++ /dev/null @@ -1,182 +0,0 @@ -use console::style; -use pad::{Alignment, PadStr}; -use serde::Serialize; - -use std::{ - collections::{HashMap, HashSet}, - fmt::{self, Display}, -}; - -use lychee::{self, collector::Input, Response, Status}; - -// Maximum padding for each entry in the final statistics output -const MAX_PADDING: usize = 20; - -pub fn color_response(response: &Response) -> String { - let out = match response.status { - Status::Ok(_) => style(response).green().bright(), - Status::Redirected(_) => style(response), - Status::Excluded => style(response).dim(), - Status::Timeout(_) => style(response).yellow().bright(), - Status::Error(_, _) => style(response).red().bright(), - }; - out.to_string() -} - -#[derive(Serialize)] -pub struct ResponseStats { - total: usize, - successful: usize, - failures: usize, - timeouts: usize, - redirects: usize, - excludes: usize, - errors: usize, - fail_map: HashMap>, -} - -impl ResponseStats { - pub fn new() -> Self { - let fail_map = HashMap::new(); - ResponseStats { - total: 0, - successful: 0, - failures: 0, - timeouts: 0, - redirects: 0, - excludes: 0, - errors: 0, - fail_map, - } - } - - pub fn add(&mut self, response: Response) { - self.total += 1; - match response.status { - Status::Error(_, _) => self.failures += 1, - Status::Timeout(_) => self.timeouts += 1, - Status::Redirected(_) => self.redirects += 1, - Status::Excluded => self.excludes += 1, - _ => self.successful += 1, - } - - if matches!( - response.status, - Status::Error(_, _) | Status::Timeout(_) | Status::Redirected(_) - ) { - let fail = self.fail_map.entry(response.source.clone()).or_default(); - fail.insert(response); - }; - } - - pub fn is_success(&self) -> bool { - self.total == self.successful + self.excludes - } - - pub fn is_empty(&self) -> bool { - self.total == 0 - } -} - -fn write_stat(f: &mut fmt::Formatter, title: &str, stat: usize, newline: bool) -> fmt::Result { - let fill = title.chars().count(); - f.write_str(title)?; - f.write_str( - &stat - .to_string() - .pad(MAX_PADDING - fill, '.', Alignment::Right, false), - )?; - - if newline { - f.write_str("\n")?; - } - - Ok(()) -} - -impl Display for ResponseStats { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let separator = "-".repeat(MAX_PADDING + 1); - - writeln!(f, "📝 Summary")?; - writeln!(f, "{}", separator)?; - write_stat(f, "🔍 Total", self.total, true)?; - write_stat(f, "✅ Successful", self.successful, true)?; - write_stat(f, "⏳ Timeouts", self.timeouts, true)?; - write_stat(f, "🔀 Redirected", self.redirects, true)?; - write_stat(f, "👻 Excluded", self.excludes, true)?; - write_stat(f, "🚫 Errors", self.errors + self.failures, false)?; - - for (input, responses) in &self.fail_map { - // Using leading newlines over trailing ones (e.g. `writeln!`) - // lets us avoid extra newlines without any additional logic. - write!(f, "\n\nErrors in {}", input)?; - for response in responses { - write!(f, "\n{}", color_response(response))? - } - } - - Ok(()) - } -} - -#[cfg(test)] -mod test_super { - use lychee::{test_utils::website, Status}; - - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn test_stats_is_empty() { - let mut stats = ResponseStats::new(); - assert!(stats.is_empty()); - - stats.add(Response { - uri: website("http://example.org/ok"), - status: Status::Ok(http::StatusCode::OK), - source: Input::Stdin, - }); - - assert!(!stats.is_empty()); - } - - #[test] - fn test_stats() { - let mut stats = ResponseStats::new(); - stats.add(Response { - uri: website("http://example.org/ok"), - status: Status::Ok(http::StatusCode::OK), - source: Input::Stdin, - }); - stats.add(Response { - uri: website("http://example.org/failed"), - status: Status::Error("".to_string(), Some(http::StatusCode::BAD_GATEWAY)), - source: Input::Stdin, - }); - stats.add(Response { - uri: website("http://example.org/redirect"), - status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), - source: Input::Stdin, - }); - let mut expected_map = HashMap::new(); - expected_map.insert( - Input::Stdin, - vec![ - Response { - uri: website("http://example.org/failed"), - status: Status::Error("".to_string(), Some(http::StatusCode::BAD_GATEWAY)), - source: Input::Stdin, - }, - Response { - uri: website("http://example.org/redirect"), - status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), - source: Input::Stdin, - }, - ] - .into_iter() - .collect::>(), - ); - assert_eq!(stats.fail_map, expected_map); - } -} diff --git a/src/client.rs b/src/client.rs deleted file mode 100644 index b9f0220..0000000 --- a/src/client.rs +++ /dev/null @@ -1,519 +0,0 @@ -use anyhow::{anyhow, bail, Context, Result}; -use check_if_email_exists::{check_email, CheckEmailInput}; -use derive_builder::Builder; -use headers::{HeaderMap, HeaderValue}; -use hubcaps::{Credentials, Github}; -use regex::{Regex, RegexSet}; -use reqwest::header; -use std::convert::TryInto; -use std::{collections::HashSet, time::Duration}; -use tokio::time::sleep; -use url::Url; - -use crate::filter::Excludes; -use crate::filter::Filter; -use crate::filter::Includes; -use crate::quirks::Quirks; -use crate::types::{Response, Status}; -use crate::uri::Uri; -use crate::Request; - -const VERSION: &str = env!("CARGO_PKG_VERSION"); -const DEFAULT_MAX_REDIRECTS: usize = 5; - -#[derive(Debug, Clone)] -pub struct Client { - /// The underlying reqwest client instance that handles the HTTP requests - reqwest_client: reqwest::Client, - /// Github API client - github: Option, - /// Filtered domain handling - filter: Filter, - /// The default request HTTP method to use - method: reqwest::Method, - /// The set of accepted HTTP status codes for valid URIs - accepted: Option>, - /// Override behavior for certain known issues with URIs - quirks: Quirks, -} - -/// A link checker using an API token for Github links -/// otherwise a normal HTTP client. -#[derive(Builder, Debug)] -#[builder(build_fn(skip))] -#[builder(setter(into))] -#[builder(name = "ClientBuilder")] -pub struct ClientBuilderInternal { - /// Set an optional Github token. - /// This allows for more requests before - /// getting rate-limited. - github_token: Option, - /// Check links matching this set of regular expressions - includes: Option, - /// Exclude links matching this set of regular expressions - excludes: Option, - /// Exclude all private network addresses - exclude_all_private: bool, - /// Exclude private IP addresses - exclude_private_ips: bool, - /// Exclude link-local IPs - exclude_link_local_ips: bool, - /// Exclude loopback IP addresses (e.g. 127.0.0.1) - exclude_loopback_ips: bool, - /// Don't check mail addresses - exclude_mail: bool, - /// Maximum number of redirects before returning error - max_redirects: usize, - /// User agent used for checking links - user_agent: String, - /// Ignore SSL errors - allow_insecure: bool, - /// Allowed URI scheme (e.g. https, http). - /// This excludes all links from checking, which - /// don't specify that scheme in the URL. - scheme: Option, - /// Map of headers to send to each resource. - /// This allows working around validation issues - /// on some websites. - custom_headers: HeaderMap, - /// Request method (e.g. `GET` or `HEAD`) - method: reqwest::Method, - /// Set of accepted return codes / status codes - accepted: Option>, - /// Response timeout per request - timeout: Option, -} - -impl ClientBuilder { - fn build_excludes(&mut self) -> Excludes { - // exclude_all_private option turns on all "private" excludes, - // including private IPs, link-local IPs and loopback IPs - let enable_exclude = |opt| opt || self.exclude_all_private.unwrap_or_default(); - - Excludes { - regex: self.excludes.clone().unwrap_or_default(), - private_ips: enable_exclude(self.exclude_private_ips.unwrap_or_default()), - link_local_ips: enable_exclude(self.exclude_link_local_ips.unwrap_or_default()), - loopback_ips: enable_exclude(self.exclude_loopback_ips.unwrap_or_default()), - mail: self.exclude_mail.unwrap_or_default(), - } - } - - fn build_includes(&mut self) -> Includes { - Includes { - regex: self.includes.clone().unwrap_or_default(), - } - } - - /// The build method instantiates the client. - pub fn build(&mut self) -> Result { - let mut headers = HeaderMap::new(); - - // Faking the user agent is necessary for some websites, unfortunately. - // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). - let user_agent = self - .user_agent - .clone() - .unwrap_or_else(|| format!("lychee/{}", VERSION)); - - headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?); - headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?); - if let Some(custom) = &self.custom_headers { - headers.extend(custom.clone()); - } - - let allow_insecure = self.allow_insecure.unwrap_or(false); - let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS); - - let builder = reqwest::ClientBuilder::new() - .gzip(true) - .default_headers(headers) - .danger_accept_invalid_certs(allow_insecure) - .redirect(reqwest::redirect::Policy::limited(max_redirects)); - - let builder = match self.timeout { - Some(t) => builder - .timeout(t.ok_or_else(|| anyhow!("cannot parse timeout: {:?}", self.timeout))?), - None => builder, - }; - - let reqwest_client = builder.build()?; - - let token: Option = self.github_token.clone().unwrap_or_default(); - let github = match token { - Some(token) => { - if token.is_empty() { - None - } else { - let github = Github::new(user_agent, Credentials::Token(token))?; - Some(github) - } - } - None => None, - }; - - let scheme = self.scheme.clone().unwrap_or(None); - let scheme = scheme.map(|s| s.to_lowercase()); - - let includes = self.build_includes(); - let excludes = self.build_excludes(); - - let filter = Filter::new(Some(includes), Some(excludes), scheme); - - let quirks = Quirks::default(); - - Ok(Client { - reqwest_client, - github, - filter, - quirks, - method: self.method.clone().unwrap_or(reqwest::Method::GET), - accepted: self.accepted.clone().unwrap_or(None), - }) - } -} - -impl Client { - pub async fn check>(&self, request: T) -> Result { - let request: Request = match request.try_into() { - Ok(request) => request, - Err(_e) => bail!("Invalid URI"), - }; - if self.filter.excluded(&request) { - return Ok(Response::new(request.uri, Status::Excluded, request.source)); - } - - let status = self.check_main(&request).await?; - Ok(Response::new(request.uri, status, request.source)) - } - - async fn check_main(&self, request: &Request) -> Result { - Ok(match request.uri { - Uri::Website(ref url) => self.check_website(&url).await, - Uri::Mail(ref address) => { - // TODO: We should not be using a HTTP status code for mail - match self.check_mail(&address).await { - true => Status::Ok(http::StatusCode::OK), - false => Status::Error(format!("Invalid mail address: {}", address), None), - } - } - }) - } - - pub async fn check_website(&self, url: &Url) -> Status { - let mut retries: i64 = 3; - let mut wait: u64 = 1; - let status = loop { - let res = self.check_default(&url).await; - match res.is_success() { - true => return res, - false => { - if retries > 0 { - retries -= 1; - sleep(Duration::from_secs(wait)).await; - wait *= 2; - } else { - break res; - } - } - } - }; - // Pull out the heavy weapons in case of a failed normal request. - // This could be a Github URL and we run into the rate limiter. - if let Ok((owner, repo)) = self.extract_github(url.as_str()) { - return self.check_github(owner, repo).await; - } - - status - } - - async fn check_github(&self, owner: String, repo: String) -> Status { - match &self.github { - Some(github) => { - let repo = github.repo(owner, repo).get().await; - match repo { - Err(e) => Status::Error(e.to_string(), None), - Ok(_) => Status::Ok(http::StatusCode::OK), - } - } - None => Status::Error( - "GitHub token not specified. To check GitHub links reliably, \ - use `--github-token` flag / `GITHUB_TOKEN` env var." - .to_string(), - None, - ), - } - } - - async fn check_default(&self, url: &Url) -> Status { - let request = match self - .reqwest_client - .request(self.method.clone(), url.to_owned()) - .build() - { - Ok(r) => r, - Err(e) => return e.into(), - }; - let request = self.quirks.apply(request); - - match self.reqwest_client.execute(request).await { - Ok(response) => Status::new(response.status(), self.accepted.clone()), - Err(e) => e.into(), - } - } - - fn extract_github(&self, url: &str) -> Result<(String, String)> { - let re = Regex::new(r#"^(https?://)?(www.)?github.com/(?P[^/]*)/(?P[^/]*)"#)?; - let caps = re.captures(&url).context("Invalid capture")?; - let owner = caps.name("owner").context("Cannot capture owner")?; - let repo = caps.name("repo").context("Cannot capture repo")?; - Ok((owner.as_str().into(), repo.as_str().into())) - } - - pub async fn check_mail(&self, address: &str) -> bool { - let input = CheckEmailInput::new(vec![address.to_string()]); - let results = check_email(&input).await; - let result = results.get(0); - match result { - None => false, - Some(result) => { - // Accept everything that is not invalid - !matches!( - result.is_reachable, - check_if_email_exists::Reachable::Invalid - ) - } - } - } -} - -/// A convenience function to check a single URI -/// This is the most simple link check and avoids having to create a client manually. -/// For more complex scenarios, look into using the `ClientBuilder` instead. -pub async fn check>(request: T) -> Result { - let client = ClientBuilder::default().build()?; - Ok(client.check(request).await?) -} - -#[cfg(test)] -mod test { - use super::*; - use pretty_assertions::assert_eq; - use std::time::{Duration, Instant}; - use wiremock::matchers::method; - use wiremock::{Mock, MockServer, ResponseTemplate}; - - #[tokio::test] - async fn test_nonexistent() { - let template = ResponseTemplate::new(404); - let mock_server = MockServer::start().await; - Mock::given(method("GET")) - .respond_with(template) - .mount(&mock_server) - .await; - - let res = ClientBuilder::default() - .build() - .unwrap() - .check(mock_server.uri()) - .await - .unwrap(); - assert!(res.status.is_failure()); - } - - #[tokio::test] - async fn test_nonexistent_with_path() { - let res = ClientBuilder::default() - .build() - .unwrap() - .check("http://127.0.0.1/invalid") - .await - .unwrap(); - assert!(res.status.is_failure()); - } - - #[tokio::test] - async fn test_exponential_backoff() { - let template = ResponseTemplate::new(404); - let mock_server = MockServer::start().await; - Mock::given(method("GET")) - .respond_with(template) - .mount(&mock_server) - .await; - - let start = Instant::now(); - let res = ClientBuilder::default() - .build() - .unwrap() - .check(mock_server.uri()) - .await - .unwrap(); - let end = start.elapsed(); - - assert!(matches!(res.status, Status::Error(_, _))); - - // on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs) - assert!(end.as_secs() >= 7); - assert!(end.as_secs() <= 8); - } - - #[test] - fn test_is_github() { - assert_eq!( - ClientBuilder::default() - .build() - .unwrap() - .extract_github("github.com/lycheeverse/lychee") - .unwrap(), - ("lycheeverse".into(), "lychee".into()) - ); - assert_eq!( - ClientBuilder::default() - .build() - .unwrap() - .extract_github("www.github.com/lycheeverse/lychee") - .unwrap(), - ("lycheeverse".into(), "lychee".into()) - ); - assert_eq!( - ClientBuilder::default() - .build() - .unwrap() - .extract_github("https://github.com/lycheeverse/lychee") - .unwrap(), - ("lycheeverse".into(), "lychee".into()) - ); - assert!(ClientBuilder::default() - .build() - .unwrap() - .extract_github("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath") - .is_err()); - } - #[tokio::test] - async fn test_github() { - assert!(ClientBuilder::default() - .build() - .unwrap() - .check("https://github.com/lycheeverse/lychee") - .await - .unwrap() - .status - .is_success()); - } - - #[tokio::test] - async fn test_github_nonexistent() { - let res = ClientBuilder::default() - .build() - .unwrap() - .check("https://github.com/lycheeverse/not-lychee") - .await - .unwrap() - .status; - assert!(res.is_failure()); - } - - #[tokio::test] - async fn test_youtube() { - // This is applying a quirk. See the quirks module. - let client: Client = ClientBuilder::default().build().unwrap(); - assert!(client.check("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7") - .await - .unwrap() - .status.is_success()); - assert!(client.check("https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7") - .await - .unwrap() - .status.is_failure()); - } - - #[tokio::test] - async fn test_non_github() { - let template = ResponseTemplate::new(200); - let mock_server = MockServer::start().await; - Mock::given(method("GET")) - .respond_with(template) - .mount(&mock_server) - .await; - - let res = ClientBuilder::default() - .build() - .unwrap() - .check(mock_server.uri()) - .await - .unwrap() - .status; - assert!(res.is_success()); - } - - #[tokio::test] - async fn test_invalid_ssl() { - let res = ClientBuilder::default() - .build() - .unwrap() - .check("https://expired.badssl.com/") - .await - .unwrap(); - assert!(res.status.is_failure()); - - // Same, but ignore certificate error - let res = ClientBuilder::default() - .allow_insecure(true) - .build() - .unwrap() - .check("https://expired.badssl.com/") - .await - .unwrap(); - assert!(res.status.is_success()); - } - - #[tokio::test] - async fn test_custom_headers() { - let res = ClientBuilder::default() - .build() - .unwrap() - .check("https://crates.io/crates/lychee") - .await - .unwrap(); - assert!(res.status.is_failure()); - - // Try again, but with a custom header. - // For example, crates.io requires a custom accept header. - // See https://github.com/rust-lang/crates.io/issues/788 - let mut custom = HeaderMap::new(); - custom.insert(header::ACCEPT, "text/html".parse().unwrap()); - let res = ClientBuilder::default() - .custom_headers(custom) - .build() - .unwrap() - .check("https://crates.io/crates/lychee") - .await - .unwrap(); - assert!(res.status.is_success()); - } - - #[tokio::test] - async fn test_timeout() { - // Note: this checks response timeout, not connect timeout. - // To check connect timeout, we'd have to do something more involved, - // see: https://github.com/LukeMathWalker/wiremock-rs/issues/19 - let mock_delay = Duration::from_millis(20); - let checker_timeout = Duration::from_millis(10); - assert!(mock_delay > checker_timeout); - - let template = ResponseTemplate::new(200).set_delay(mock_delay); - let mock_server = MockServer::start().await; - Mock::given(method("GET")) - .respond_with(template) - .mount(&mock_server) - .await; - - let client = ClientBuilder::default() - .timeout(checker_timeout) - .build() - .unwrap(); - - let resp = client.check(mock_server.uri()).await.unwrap(); - assert!(matches!(resp.status, Status::Timeout(_))); - } -} diff --git a/src/filter/includes.rs b/src/filter/includes.rs deleted file mode 100644 index 5ff0751..0000000 --- a/src/filter/includes.rs +++ /dev/null @@ -1,32 +0,0 @@ -use regex::RegexSet; - -/// Include configuration for the link checker. -/// You can include links based on regex patterns -#[derive(Clone, Debug)] -pub struct Includes { - pub regex: Option, -} - -impl Default for Includes { - fn default() -> Self { - Self { regex: None } - } -} - -impl Includes { - pub fn regex(&self, input: &str) -> bool { - if let Some(includes) = &self.regex { - if includes.is_match(input) { - return true; - } - } - false - } - - pub fn is_empty(&self) -> bool { - match &self.regex { - None => true, - Some(regex_set) => regex_set.is_empty(), - } - } -} diff --git a/src/filter/mod.rs b/src/filter/mod.rs deleted file mode 100644 index 66080a8..0000000 --- a/src/filter/mod.rs +++ /dev/null @@ -1,315 +0,0 @@ -mod excludes; -mod includes; - -pub use excludes::Excludes; -pub use includes::Includes; - -use crate::uri::Uri; -use crate::Request; - -/// A generic URI filter -/// Used to decide if a given URI should be checked or skipped -#[derive(Clone, Debug)] -pub struct Filter { - includes: Includes, - excludes: Excludes, - scheme: Option, -} - -impl Filter { - pub fn new( - includes: Option, - excludes: Option, - scheme: Option, - ) -> Self { - let includes = match includes { - Some(includes) => includes, - None => Includes::default(), - }; - let excludes = match excludes { - Some(excludes) => excludes, - None => Excludes::default(), - }; - Filter { - includes, - excludes, - scheme, - } - } - - pub fn excluded(&self, request: &Request) -> bool { - // Skip mail? - if self.excludes.is_mail_excluded() && matches!(request.uri, Uri::Mail(_)) { - return true; - } - // Skip specific IP address? - if self.excludes.ip(&request.uri) { - return true; - } - // No regex includes/excludes at all? - if self.includes.is_empty() && self.excludes.is_empty() { - // Not excluded unless it's a known false positive - return self.excludes.false_positive(request.uri.as_str()); - } - // Includes take precedence over excludes - if self.includes.regex(request.uri.as_str()) { - return false; - } - // Exclude well-known false-positives. - // This is done after checking includes to allow for user-overwrites. - if self.excludes.false_positive(request.uri.as_str()) { - return true; - } - // In case we have includes and no excludes, - // skip everything that was not included - if !self.includes.is_empty() && self.excludes.is_empty() { - return true; - } - // We have no includes. Check regex excludes - if self.excludes.regex(request.uri.as_str()) { - return true; - } - // URI scheme excluded? - if self.scheme.is_none() { - return false; - } - request.uri.scheme() != self.scheme - } -} - -#[cfg(test)] -mod test { - // Note: the standard library as of Rust stable 1.47.0 does not expose - // "link-local" or "private" IPv6 checks. However, one might argue - // that these concepts do exist in IPv6, albeit the naming is different. - // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 - // See: https://en.wikipedia.org/wiki/Private_network#IPv6 - // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local - const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; - const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; - const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; - - const V4_LOOPBACK: &str = "http://127.0.0.1"; - const V6_LOOPBACK: &str = "http://[::1]"; - - const V4_LINK_LOCAL: &str = "http://169.254.0.1"; - - // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) - const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; - const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; - - use regex::RegexSet; - use reqwest::Url; - - use super::*; - use pretty_assertions::assert_eq; - - use crate::{test_utils::website, Input}; - - /// Helper method to convert a string into a Request - /// Note: This panics on error, so it should only be used for testing - pub fn request(url: &str) -> Request { - Request::new(website(url), Input::Stdin) - } - - #[test] - fn test_const_sanity() { - let get_host = |s| { - Url::parse(s) - .expect("Expected valid URL") - .host() - .expect("Expected host address") - .to_owned() - }; - let into_v4 = |host| match host { - url::Host::Ipv4(ipv4) => ipv4, - _ => panic!("Not IPv4"), - }; - let into_v6 = |host| match host { - url::Host::Ipv6(ipv6) => ipv6, - _ => panic!("Not IPv6"), - }; - - assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); - - assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); - assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); - - assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); - } - - #[test] - fn test_includes_and_excludes_empty() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let includes = Some(Includes::default()); - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); - assert_eq!(filter.excluded(&request("https://example.org")), false); - } - - #[test] - fn test_false_positives() { - let includes = Some(Includes::default()); - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); - assert_eq!( - filter.excluded(&request("http://www.w3.org/1999/xhtml")), - true - ); - assert_eq!(filter.excluded(&request("https://example.org")), false); - } - - #[test] - fn test_overwrite_false_positives() { - let includes = Some(Includes { - regex: Some(RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap()), - }); - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); - assert_eq!( - filter.excluded(&request("http://www.w3.org/1999/xhtml")), - false - ); - } - - #[test] - fn test_include_regex() { - let includes = Some(Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), - }); - let filter = Filter::new(includes, None, None); - - // Only the requests matching the include set will be checked - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - assert_eq!(filter.excluded(&request("https://example.org")), true); - } - - #[test] - fn test_exclude_mail() { - let excludes = Excludes { - mail: true, - ..Default::default() - }; - let filter = Filter::new(None, Some(excludes), None); - - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("mail@example.org".to_string()), - Input::Stdin, - )), - true - ); - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("foo@bar.dev".to_string()), - Input::Stdin, - )), - true - ); - assert_eq!(filter.excluded(&request("http://bar.dev")), false); - } - - #[test] - fn test_exclude_regex() { - let excludes = Excludes { - regex: Some( - RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), - ), - ..Default::default() - }; - let filter = Filter::new(None, Some(excludes), None); - - assert_eq!(filter.excluded(&request("http://github.com")), true); - assert_eq!(filter.excluded(&request("http://exclude.org")), true); - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("mail@example.org".to_string()), - Input::Stdin, - )), - true - ); - - assert_eq!(filter.excluded(&request("http://bar.dev")), false); - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("foo@bar.dev".to_string()), - Input::Stdin, - )), - false - ); - } - #[test] - fn test_exclude_include_regex() { - let includes = Some(Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), - }); - let excludes = Excludes { - regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), - ..Default::default() - }; - - let filter = Filter::new(includes, Some(excludes), None); - - // Includes take preference over excludes - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - - assert_eq!(filter.excluded(&request("https://example.org")), true); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - - #[test] - fn test_excludes_no_private_ips_by_default() { - let filter = Filter::new(None, None, None); - - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false); - assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false); - assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false); - assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false); - } - - #[test] - fn test_exclude_private_ips() { - let mut filter = Filter::new(None, None, None); - filter.excludes.private_ips = true; - - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true); - } - - #[test] - fn test_exclude_link_local() { - let mut filter = Filter::new(None, None, None); - filter.excludes.link_local_ips = true; - assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true); - } - - #[test] - fn test_exclude_loopback() { - let mut filter = Filter::new(None, None, None); - filter.excludes.loopback_ips = true; - - assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true); - assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true); - } - - #[test] - fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { - let mut filter = Filter::new(None, None, None); - filter.excludes.private_ips = true; - filter.excludes.link_local_ips = true; - - // if these were pure IPv4, we would exclude - assert_eq!( - filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)), - false - ); - assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false); - } -} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 5dfbe56..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,58 +0,0 @@ -#[deny(missing_docs)] - -/** -* `lychee` is a library for checking links. -* "Hello world" example: -* ``` -* use std::error::Error; -* -* #[tokio::main] -* async fn main() -> Result<(), Box> { -* let response = lychee::check("https://github.com/lycheeverse/lychee").await?; -* println!("{}", response); -* Ok(()) -* } -* ``` -* -* For more specific use-cases you can build a lychee client yourself, -* using the `ClientBuilder` which can be used to -* configure and run your own link checker and grants full flexibility: -* -* ``` -* use lychee::{ClientBuilder, Status}; -* use std::error::Error; -* -* #[tokio::main] -* async fn main() -> Result<(), Box> { -* let client = ClientBuilder::default().build()?; -* let response = client.check("https://github.com/lycheeverse/lychee").await?; -* assert!(matches!(response.status, Status::Ok(_))); -* Ok(()) -* } -* ``` -*/ - -#[cfg(doctest)] -#[macro_use] -extern crate doc_comment; - -#[cfg(doctest)] -doctest!("../README.md"); - -mod client; -mod client_pool; -mod filter; -mod quirks; -mod types; -mod uri; - -pub mod collector; -pub mod extract; -pub mod test_utils; - -pub use client::check; -pub use client::ClientBuilder; -pub use client_pool::ClientPool; -pub use collector::Input; -pub use types::*; -pub use uri::Uri; diff --git a/src/test_utils.rs b/src/test_utils.rs deleted file mode 100644 index 9d6b429..0000000 --- a/src/test_utils.rs +++ /dev/null @@ -1,41 +0,0 @@ -use http::StatusCode; -use reqwest::Url; -use wiremock::matchers::path; -use wiremock::{Mock, MockServer, ResponseTemplate}; - -use crate::Uri; - -#[allow(unused)] -pub async fn get_mock_server(response_code: S) -> MockServer -where - S: Into, -{ - get_mock_server_with_content(response_code, None).await -} - -pub async fn get_mock_server_with_content(response_code: S, content: Option<&str>) -> MockServer -where - S: Into, -{ - let mock_server = MockServer::start().await; - let template = ResponseTemplate::new(response_code.into()); - - let template = if let Some(s) = content { - template.set_body_string(s) - } else { - template - }; - - Mock::given(path("/")) - .respond_with(template) - .mount(&mock_server) - .await; - - mock_server -} - -/// Helper method to convert a string into a URI -/// Note: This panics on error, so it should only be used for testing -pub fn website(url: &str) -> Uri { - Uri::Website(Url::parse(url).expect("Expected valid Website URI")) -} diff --git a/src/types.rs b/src/types.rs deleted file mode 100644 index 19af772..0000000 --- a/src/types.rs +++ /dev/null @@ -1,216 +0,0 @@ -use crate::{collector::Input, uri::Uri}; -use anyhow::anyhow; -use serde::{Serialize, Serializer}; -use std::{collections::HashSet, convert::TryFrom, fmt::Display}; - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub struct Request { - pub uri: Uri, - pub source: Input, -} - -impl Request { - pub fn new(uri: Uri, source: Input) -> Self { - Request { uri, source } - } -} - -impl Display for Request { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} ({})", self.uri, self.source) - } -} - -impl TryFrom for Request { - type Error = anyhow::Error; - - fn try_from(s: String) -> Result { - let uri = Uri::try_from(s.as_str())?; - Ok(Request::new(uri, Input::String(s))) - } -} - -impl TryFrom<&str> for Request { - type Error = anyhow::Error; - - fn try_from(s: &str) -> Result { - let uri = Uri::try_from(s)?; - Ok(Request::new(uri, Input::String(s.to_owned()))) - } -} - -/// Specifies how requests to websites will be made -pub(crate) enum RequestMethod { - Get, - Head, -} - -impl TryFrom for RequestMethod { - type Error = anyhow::Error; - fn try_from(value: String) -> Result { - match value.to_lowercase().as_ref() { - "get" => Ok(RequestMethod::Get), - "head" => Ok(RequestMethod::Head), - _ => Err(anyhow!("Only `get` and `head` allowed, got {}", value)), - } - } -} - -#[derive(Debug, PartialEq, Eq, Hash, Serialize)] -pub struct Response { - #[serde(flatten)] - pub uri: Uri, - pub status: Status, - #[serde(skip)] - pub source: Input, -} - -impl Response { - pub fn new(uri: Uri, status: Status, source: Input) -> Self { - Response { - uri, - status, - source, - } - } -} - -impl Display for Response { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let metadata = match &self.status { - Status::Ok(code) | Status::Redirected(code) => { - format!(" [{}]", code) - } - Status::Timeout(code) if code.is_some() => format!(" [{}]", code.unwrap()), - Status::Error(e, code) => { - if let Some(code) = code { - format!(" ({})", code) - } else { - format!(" ({})", e) - } - } - _ => "".to_string(), - }; - write!(f, "{} {}{}", self.status.icon(), self.uri, metadata) - } -} - -/// Response status of the request -#[derive(Debug, Hash, PartialEq, Eq)] -pub enum Status { - /// Request was successful - Ok(http::StatusCode), - /// Request failed with HTTP error code - Error(String, Option), - /// Request timed out - Timeout(Option), - /// Got redirected to different resource - Redirected(http::StatusCode), - /// Resource was excluded from checking - Excluded, -} - -impl Display for Status { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let out = match self { - Status::Ok(c) => format!("OK ({})", c), - Status::Redirected(c) => format!("Redirect ({})", c), - Status::Excluded => "Excluded".to_string(), - Status::Error(err, code) => { - if let Some(code) = code { - format!("Failed: {} ({})", err, code) - } else { - format!("Failed: {}", err) - } - } - Status::Timeout(Some(c)) => format!("Timeout ({})", c), - Status::Timeout(None) => "Timeout".to_string(), - }; - write!(f, "{}", out) - } -} - -impl Serialize for Status { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - serializer.collect_str(self) - } -} - -impl Status { - pub fn new(statuscode: http::StatusCode, accepted: Option>) -> Self { - if let Some(true) = accepted.map(|a| a.contains(&statuscode)) { - Status::Ok(statuscode) - } else if statuscode.is_success() { - Status::Ok(statuscode) - } else if statuscode.is_redirection() { - Status::Redirected(statuscode) - } else { - Status::Error("".into(), Some(statuscode)) - } - } - - pub fn is_success(&self) -> bool { - matches!(self, Status::Ok(_)) - } - - pub fn is_failure(&self) -> bool { - matches!(self, Status::Error(_, _)) - } - - pub fn is_excluded(&self) -> bool { - matches!(self, Status::Excluded) - } - - pub fn icon(&self) -> &str { - match self { - Status::Ok(_) => "✔", - Status::Redirected(_) => "⇄️", - Status::Excluded => "?", - Status::Error(_, _) => "✗", - Status::Timeout(_) => "⧖", - } - } -} - -impl From for Status { - fn from(e: reqwest::Error) -> Self { - if e.is_timeout() { - Status::Timeout(e.status()) - } else { - Status::Error(e.to_string(), e.status()) - } - } -} - -#[cfg(test)] -mod test { - use crate::test_utils::website; - use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; - - #[test] - fn test_uri_host_ip_v4() { - let uri = website("http://127.0.0.1"); - let ip = uri.host_ip().expect("Expected a valid IPv4"); - assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))); - } - - #[test] - fn test_uri_host_ip_v6() { - let uri = website("https://[2020::0010]"); - let ip = uri.host_ip().expect("Expected a valid IPv6"); - assert_eq!( - ip, - IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10)) - ); - } - - #[test] - fn test_uri_host_ip_no_ip() { - let uri = website("https://some.cryptic/url"); - let ip = uri.host_ip(); - assert!(ip.is_none()); - } -} diff --git a/src/uri.rs b/src/uri.rs deleted file mode 100644 index ae576b1..0000000 --- a/src/uri.rs +++ /dev/null @@ -1,135 +0,0 @@ -use anyhow::{bail, Result}; -use fast_chemail::is_valid_email; -use serde::{Deserialize, Serialize}; -use std::net::IpAddr; -use std::{convert::TryFrom, fmt::Display}; -use url::Url; - -/// Lychee's own representation of a URI, which encapsulates all support formats -#[derive(Clone, PartialOrd, Ord, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub enum Uri { - /// Website URL - Website(Url), - /// Mail address - Mail(String), -} - -impl Uri { - pub fn as_str(&self) -> &str { - match self { - Uri::Website(url) => url.as_str(), - Uri::Mail(address) => address.as_str(), - } - } - - pub fn scheme(&self) -> Option { - match self { - Uri::Website(url) => Some(url.scheme().to_string()), - Uri::Mail(_address) => None, - } - } - - pub fn host_ip(&self) -> Option { - match self { - Self::Website(url) => match url.host()? { - url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), - url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), - _ => None, - }, - Self::Mail(_) => None, - } - } -} - -fn is_internal_link(link: &str) -> bool { - // The first element should contain the Markdown file link - // @see https://www.markdownguide.org/basic-syntax/#links - let anchor_links = link.split('#').next().unwrap_or(""); - anchor_links.ends_with(".md") | anchor_links.ends_with(".markdown") -} - -impl TryFrom<&str> for Uri { - type Error = anyhow::Error; - - fn try_from(s: &str) -> Result { - // Check for internal Markdown links - let is_link_internal = is_internal_link(s); - // Remove the `mailto` scheme if it exists - // to avoid parsing it as a website URL. - let s = s.trim_start_matches("mailto:"); - - if let Ok(uri) = Url::parse(s) { - return Ok(Uri::Website(uri)); - } else if !is_link_internal && is_valid_email(&s) { - return Ok(Uri::Mail(s.to_string())); - } - bail!("Cannot convert to Uri") - } -} - -impl Display for Uri { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -#[cfg(test)] -mod test { - use crate::test_utils::website; - - use super::*; - use pretty_assertions::assert_eq; - use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; - - #[test] - fn test_uri_from_str() { - assert!(matches!(Uri::try_from(""), Err(_))); - assert_eq!( - Uri::try_from("http://example.org").unwrap(), - website("http://example.org") - ); - assert_eq!( - Uri::try_from("http://example.org/@test/testing").unwrap(), - website("http://example.org/@test/testing") - ); - assert_eq!( - Uri::try_from("mail@example.org").unwrap(), - Uri::Mail("mail@example.org".to_string()) - ); - assert_eq!( - Uri::try_from("mailto:mail@example.org").unwrap(), - Uri::Mail("mail@example.org".to_string()) - ); - } - - #[test] - fn test_uri_host_ip_v4() { - let uri = website("http://127.0.0.1"); - let ip = uri.host_ip().expect("Expected a valid IPv4"); - assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))); - } - - #[test] - fn test_uri_host_ip_v6() { - let uri = website("https://[2020::0010]"); - let ip = uri.host_ip().expect("Expected a valid IPv6"); - assert_eq!( - ip, - IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10)) - ); - } - - #[test] - fn test_uri_host_ip_no_ip() { - let uri = website("https://some.cryptic/url"); - let ip = uri.host_ip(); - assert!(ip.is_none()); - } - - #[test] - fn test_mail() { - let uri = website("http://127.0.0.1"); - let ip = uri.host_ip().expect("Expected a valid IPv4"); - assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))); - } -}