Major refactor of codebase (#208)

- The binary component and library component are separated as two
  packages in the same workspace.
  - `lychee` is the binary component, in `lychee-bin/*`.
  - `lychee-lib` is the library component, in `lychee-lib/*`.
  - Users can now install only `lychee-lib`, instead of both
    components, which requires fewer dependencies and allows
    faster compilation.
  - Dependencies for each component are adjusted and updated. E.g.,
    no CLI dependencies for `lychee-lib`.
  - CLI tests are moved only to `lychee`, as they have nothing to do
    with the library component.
- `Status::Error` is refactored to contain dedicated error enum,
  `ErrorKind`.
  - The motivation is to delay the formatting of errors to strings.
    Note that `e.to_string()` is not necessarily cheap (though
    trivial in many cases). The formatting is now delayed until the
    error needs to be displayed to users. So in some cases, if
    the error is never used, it means that it won't be formatted at
    all.
- Replaced `regex` based matching with one of the following:
  - Simple string equality test in the case of 'false positive'.
  - URL parsing based test, in the case of extracting repository and
    user name for GitHub links.
  - Either case would be much more efficient than `regex` based
    matching. First, there's no need to construct a state machine for
    regex. Second, URL is already verified and parsed on its creation,
    and extracting its components is fairly cheap. Also, this removes
    the dependency on `lazy-static` in `lychee-lib`.
- `types` module now has a sub-directory, and its components are now
  separated into their own modules (in that sub-directory).
- `lychee-lib::test_utils` module is only compiled for tests.
- `wiremock` is moved to `dev-dependency` as it's only needed for
  `test` modules.
- Dependencies are listed in alphabetical order.
- Imports are organized in the following fashion:
  - Imports from `std`
  - Imports from 3rd-party crates, and `lychee-lib`.
  - Imports from `crate::*` or `super::*`.
- No glob import.
- I followed suggestions from `cargo clippy`, with `clippy::all` and
  `clippy::pedantic`.

Co-authored-by: Lucius Hu <lebensterben@users.noreply.github.com>
This commit is contained in:
Lucius Hu 2021-04-14 19:24:11 -04:00 committed by GitHub
parent a3f62fc558
commit 228e5df6a3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
40 changed files with 2881 additions and 2196 deletions

View file

@ -53,11 +53,16 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: fetch
- name: cargo publish check
- name: cargo publish check lychee-lib
uses: actions-rs/cargo@v1
with:
command: publish
args: --dry-run
args: --dry-run --manifest-path lychee-lib/Cargo.toml
- name: cargo publish check lychee
uses: actions-rs/cargo@v1
with:
command: publish
args: --dry-run --manifest-path lychee-bin/Cargo.toml
publish:
if: startsWith(github.ref, 'refs/tags/')
@ -72,9 +77,17 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: fetch
- name: cargo publish
- name: cargo publish lychee-lib
uses: actions-rs/cargo@v1
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
with:
command: publish
args: --manifest-path lychee-lib/Cargo.toml
- name: cargo publish lychee
uses: actions-rs/cargo@v1
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
with:
command: publish
args: --manifest-path lychee-bin/Cargo.toml

125
Cargo.lock generated
View file

@ -1,5 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "adler"
version = "1.0.2"
@ -533,9 +535,9 @@ dependencies = [
[[package]]
name = "darling"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06d4a9551359071d1890820e3571252b91229e0712e7c36b08940e603c5a8fc"
checksum = "e9d6ddad5866bb2170686ed03f6839d31a76e5407d80b1c334a2c24618543ffa"
dependencies = [
"darling_core",
"darling_macro",
@ -543,9 +545,9 @@ dependencies = [
[[package]]
name = "darling_core"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b443e5fb0ddd56e0c9bfa47dc060c5306ee500cb731f2b91432dd65589a77684"
checksum = "a9ced1fd13dc386d5a8315899de465708cf34ee2a6d9394654515214e67bb846"
dependencies = [
"fnv",
"ident_case",
@ -557,9 +559,9 @@ dependencies = [
[[package]]
name = "darling_macro"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0220073ce504f12a70efc4e7cdaea9e9b1b324872e7ad96a208056d7a638b81"
checksum = "0a7a1445d54b2f9792e3b31a3e715feabbace393f38dc4ffd49d94ee9bc487d5"
dependencies = [
"darling_core",
"quote",
@ -1119,9 +1121,9 @@ dependencies = [
[[package]]
name = "httparse"
version = "1.3.5"
version = "1.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691"
checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589"
[[package]]
name = "httpdate"
@ -1284,9 +1286,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "js-sys"
version = "0.3.49"
version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc15e39392125075f60c95ba416f5381ff6c3a948ff02ab12464715adf56c821"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
dependencies = [
"wasm-bindgen",
]
@ -1341,9 +1343,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.91"
version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8916b1f6ca17130ec6568feccee27c156ad12037880833a3b842a823236502e7"
checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
[[package]]
name = "linked-hash-map"
@ -1362,9 +1364,9 @@ dependencies = [
[[package]]
name = "lock_api"
version = "0.4.2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd96ffd135b2fd7b973ac026d28085defbe8983df057ced3eb4f2130b0831312"
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176"
dependencies = [
"scopeguard",
]
@ -1394,40 +1396,55 @@ version = "0.6.0"
dependencies = [
"anyhow",
"assert_cmd",
"check-if-email-exists",
"console",
"headers",
"http",
"indicatif",
"lazy_static",
"lychee-lib",
"openssl-sys",
"pad",
"predicates",
"pretty_assertions",
"regex",
"reqwest",
"ring",
"serde",
"serde_json",
"structopt",
"tempfile",
"tokio",
"toml",
"uuid",
"wiremock",
]
[[package]]
name = "lychee-lib"
version = "0.6.0"
dependencies = [
"check-if-email-exists",
"deadpool",
"derive_builder",
"doc-comment",
"fast_chemail",
"futures",
"glob",
"headers",
"html5ever",
"http",
"hubcaps",
"indicatif",
"lazy_static",
"linkify",
"markup5ever",
"markup5ever_rcdom",
"openssl-sys",
"pad",
"predicates",
"pretty_assertions",
"pulldown-cmark",
"regex",
"reqwest",
"ring",
"serde",
"serde_json",
"shellexpand",
"structopt",
"tempfile",
"tokio",
"toml",
"url",
"uuid",
"wiremock",
]
@ -1938,9 +1955,9 @@ checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
[[package]]
name = "proc-macro2"
version = "1.0.24"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
dependencies = [
"unicode-xid",
]
@ -2065,9 +2082,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.5"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9"
checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041"
dependencies = [
"bitflags",
]
@ -2304,9 +2321,9 @@ dependencies = [
[[package]]
name = "signal-hook"
version = "0.3.7"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aa894ef3fade0ee7243422f4fbbd6c2b48e6de767e621d37ef65f2310f53cea"
checksum = "ef33d6d0cd06e0840fba9985aab098c147e67e05cee14d412d3345ed14ff30ac"
dependencies = [
"libc",
"signal-hook-registry",
@ -2452,9 +2469,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.65"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3a1d708c221c5a612956ef9f75b37e454e88d1f7b899fbd3a18d4252012d663"
checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb"
dependencies = [
"proc-macro2",
"quote",
@ -2547,9 +2564,9 @@ dependencies = [
[[package]]
name = "tinyvec"
version = "1.1.1"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "317cca572a0e89c3ce0ca1f1bdc9369547fe318a683418e42ac8f59d14701023"
checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342"
dependencies = [
"tinyvec_macros",
]
@ -2615,9 +2632,9 @@ dependencies = [
[[package]]
name = "tokio-util"
version = "0.6.5"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f"
checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e"
dependencies = [
"bytes",
"futures-core",
@ -2735,9 +2752,9 @@ dependencies = [
[[package]]
name = "unicode-bidi"
version = "0.3.4"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5"
checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0"
dependencies = [
"matches",
]
@ -2875,9 +2892,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasm-bindgen"
version = "0.2.72"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
dependencies = [
"cfg-if",
"serde",
@ -2887,9 +2904,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.72"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "046ceba58ff062da072c7cb4ba5b22a37f00a302483f7e2a6cdc18fedbdc1fd3"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
dependencies = [
"bumpalo",
"lazy_static",
@ -2902,9 +2919,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.22"
version = "0.4.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
dependencies = [
"cfg-if",
"js-sys",
@ -2914,9 +2931,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.72"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ef9aa01d36cda046f797c57959ff5f3c615c9cc63997a8d545831ec7976819b"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -2924,9 +2941,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.72"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96eb45c1b2ee33545a813a92dbb53856418bf7eb54ab34f7f7ff1448a5b3735d"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
dependencies = [
"proc-macro2",
"quote",
@ -2937,15 +2954,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.72"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7148f4696fb4960a346eaa60bbfb42a1ac4ebba21f750f75fc1375b098d5ffa"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
[[package]]
name = "web-sys"
version = "0.3.49"
version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fe19d70f5dacc03f6e46777213facae5ac3801575d56ca6cbd4c93dcd12310"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
dependencies = [
"js-sys",
"wasm-bindgen",

View file

@ -1,75 +1,9 @@
[package]
authors = ["Matthias Endler <matthias@endler.dev>"]
description = "A glorious link checker"
documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md"
edition = "2018"
homepage = "https://github.com/lycheeverse/lychee"
keywords = [
"link",
"checker",
"cli",
"link-checker",
"validator",
[workspace]
members = [
"lychee-bin",
"lychee-lib",
]
license = "Apache-2.0/MIT"
name = "lychee"
repository = "https://github.com/lycheeverse/lychee"
version = "0.6.0"
[dependencies]
anyhow = "1.0.38"
futures = "0.3.14"
glob = "0.3.0"
http = "0.2.4"
hubcaps = "0.6.2"
linkify = "0.6.0"
regex = "1.4.5"
url = "2.2.1"
check-if-email-exists = "0.8.21"
indicatif = "0.15.0"
structopt = "0.3.21"
toml = "0.5.8"
serde = { version = "1.0.124", features = ["derive"] }
pulldown-cmark = "0.8.0"
html5ever = "0.25.1"
markup5ever = "0.10.0"
markup5ever_rcdom = "0.1.0"
headers = "0.3.4"
derive_builder = "0.10.0"
deadpool = "0.7.0"
shellexpand = "2.1.0"
lazy_static = "1.4.0"
wiremock = "0.5.1"
openssl-sys = "0.9.61"
serde_json = "1.0.64"
# Make build work on Apple Silicon.
# See https://github.com/briansmith/ring/issues/1163
# This is necessary for the homebrew build
# https://github.com/Homebrew/homebrew-core/pull/70216
ring = "0.16.20"
pad = "0.1.6"
console = "0.14.1"
fast_chemail = "0.9.6"
[dependencies.reqwest]
features = ["gzip"]
version = "0.11.3"
[dependencies.tokio]
features = ["full"]
version = "1.5.0"
[patch.crates-io]
# Switch back to version on crates.io after 0.6.3+ is released
hubcaps = { git="https://github.com/softprops/hubcaps.git" }
[dev-dependencies]
assert_cmd = "1.0.3"
predicates = "1.0.7"
uuid = { version = "0.8.2", features = ["v4"] }
tempfile = "3.2.0"
doc-comment = "0.3.3"
pretty_assertions = "0.7.1"
[features]
vendored-openssl = ["openssl-sys/vendored"]

View file

@ -3,8 +3,8 @@
![Rust](https://github.com/hello-rust/lychee/workflows/Rust/badge.svg)
[![docs.rs](https://docs.rs/lychee/badge.svg)](https://docs.rs/lychee)
⚡ A fast, async, resource-friendly link checker written in Rust.
Finds broken hyperlinks and mail addresses inside Markdown, HTML, reStructuredText, or any other text file or website!
⚡ A fast, async, resource-friendly link checker written in Rust.\\
Finds broken hyperlinks and mail addresses inside Markdown, HTML, reStructuredText, or any other text file or website!
Available as a CLI utility and as a GitHub Action: [lycheeverse/lychee-action](https://github.com/lycheeverse/lychee-action).
@ -208,11 +208,11 @@ You can use lychee as a library for your own projects.
Here is a "hello world" example:
```rust
use std::error::Error;
use lychee_lib::Result;
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let response = lychee::check("https://github.com/lycheeverse/lychee").await?;
async fn main() -> Result<()> {
let response = lychee_lib::check("https://github.com/lycheeverse/lychee").await?;
println!("{}", response);
Ok(())
}
@ -221,22 +221,21 @@ async fn main() -> Result<(), Box<dyn Error>> {
This is equivalent to the following snippet, in which we build our own client:
```rust
use lychee::{ClientBuilder, Status};
use std::error::Error;
use lychee_lib::{ClientBuilder, Result, Status};
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
async fn main() -> Result<()> {
let client = ClientBuilder::default().build()?;
let response = client.check("https://github.com/lycheeverse/lychee").await?;
assert!(matches!(response.status, Status::Ok(_)));
assert!(response.status().is_success());
Ok(())
}
```
The client builder is very customizable:
```rust,ignore
let client = lychee::ClientBuilder::default()
```rust, ignore
let client = lychee_lib::ClientBuilder::default()
.includes(includes)
.excludes(excludes)
.max_redirects(cfg.max_redirects)

51
lychee-bin/Cargo.toml Normal file
View file

@ -0,0 +1,51 @@
[package]
name = "lychee"
authors = ["Matthias Endler <matthias@endler.dev>"]
description = "A glorious link checker"
documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md"
edition = "2018"
homepage = "https://github.com/lycheeverse/lychee"
keywords = [
"link",
"checker",
"cli",
"link-checker",
"validator",
]
license = "Apache-2.0/MIT"
repository = "https://github.com/lycheeverse/lychee"
version = "0.6.0"
[dependencies]
lychee-lib = { path = "../lychee-lib", version = "0.6.0" }
anyhow = "1.0.40"
console = "0.14.1"
headers = "0.3.4"
http = "0.2.4"
indicatif = "0.15.0"
lazy_static = "1.4.0"
openssl-sys = "0.9.61"
pad = "0.1.6"
regex = "1.4.5"
reqwest = { version = "0.11.3", features = ["gzip"] }
# Make build work on Apple Silicon.
# See https://github.com/briansmith/ring/issues/1163
# This is necessary for the homebrew build
# https://github.com/Homebrew/homebrew-core/pull/70216
ring = "0.16.20"
serde = { version = "1.0.125", features = ["derive"] }
serde_json = "1.0.64"
structopt = "0.3.21"
tokio = { version = "1.5.0", features = ["full"] }
toml = "0.5.8"
[dev-dependencies]
assert_cmd = "1.0.3"
predicates = "1.0.7"
pretty_assertions = "0.7.1"
tempfile = "3.2.0"
uuid = { version = "0.8.2", features = ["v4"] }
wiremock = "0.5.2"
[features]
vendored-openssl = ["openssl-sys/vendored"]

201
lychee-bin/LICENSE-APACHE Normal file
View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 The lychee maintainers
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

23
lychee-bin/LICENSE-MIT Normal file
View file

@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View file

@ -1,23 +1,43 @@
#![warn(clippy::all, clippy::pedantic)]
#![warn(
absolute_paths_not_starting_with_crate,
invalid_html_tags,
missing_copy_implementations,
missing_debug_implementations,
semicolon_in_expressions_from_macros,
unreachable_pub,
unused_extern_crates,
variant_size_differences,
clippy::missing_const_for_fn
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
// required for apple silicon
use ring as _;
use std::{collections::HashSet, fs, str::FromStr, time::Duration};
use anyhow::{anyhow, Context, Result};
use headers::authorization::Basic;
use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName};
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
use http::StatusCode;
use indicatif::{ProgressBar, ProgressStyle};
use options::Format;
use lychee_lib::{
collector::{collect_links, Input},
ClientBuilder, ClientPool, Response,
};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use stats::color_response;
use std::{collections::HashSet, time::Duration};
use std::{fs, str::FromStr};
use ring as _; // required for apple silicon
use structopt::StructOpt;
use tokio::sync::mpsc;
mod options;
mod stats;
use crate::options::{Config, LycheeOptions};
use crate::stats::ResponseStats;
use lychee::collector::{self, Input};
use lychee::{ClientBuilder, ClientPool, Response};
use crate::{
options::{Config, Format, LycheeOptions},
stats::{color_response, ResponseStats},
};
/// A C-like enum that can be cast to `i32` and used as process exit code.
enum ExitCode {
@ -64,7 +84,7 @@ fn run_main() -> Result<i32> {
}
fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbose: bool) {
let out = color_response(response);
let out = color_response(&response.1);
if let Some(pb) = progress_bar {
pb.inc(1);
pb.set_message(&out);
@ -72,7 +92,7 @@ fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbos
pb.println(out);
}
} else {
if (response.status.is_success() || response.status.is_excluded()) && !verbose {
if (response.status().is_success() || response.status().is_excluded()) && !verbose {
return;
}
println!("{}", out);
@ -117,26 +137,27 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.github_token(cfg.github_token.clone())
.scheme(cfg.scheme.clone())
.accepted(accepted)
.build()?;
.build()
.map_err(|e| anyhow!(e))?;
let links = collector::collect_links(
let links = collect_links(
&inputs,
cfg.base_url.clone(),
cfg.skip_missing,
max_concurrency,
)
.await?;
.await
.map_err(|e| anyhow!(e))?;
let pb = match cfg.no_progress {
true => None,
false => {
let bar = ProgressBar::new(links.len() as u64)
.with_style(ProgressStyle::default_bar().template(
let pb = if cfg.no_progress {
None
} else {
let bar =
ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template(
"{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}",
));
bar.enable_steady_tick(100);
Some(bar)
}
bar.enable_steady_tick(100);
Some(bar)
};
let (send_req, recv_req) = mpsc::channel(max_concurrency);
@ -154,9 +175,9 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
}
});
// Start receiving requests
tokio::spawn(async move {
// Start receiving requests
let clients: Vec<_> = (0..max_concurrency).map(|_| client.clone()).collect();
let clients = vec![client; max_concurrency];
let mut clients = ClientPool::new(send_resp, recv_req, clients);
clients.listen().await;
});
@ -184,9 +205,10 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
println!("{}", stats_formatted);
}
match stats.is_success() {
true => Ok(ExitCode::Success as i32),
false => Ok(ExitCode::LinkCheckFailure as i32),
if stats.is_success() {
Ok(ExitCode::Success as i32)
} else {
Ok(ExitCode::LinkCheckFailure as i32)
}
}
@ -201,7 +223,7 @@ fn read_header(input: &str) -> Result<(String, String)> {
Ok((elements[0].into(), elements[1].into()))
}
/// Converts a timeout given in whole seconds into a [`Duration`].
///
/// `const` so it can be evaluated at compile time where needed.
const fn parse_timeout(timeout: usize) -> Duration {
    Duration::from_secs(timeout as u64)
}
@ -217,10 +239,10 @@ fn parse_headers<T: AsRef<str>>(headers: &[T]) -> Result<HeaderMap> {
Ok(out)
}
fn parse_statuscodes<T: AsRef<str>>(accept: T) -> Result<HashSet<http::StatusCode>> {
fn parse_statuscodes<T: AsRef<str>>(accept: T) -> Result<HashSet<StatusCode>> {
let mut statuscodes = HashSet::new();
for code in accept.as_ref().split(',').into_iter() {
let code: reqwest::StatusCode = reqwest::StatusCode::from_bytes(code.as_bytes())?;
for code in accept.as_ref().split(',') {
let code: StatusCode = StatusCode::from_bytes(code.as_bytes())?;
statuscodes.insert(code);
}
Ok(statuscodes)
@ -239,12 +261,15 @@ fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
use std::{array, collections::HashSet};
use headers::{HeaderMap, HeaderMapExt};
use http::StatusCode;
use pretty_assertions::assert_eq;
use reqwest::header;
use super::{parse_basic_auth, parse_headers, parse_statuscodes};
#[test]
fn test_parse_custom_headers() {
let mut custom = HeaderMap::new();
@ -255,14 +280,13 @@ mod test {
#[test]
fn test_parse_statuscodes() {
let actual = parse_statuscodes("200,204,301").unwrap();
let expected: HashSet<StatusCode> = [
let expected = array::IntoIter::new([
StatusCode::OK,
StatusCode::NO_CONTENT,
StatusCode::MOVED_PERMANENTLY,
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<_>>();
assert_eq!(actual, expected);
}
@ -277,6 +301,7 @@ mod test {
let mut actual = HeaderMap::new();
let auth_header = parse_basic_auth("aladin:abretesesamo").unwrap();
actual.typed_insert(auth_header);
assert_eq!(expected, actual);
}
}

View file

@ -1,20 +1,27 @@
use lychee::collector::Input;
use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
use lychee_lib::collector::Input;
use serde::Deserialize;
use std::str::FromStr;
use std::{fs, io::ErrorKind, path::PathBuf};
use structopt::{clap::crate_version, StructOpt};
pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!());
const METHOD: &str = "get";
const TIMEOUT: usize = 20;
const MAX_CONCURRENCY: usize = 128;
const MAX_REDIRECTS: usize = 10;
const USER_AGENT: &str = concat!("lychee/", crate_version!());
// this exists because structopt requires `&str` type values for defaults
// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde)
lazy_static! {
static ref TIMEOUT_STR: String = TIMEOUT.to_string();
static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string();
static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string();
}
#[derive(Debug, Deserialize)]
pub enum Format {
pub(crate) enum Format {
String,
Json,
}
@ -36,18 +43,11 @@ impl Default for Format {
}
}
// this exists because structopt requires `&str` type values for defaults
// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde)
lazy_static! {
static ref TIMEOUT_STR: String = TIMEOUT.to_string();
static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string();
static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string();
}
// Macro for generating default functions to be used by serde
macro_rules! default_function {
( $( $name:ident : $T:ty = $e:expr; )* ) => {
$(
#[allow(clippy::missing_const_for_fn)]
fn $name() -> $T {
$e
}
@ -90,10 +90,10 @@ pub(crate) struct LycheeOptions {
/// Configuration file to use
#[structopt(short, long = "config", default_value = "./lychee.toml")]
pub config_file: String,
pub(crate) config_file: String,
#[structopt(flatten)]
pub config: Config,
pub(crate) config: Config,
}
impl LycheeOptions {
@ -109,142 +109,143 @@ impl LycheeOptions {
}
}
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Deserialize, StructOpt)]
pub struct Config {
pub(crate) struct Config {
/// Verbose program output
#[structopt(short, long)]
#[serde(default)]
pub verbose: bool,
pub(crate) verbose: bool,
/// Do not show progress bar.
/// This is recommended for non-interactive shells (e.g. for continuous
/// integration)
#[structopt(short, long)]
#[serde(default)]
pub no_progress: bool,
pub(crate) no_progress: bool,
/// Maximum number of allowed redirects
#[structopt(short, long, default_value = &MAX_REDIRECTS_STR)]
#[serde(default = "max_redirects")]
pub max_redirects: usize,
pub(crate) max_redirects: usize,
/// Maximum number of concurrent network requests
#[structopt(long, default_value = &MAX_CONCURRENCY_STR)]
#[serde(default = "max_concurrency")]
pub max_concurrency: usize,
pub(crate) max_concurrency: usize,
/// Number of threads to utilize.
/// Defaults to number of cores available to the system
#[structopt(short = "T", long)]
#[serde(default)]
pub threads: Option<usize>,
pub(crate) threads: Option<usize>,
/// User agent
#[structopt(short, long, default_value = USER_AGENT)]
#[serde(default = "user_agent")]
pub user_agent: String,
pub(crate) user_agent: String,
/// Proceed for server connections considered insecure (invalid TLS)
#[structopt(short, long)]
#[serde(default)]
pub insecure: bool,
pub(crate) insecure: bool,
/// Only test links with the given scheme (e.g. https)
#[structopt(short, long)]
#[serde(default)]
pub scheme: Option<String>,
pub(crate) scheme: Option<String>,
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]
pub include: Vec<String>,
pub(crate) include: Vec<String>,
/// Exclude URLs from checking (supports regex)
#[structopt(long)]
#[serde(default)]
pub exclude: Vec<String>,
pub(crate) exclude: Vec<String>,
/// Exclude all private IPs from checking.
/// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback`
#[structopt(short = "E", long)]
#[serde(default)]
pub exclude_all_private: bool,
pub(crate) exclude_all_private: bool,
/// Exclude private IP address ranges from checking
#[structopt(long)]
#[serde(default)]
pub exclude_private: bool,
pub(crate) exclude_private: bool,
/// Exclude link-local IP address range from checking
#[structopt(long)]
#[serde(default)]
pub exclude_link_local: bool,
pub(crate) exclude_link_local: bool,
/// Exclude loopback IP address range from checking
#[structopt(long)]
#[serde(default)]
pub exclude_loopback: bool,
pub(crate) exclude_loopback: bool,
/// Exclude all mail addresses from checking
#[structopt(long)]
#[serde(default)]
pub exclude_mail: bool,
pub(crate) exclude_mail: bool,
/// Custom request headers
#[structopt(short, long)]
#[serde(default)]
pub headers: Vec<String>,
pub(crate) headers: Vec<String>,
/// Comma-separated list of accepted status codes for valid links
#[structopt(short, long)]
#[serde(default)]
pub accept: Option<String>,
pub(crate) accept: Option<String>,
/// Website timeout from connect to response finished
#[structopt(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
pub timeout: usize,
pub(crate) timeout: usize,
/// Request method
// Using `-X` as a short param similar to curl
#[structopt(short = "X", long, default_value = METHOD)]
#[serde(default = "method")]
pub method: String,
pub(crate) method: String,
/// Base URL to check relative URLs
#[structopt(short, long)]
#[serde(default)]
pub base_url: Option<String>,
pub(crate) base_url: Option<String>,
/// Basic authentication support. E.g. `username:password`
#[structopt(long)]
#[serde(default)]
pub basic_auth: Option<String>,
pub(crate) basic_auth: Option<String>,
/// GitHub API token to use when checking github.com links, to avoid rate limiting
#[structopt(long, env = "GITHUB_TOKEN")]
#[serde(default)]
pub github_token: Option<String>,
pub(crate) github_token: Option<String>,
/// Skip missing input files (default is to error if they don't exist)
#[structopt(long)]
#[serde(default)]
pub skip_missing: bool,
pub(crate) skip_missing: bool,
/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
pub glob_ignore_case: bool,
pub(crate) glob_ignore_case: bool,
/// Output file of status report
#[structopt(short, long, parse(from_os_str))]
#[serde(default)]
pub output: Option<PathBuf>,
pub(crate) output: Option<PathBuf>,
/// Output file format of status report (json, string)
#[structopt(short, long, default_value = "string")]
#[serde(default)]
pub format: Format,
pub(crate) format: Format,
}
impl Config {

194
lychee-bin/src/stats.rs Normal file
View file

@ -0,0 +1,194 @@
use std::{
collections::{HashMap, HashSet},
fmt::{self, Display},
};
use console::style;
use lychee_lib::{Input, Response, ResponseBody, Status};
use pad::{Alignment, PadStr};
use serde::Serialize;
// Maximum padding for each entry in the final statistics output
const MAX_PADDING: usize = 20;
pub(crate) fn color_response(response: &ResponseBody) -> String {
let out = match response.status {
Status::Ok(_) => style(response).green().bright(),
Status::Redirected(_) => style(response),
Status::Excluded => style(response).dim(),
Status::Timeout(_) => style(response).yellow().bright(),
Status::Error(_) => style(response).red().bright(),
};
out.to_string()
}
/// Aggregated outcome counters for a link-checking run, plus the failed
/// responses grouped by the input they came from.
#[derive(Default, Serialize)]
pub(crate) struct ResponseStats {
    // Total number of responses recorded, regardless of outcome.
    total: usize,
    // Responses whose status was `Status::Ok`.
    successful: usize,
    // Responses whose status was `Status::Error`.
    failures: usize,
    // Responses whose status was `Status::Timeout`.
    timeouts: usize,
    // Responses whose status was `Status::Redirected`.
    redirects: usize,
    // Responses whose status was `Status::Excluded`.
    excludes: usize,
    // NOTE(review): never incremented by `add` in this file; only summed
    // with `failures` in `Display::fmt` — confirm it is still needed.
    errors: usize,
    // Errors, timeouts, and redirects keyed by their originating input.
    fail_map: HashMap<Input, HashSet<ResponseBody>>,
}
impl ResponseStats {
#[inline]
pub(crate) fn new() -> Self {
Self::default()
}
pub(crate) fn add(&mut self, response: Response) {
let Response(source, ResponseBody { ref status, .. }) = response;
self.total += 1;
match status {
Status::Ok(_) => self.successful += 1,
Status::Error(_) => self.failures += 1,
Status::Timeout(_) => self.timeouts += 1,
Status::Redirected(_) => self.redirects += 1,
Status::Excluded => self.excludes += 1,
}
if matches!(
status,
Status::Error(_) | Status::Timeout(_) | Status::Redirected(_)
) {
let fail = self.fail_map.entry(source).or_default();
fail.insert(response.1);
};
}
#[inline]
pub(crate) const fn is_success(&self) -> bool {
self.total == self.successful + self.excludes
}
#[inline]
pub(crate) const fn is_empty(&self) -> bool {
self.total == 0
}
}
/// Writes one summary line: the title followed by the count, right-aligned
/// and padded with dots so all value columns line up at `MAX_PADDING`.
fn write_stat(f: &mut fmt::Formatter, title: &str, stat: usize, newline: bool) -> fmt::Result {
    // Pad relative to the title's character count (not byte length),
    // since the titles contain multi-byte emoji.
    let width = MAX_PADDING - title.chars().count();
    let value = stat.to_string().pad(width, '.', Alignment::Right, false);
    write!(f, "{}{}", title, value)?;
    if newline {
        f.write_str("\n")?;
    }
    Ok(())
}
impl Display for ResponseStats {
    /// Renders the emoji summary table, then a per-input listing of the
    /// failed responses (errors, timeouts, and redirects) collected in
    /// `fail_map`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let separator = "-".repeat(MAX_PADDING + 1);

        writeln!(f, "\u{1f4dd} Summary")?; // 📝
        writeln!(f, "{}", separator)?;
        write_stat(f, "\u{1f50d} Total", self.total, true)?; // 🔍
        write_stat(f, "\u{2705} Successful", self.successful, true)?; // ✅
        write_stat(f, "\u{23f3} Timeouts", self.timeouts, true)?; // ⏳
        write_stat(f, "\u{1f500} Redirected", self.redirects, true)?; // 🔀
        write_stat(f, "\u{1f47b} Excluded", self.excludes, true)?; // 👻
        // Last stat line gets no trailing newline; the failure listing
        // below supplies its own leading newlines.
        write_stat(f, "\u{1f6ab} Errors", self.errors + self.failures, false)?; // 🚫

        for (input, responses) in &self.fail_map {
            // Using leading newlines over trailing ones (e.g. `writeln!`)
            // lets us avoid extra newlines without any additional logic.
            write!(f, "\n\nErrors in {}", input)?;
            for response in responses {
                write!(f, "\n{}", color_response(response))?
            }
        }

        Ok(())
    }
}
#[cfg(test)]
mod test {
    use std::collections::{HashMap, HashSet};

    use http::StatusCode;
    use lychee_lib::{ClientBuilder, Input, Response, ResponseBody, Status, Uri};
    use pretty_assertions::assert_eq;
    use reqwest::Url;
    use wiremock::{matchers::path, Mock, MockServer, ResponseTemplate};

    use super::ResponseStats;

    /// Builds a `Uri` from a string, panicking on invalid input.
    fn website(url: &str) -> Uri {
        Uri::from(Url::parse(url).expect("Expected valid Website URI"))
    }

    /// Starts a local mock server answering `/` with `status_code` and
    /// returns the checked `Response` for its root URL.
    async fn get_mock_status_response<S>(status_code: S) -> Response
    where
        S: Into<StatusCode>,
    {
        let mock_server = MockServer::start().await;

        let template = ResponseTemplate::new(status_code.into());
        Mock::given(path("/"))
            .respond_with(template)
            .mount(&mock_server)
            .await;

        ClientBuilder::default()
            .build()
            .unwrap()
            .check(mock_server.uri())
            .await
            .unwrap()
    }

    #[test]
    fn test_stats_is_empty() {
        let mut stats = ResponseStats::new();
        assert!(stats.is_empty());

        // Any recorded response — even a successful one — makes it non-empty.
        stats.add(Response(
            Input::Stdin,
            ResponseBody {
                uri: website("http://example.org/ok"),
                status: Status::Ok(StatusCode::OK),
            },
        ));
        assert!(!stats.is_empty());
    }

    #[tokio::test]
    async fn test_stats() {
        // "stata": the set of status codes under test (one per outcome class).
        let stata = [
            StatusCode::OK,
            StatusCode::PERMANENT_REDIRECT,
            StatusCode::BAD_GATEWAY,
        ];

        let mut stats = ResponseStats::new();
        for status in &stata {
            stats.add(get_mock_status_response(status).await);
        }

        // Only redirect and client/server-error responses should appear
        // in the fail map.
        let mut expected_map: HashMap<Input, HashSet<ResponseBody>> = HashMap::new();
        for status in &stata {
            if status.is_server_error() || status.is_client_error() || status.is_redirection() {
                let Response(input, response_body) = get_mock_status_response(status).await;
                let entry = expected_map.entry(input).or_default();
                entry.insert(response_body);
            }
        }

        assert_eq!(stats.fail_map, expected_map);
    }
}

View file

@ -1,106 +1,161 @@
#[cfg(test)]
mod cli {
use pretty_assertions::assert_eq;
use std::{
fs::{self, File},
io::Write,
path::{Path, PathBuf},
};
use anyhow::Result;
use assert_cmd::Command;
use lychee::test_utils;
use http::StatusCode;
use lychee_lib::Result;
use predicates::str::contains;
use std::fs::{self, File};
use std::io::Write;
use std::path::{Path, PathBuf};
use pretty_assertions::assert_eq;
use uuid::Uuid;
// Spins up a wiremock server answering GET requests with `$status`.
// Optional trailing `$func(args)` calls further configure the
// `ResponseTemplate` before mounting. Expands to the bound `MockServer`.
// Must be invoked inside an async context (it `.await`s).
macro_rules! mock_server {
    ($status:expr $(, $func:tt ($($arg:expr),*))*) => {{
        let mock_server = wiremock::MockServer::start().await;
        let template = wiremock::ResponseTemplate::new(http::StatusCode::from($status));
        let template = template$(.$func($($arg),*))*;
        wiremock::Mock::given(wiremock::matchers::method("GET")).respond_with(template).mount(&mock_server).await;
        mock_server
    }};
}
/// Builds a `Command` for this package's main binary (e.g. `lychee`).
fn main_command() -> Command {
    let bin_name = env!("CARGO_PKG_NAME");
    Command::cargo_bin(bin_name).expect("Couldn't get cargo package name")
}
fn fixtures_path() -> PathBuf {
Path::new(module_path!()).parent().unwrap().join("fixtures")
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.join("fixtures")
}
/// A plain mirror of the library's `ResponseStats`, used to construct the
/// JSON the CLI is expected to emit without depending on the library type.
#[derive(Default)]
struct MockResponseStats {
    total: usize,
    successful: usize,
    failures: usize,
    timeouts: usize,
    redirects: usize,
    excludes: usize,
    errors: usize,
}

impl MockResponseStats {
    /// Renders the stats as a JSON document with an empty `fail_map`.
    ///
    /// NOTE(review): tests compare this literal byte-for-byte against the
    /// CLI's JSON report — confirm its whitespace matches the
    /// serializer's pretty-print format exactly.
    fn to_json_str(&self) -> String {
        format!(
            r#"{{
"total": {},
"successful": {},
"failures": {},
"timeouts": {},
"redirects": {},
"excludes": {},
"errors": {},
"fail_map": {{}}
}}"#,
            self.total,
            self.successful,
            self.failures,
            self.timeouts,
            self.redirects,
            self.excludes,
            self.errors
        )
    }
}
// Runs the CLI against the fixture `$test_file` with any extra `$arg`s,
// writing a JSON report to a uniquely-named output file, then asserts
// the report equals `$expected` (a `MockResponseStats`) and removes the
// file. Expands to a `Result<()>`, so it must be the tail expression of
// a test returning `Result<()>`.
macro_rules! test_json_output {
    ($test_file:expr, $expected:expr $(, $arg:expr)*) => {{
        let mut cmd = main_command();
        let test_path = fixtures_path().join($test_file);

        // Unique filename so parallel tests don't clobber each other's output.
        let outfile = format!("{}.json", uuid::Uuid::new_v4());

        let expected = $expected.to_json_str();

        cmd$(.arg($arg))*.arg("--output").arg(&outfile).arg("--format").arg("json").arg(test_path).assert().success();

        let output = std::fs::read_to_string(&outfile)?;
        assert_eq!(output, expected);

        std::fs::remove_file(outfile)?;
        Ok(())
    }};
}
/// `--exclude-all-private` should exclude every link in the fixture
/// (all 7 are private/link-local/loopback), leaving nothing to check.
#[test]
fn test_exclude_all_private() -> Result<()> {
    test_json_output!(
        "TEST_ALL_PRIVATE.md",
        MockResponseStats {
            total: 7,
            excludes: 7,
            ..MockResponseStats::default()
        },
        "--exclude-all-private",
        "--verbose"
    )
}
/// `--exclude-mail` should exclude the 4 mail addresses in the fixture
/// while the remaining 2 links check successfully.
#[test]
fn test_exclude_email() -> Result<()> {
    test_json_output!(
        "TEST_EMAIL.md",
        MockResponseStats {
            total: 6,
            excludes: 4,
            successful: 2,
            ..MockResponseStats::default()
        },
        "--exclude-mail"
    )
}
/// Test that a GitHub link can be checked without specifying the token.
#[test]
fn test_check_github_no_token() -> Result<()> {
    test_json_output!(
        "TEST_GITHUB.md",
        MockResponseStats {
            total: 1,
            successful: 1,
            ..MockResponseStats::default()
        }
    )
}
/// Both quirky URLs in the fixture should resolve successfully.
#[test]
fn test_quirks() -> Result<()> {
    test_json_output!(
        "TEST_QUIRKS.txt",
        MockResponseStats {
            total: 2,
            successful: 2,
            ..MockResponseStats::default()
        }
    )
}
#[tokio::test]
async fn test_failure_404_link() {
let mut cmd = main_command();
let mock_server = test_utils::get_mock_server(http::StatusCode::NOT_FOUND).await;
let dir = tempfile::tempdir().expect("Failed to create tempdir");
async fn test_failure_404_link() -> Result<()> {
let mock_server = mock_server!(StatusCode::NOT_FOUND);
let dir = tempfile::tempdir()?;
let file_path = dir.path().join("test.txt");
let mut file = File::create(&file_path).expect("Failed to create tempfile");
writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file");
let mut file = File::create(&file_path)?;
writeln!(file, "{}", mock_server.uri())?;
let mut cmd = main_command();
cmd.arg(file_path)
.write_stdin(mock_server.uri())
.assert()
.failure()
.code(2);
Ok(())
}
#[test]
@ -121,7 +176,7 @@ mod cli {
#[tokio::test]
async fn test_stdin_input() {
let mut cmd = main_command();
let mock_server = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server = mock_server!(StatusCode::OK);
cmd.arg("-")
.write_stdin(mock_server.uri())
@ -132,8 +187,7 @@ mod cli {
#[tokio::test]
async fn test_stdin_input_failure() {
let mut cmd = main_command();
let mock_server =
test_utils::get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await;
let mock_server = mock_server!(StatusCode::INTERNAL_SERVER_ERROR);
cmd.arg("-")
.write_stdin(mock_server.uri())
@ -145,8 +199,8 @@ mod cli {
#[tokio::test]
async fn test_stdin_input_multiple() {
let mut cmd = main_command();
let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_a = mock_server!(StatusCode::OK);
let mock_server_b = mock_server!(StatusCode::OK);
// this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools
// that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc.
@ -168,7 +222,7 @@ mod cli {
.failure()
.code(1)
.stderr(contains(format!(
"Error: Failed to read file: `{}`",
"Error: Failed to read file: `{}`, reason: No such file or directory (os error 2)",
filename
)));
}
@ -187,8 +241,8 @@ mod cli {
let mut cmd = main_command();
let dir = tempfile::tempdir()?;
let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_a = mock_server!(StatusCode::OK);
let mock_server_b = mock_server!(StatusCode::OK);
let mut file_a = File::create(dir.path().join("a.md"))?;
let mut file_b = File::create(dir.path().join("b.md"))?;
@ -210,8 +264,8 @@ mod cli {
let mut cmd = main_command();
let dir = tempfile::tempdir()?;
let mock_server_a = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_b = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server_a = mock_server!(StatusCode::OK);
let mock_server_b = mock_server!(StatusCode::OK);
let mut file_a = File::create(dir.path().join("README.md"))?;
let mut file_b = File::create(dir.path().join("readme.md"))?;
@ -236,7 +290,7 @@ mod cli {
let subdir_level_1 = tempfile::tempdir_in(&dir)?;
let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?;
let mock_server = test_utils::get_mock_server(http::StatusCode::OK).await;
let mock_server = mock_server!(StatusCode::OK);
let mut file = File::create(subdir_level_2.path().join("test.md"))?;
writeln!(file, "{}", mock_server.uri().as_str())?;
@ -266,7 +320,7 @@ mod cli {
.assert()
.success();
let expected = r##"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"##;
let expected = r#"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;

View file

@ -1,11 +1,13 @@
#[cfg(test)]
mod readme {
use pretty_assertions::assert_eq;
use std::{
fs::File,
io::{BufReader, Read},
path::Path,
};
use assert_cmd::Command;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
use pretty_assertions::assert_eq;
fn main_command() -> Command {
// this gets the "main" binary name (e.g. `lychee`)
@ -13,7 +15,7 @@ mod readme {
}
fn load_readme_text() -> String {
let readme_path = Path::new(module_path!())
let readme_path = Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.join("README.md");
@ -38,7 +40,7 @@ mod readme {
fn test_readme_usage_up_to_date() {
let mut cmd = main_command();
let result = cmd.arg("--help").assert().success();
let result = cmd.env_clear().arg("--help").assert().success();
let help_output = std::str::from_utf8(&result.get_output().stdout)
.expect("Invalid utf8 output for `--help`");
let readme = load_readme_text();

51
lychee-lib/Cargo.toml Normal file
View file

@ -0,0 +1,51 @@
[package]
name = "lychee-lib"
authors = ["Matthias Endler <matthias@endler.dev>"]
description = "A glorious link checker"
documentation = "https://github.com/lycheeverse/lychee/blob/master/README.md"
edition = "2018"
homepage = "https://github.com/lycheeverse/lychee"
keywords = [
"link",
"checker",
"cli",
"link-checker",
"validator",
]
license = "Apache-2.0/MIT"
repository = "https://github.com/lycheeverse/lychee"
version = "0.6.0"
[dependencies]
check-if-email-exists = "0.8.21"
deadpool = "0.7.0"
derive_builder = "0.10.0"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.4"
hubcaps = "0.6.2"
linkify = "0.6.0"
markup5ever_rcdom = "0.1.0"
openssl-sys = "0.9.61"
pulldown-cmark = "0.8.0"
regex = "1.4.5"
reqwest = { version = "0.11.3", features = ["gzip"] }
# Make build work on Apple Silicon.
# See https://github.com/briansmith/ring/issues/1163
# This is necessary for the homebrew build
# https://github.com/Homebrew/homebrew-core/pull/70216
ring = "0.16.20"
serde = { version = "1.0.125", features = ["derive"] }
shellexpand = "2.1.0"
tokio = { version = "1.5.0", features = ["full"] }
url = { version = "2.2.1", features = ["serde"] }
[dev-dependencies]
doc-comment = "0.3.3"
pretty_assertions = "0.7.1"
tempfile = "3.2.0"
wiremock = "0.5.2"
[features]
vendored-openssl = ["openssl-sys/vendored"]

201
lychee-lib/LICENSE-APACHE Normal file
View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 The lychee maintainers
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

23
lychee-lib/LICENSE-MIT Normal file
View file

@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

398
lychee-lib/src/client.rs Normal file
View file

@ -0,0 +1,398 @@
#![allow(
clippy::module_name_repetitions,
clippy::struct_excessive_bools,
clippy::default_trait_access
)]
use std::{collections::HashSet, convert::TryFrom, time::Duration};
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
use derive_builder::Builder;
use http::{
header::{HeaderMap, HeaderValue},
StatusCode,
};
use hubcaps::{Credentials, Github};
use regex::RegexSet;
use reqwest::header;
use tokio::time::sleep;
use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
uri::Uri,
ErrorKind, Request, Response, Result, Status,
};
/// Maximum number of redirects to follow before a check is aborted.
const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Default `User-Agent` header value, e.g. `lychee/0.x.y`.
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
/// A link checker holding the HTTP client, the optional Github API client
/// and the filtering/quirk configuration. Construct it via [`ClientBuilder`].
#[derive(Debug, Clone)]
pub struct Client {
    /// Underlying reqwest client instance that handles the HTTP requests.
    reqwest_client: reqwest::Client,
    /// Github client.
    github_client: Option<Github>,
    /// Filtered domain handling.
    filter: Filter,
    /// Default request HTTP method to use.
    method: reqwest::Method,
    /// The set of accepted HTTP status codes for valid URIs.
    accepted: Option<HashSet<StatusCode>>,
    /// Override behavior for certain known issues with URIs.
    quirks: Quirks,
}
/// A link checker using an API token for Github links
/// otherwise a normal HTTP client.
// NOTE: `derive_builder` generates a `ClientBuilder` type from this struct
// where every field becomes `Option<T>` with an `into()` setter. The actual
// `build()` is hand-written below (`build_fn(skip)`).
#[allow(unreachable_pub)]
#[derive(Builder, Debug)]
#[builder(build_fn(skip))]
#[builder(setter(into))]
#[builder(name = "ClientBuilder")]
pub struct ClientBuilderInternal {
    /// Set an optional Github token.
    /// This allows for more requests before
    /// getting rate-limited.
    github_token: Option<String>,
    /// Check links matching this set of regular expressions
    includes: Option<RegexSet>,
    /// Exclude links matching this set of regular expressions
    excludes: Option<RegexSet>,
    /// Exclude all private network addresses
    exclude_all_private: bool,
    /// Exclude private IP addresses
    exclude_private_ips: bool,
    /// Exclude link-local IPs
    exclude_link_local_ips: bool,
    /// Exclude loopback IP addresses (e.g. 127.0.0.1)
    exclude_loopback_ips: bool,
    /// Don't check mail addresses
    exclude_mail: bool,
    /// Maximum number of redirects before returning error
    max_redirects: usize,
    /// User agent used for checking links
    user_agent: String,
    /// Ignore SSL errors
    allow_insecure: bool,
    /// Allowed URI scheme (e.g. https, http).
    /// This excludes all links from checking, which
    /// don't specify that scheme in the URL.
    scheme: Option<String>,
    /// Map of headers to send to each resource.
    /// This allows working around validation issues
    /// on some websites.
    custom_headers: HeaderMap,
    /// Request method (e.g. `GET` or `HEAD`)
    method: reqwest::Method,
    /// Set of accepted return codes / status codes
    accepted: Option<HashSet<StatusCode>>,
    /// Response timeout per request
    timeout: Option<Duration>,
}
impl ClientBuilder {
    /// Assemble the exclusion filter from the individual builder flags.
    fn build_excludes(&self) -> Excludes {
        // The `exclude_all_private` option switches on every "private"
        // exclude at once: private IPs, link-local IPs and loopback IPs.
        let all_private = self.exclude_all_private == Some(true);
        let is_enabled = |flag: Option<bool>| all_private || flag == Some(true);
        Excludes {
            regex: self.excludes.clone().unwrap_or_default(),
            private_ips: is_enabled(self.exclude_private_ips),
            link_local_ips: is_enabled(self.exclude_link_local_ips),
            loopback_ips: is_enabled(self.exclude_loopback_ips),
            mail: self.exclude_mail.unwrap_or_default(),
        }
    }

    /// Assemble the inclusion filter from the builder's regex set.
    fn build_includes(&self) -> Includes {
        Includes {
            regex: self.includes.clone().flatten(),
        }
    }

    /// The build method instantiates the client.
    #[allow(clippy::missing_errors_doc)]
    pub fn build(&self) -> Result<Client> {
        // Faking the user agent is necessary for some websites, unfortunately.
        // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
        let user_agent = self
            .user_agent
            .clone()
            .unwrap_or_else(|| DEFAULT_USER_AGENT.to_owned());

        let mut headers = self.custom_headers.clone().unwrap_or_default();
        headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?);
        headers.insert(
            header::TRANSFER_ENCODING,
            HeaderValue::from_static("chunked"),
        );

        let mut builder = reqwest::ClientBuilder::new()
            .gzip(true)
            .default_headers(headers)
            .danger_accept_invalid_certs(self.allow_insecure.unwrap_or(false))
            .redirect(reqwest::redirect::Policy::limited(
                self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS),
            ));
        // Only set a response timeout when the user configured one;
        // otherwise keep reqwest's default behavior.
        if let Some(timeout) = self.timeout.flatten() {
            builder = builder.timeout(timeout);
        }
        let reqwest_client = builder.build()?;

        // Only create a Github client when a non-empty token was provided.
        let github_client = match self.github_token.clone().flatten() {
            Some(token) if !token.is_empty() => {
                Some(Github::new(user_agent, Credentials::Token(token))?)
            }
            _ => None,
        };

        let filter = Filter::new(
            Some(self.build_includes()),
            Some(self.build_excludes()),
            self.scheme.clone().flatten().map(|s| s.to_lowercase()),
        );

        Ok(Client {
            reqwest_client,
            github_client,
            filter,
            quirks: Quirks::default(),
            method: self.method.clone().unwrap_or(reqwest::Method::GET),
            accepted: self.accepted.clone().unwrap_or_default(),
        })
    }
}
impl Client {
    /// Check a single request.
    ///
    /// The argument is first converted into a [`Request`]; excluded URIs are
    /// short-circuited to `Status::Excluded`, `mailto:` URIs are checked as
    /// mail addresses, everything else is checked as a website.
    ///
    /// # Errors
    ///
    /// Returns an error if the conversion into a `Request` fails.
    pub async fn check<T, E>(&self, request: T) -> Result<Response>
    where
        Request: TryFrom<T, Error = E>,
        ErrorKind: From<E>,
    {
        let Request { uri, source } = Request::try_from(request)?;
        let status = if self.filter.is_excluded(&uri) {
            Status::Excluded
        } else if uri.scheme() == "mailto" {
            self.check_mail(&uri).await
        } else {
            self.check_website(&uri).await
        };
        Ok(Response::new(uri, status, source))
    }

    /// Check a website URI, retrying up to three times with exponential
    /// backoff (1s, 2s, 4s) before falling back to the Github API check.
    pub async fn check_website(&self, uri: &Uri) -> Status {
        let mut retries: i64 = 3;
        let mut wait: u64 = 1;
        let mut status = self.check_default(uri).await;
        while retries > 0 {
            if status.is_success() {
                return status;
            }
            retries -= 1;
            sleep(Duration::from_secs(wait)).await;
            wait *= 2;
            status = self.check_default(uri).await;
        }
        // Bug fix: the result of the *last* retry was previously never
        // inspected after the loop, so a request that succeeded on the final
        // attempt still fell through to the Github fallback below — and
        // without a token was turned into a `MissingGitHubToken` failure.
        if status.is_success() {
            return status;
        }
        // Pull out the heavy weapons in case of a failed normal request.
        // This could be a Github URL and we run into the rate limiter.
        if let Some((owner, repo)) = uri.extract_github() {
            return self.check_github(owner, repo).await;
        }
        status
    }

    /// Check a Github repository via the API. Requires a configured token;
    /// returns `ErrorKind::MissingGitHubToken` otherwise.
    async fn check_github(&self, owner: &str, repo: &str) -> Status {
        match &self.github_client {
            Some(github) => github
                .repo(owner, repo)
                .get()
                .await
                .map_or_else(|e| e.into(), |_| Status::Ok(StatusCode::OK)),
            None => ErrorKind::MissingGitHubToken.into(),
        }
    }

    /// Perform a plain HTTP(S) request for the URI, applying any configured
    /// quirks before execution. Build or transport errors map into `Status`.
    async fn check_default(&self, uri: &Uri) -> Status {
        let request = match self
            .reqwest_client
            .request(self.method.clone(), uri.as_str())
            .build()
        {
            Ok(r) => r,
            Err(e) => return e.into(),
        };
        let request = self.quirks.apply(request);
        match self.reqwest_client.execute(request).await {
            Ok(ref response) => Status::new(response, self.accepted.clone()),
            Err(e) => e.into(),
        }
    }

    /// Check a mail address for reachability via `check-if-email-exists`.
    /// Only `Reachable::Invalid` is treated as a failure; unknown/risky
    /// results pass.
    pub async fn check_mail(&self, uri: &Uri) -> Status {
        let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
        let result = &(check_email(&input).await)[0];
        if let Reachable::Invalid = result.is_reachable {
            ErrorKind::UnreachableEmailAddress(uri.clone()).into()
        } else {
            Status::Ok(StatusCode::OK)
        }
    }
}
/// A convenience function to check a single URI
/// This is the most simple link check and avoids having to create a client manually.
/// For more complex scenarios, look into using the [`ClientBuilder`] instead.
#[allow(clippy::missing_errors_doc)]
pub async fn check<T, E>(request: T) -> Result<Response>
where
    Request: TryFrom<T, Error = E>,
    ErrorKind: From<E>,
{
    // Build a default client and delegate; `Client::check` already
    // returns a `Result`, so no re-wrapping is needed.
    ClientBuilder::default().build()?.check(request).await
}
#[cfg(test)]
mod test {
    use std::time::{Duration, Instant};

    use http::{header::HeaderMap, StatusCode};
    use reqwest::header;

    use super::ClientBuilder;
    use crate::{mock_server, test_utils::get_mock_client_response};

    // A 404 from the mock server must be reported as a failure.
    #[tokio::test]
    async fn test_nonexistent() {
        let mock_server = mock_server!(StatusCode::NOT_FOUND);
        let res = get_mock_client_response(mock_server.uri()).await;
        assert!(res.status().is_failure());
    }

    // A connection to a local address with no listener must fail.
    #[tokio::test]
    async fn test_nonexistent_with_path() {
        let res = get_mock_client_response("http://127.0.0.1/invalid").await;
        assert!(res.status().is_failure());
    }

    // Verifies the retry backoff in `check_website`: three retries waiting
    // 1s + 2s + 4s = 7s total before giving up.
    #[tokio::test]
    async fn test_exponential_backoff() {
        let mock_server = mock_server!(StatusCode::NOT_FOUND);
        let start = Instant::now();
        let res = get_mock_client_response(mock_server.uri()).await;
        let end = start.elapsed();
        assert!(res.status().is_failure());
        // on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs)
        assert!(end.as_secs() >= 7);
        assert!(end.as_secs() <= 8);
    }

    // NOTE(review): hits the real github.com — requires network access.
    #[tokio::test]
    async fn test_github() {
        let res = get_mock_client_response("https://github.com/lycheeverse/lychee").await;
        assert!(res.status().is_success());
    }

    // NOTE(review): hits the real github.com — requires network access.
    #[tokio::test]
    async fn test_github_nonexistent() {
        let res = get_mock_client_response("https://github.com/lycheeverse/not-lychee").await;
        assert!(res.status().is_failure());
    }

    #[tokio::test]
    async fn test_youtube() {
        // This is applying a quirk. See the quirks module.
        let res = get_mock_client_response("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").await;
        assert!(res.status().is_success());
        let res = get_mock_client_response("https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").await;
        assert!(res.status().is_failure());
    }

    // A plain 200 from a non-Github mock server must succeed.
    #[tokio::test]
    async fn test_non_github() {
        let mock_server = mock_server!(StatusCode::OK);
        let res = get_mock_client_response(mock_server.uri()).await;
        assert!(res.status().is_success());
    }

    // NOTE(review): depends on the external https://expired.badssl.com/ test host.
    #[tokio::test]
    async fn test_invalid_ssl() {
        let res = get_mock_client_response("https://expired.badssl.com/").await;
        assert!(res.status().is_failure());
        // Same, but ignore certificate error
        let res = ClientBuilder::default()
            .allow_insecure(true)
            .build()
            .unwrap()
            .check("https://expired.badssl.com/")
            .await
            .unwrap();
        assert!(res.status().is_success());
    }

    #[tokio::test]
    async fn test_custom_headers() {
        let res = get_mock_client_response("https://crates.io/crates/lychee/").await;
        assert!(res.status().is_failure());
        // Try again, but with a custom header.
        // For example, crates.io requires a custom accept header.
        // See https://github.com/rust-lang/crates.io/issues/788
        let mut custom = HeaderMap::new();
        custom.insert(header::ACCEPT, "text/html".parse().unwrap());
        let res = ClientBuilder::default()
            .custom_headers(custom)
            .build()
            .unwrap()
            .check("https://crates.io/crates/lychee")
            .await
            .unwrap();
        assert!(res.status().is_success());
    }

    #[tokio::test]
    async fn test_timeout() {
        // Note: this checks response timeout, not connect timeout.
        // To check connect timeout, we'd have to do something more involved,
        // see: https://github.com/LukeMathWalker/wiremock-rs/issues/19
        let mock_delay = Duration::from_millis(20);
        let checker_timeout = Duration::from_millis(10);
        assert!(mock_delay > checker_timeout);

        let mock_server = mock_server!(StatusCode::OK, set_delay(mock_delay));
        let client = ClientBuilder::default()
            .timeout(checker_timeout)
            .build()
            .unwrap();

        let res = client.check(mock_server.uri()).await.unwrap();
        assert!(res.status().is_timeout());
    }
}

View file

@ -4,6 +4,7 @@ use tokio::sync::mpsc;
use crate::{client, types};
#[allow(missing_debug_implementations)]
pub struct ClientPool {
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Request>,
@ -11,6 +12,7 @@ pub struct ClientPool {
}
impl ClientPool {
#[must_use]
pub fn new(
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Request>,
@ -20,12 +22,15 @@ impl ClientPool {
ClientPool { tx, rx, pool }
}
#[allow(clippy::missing_panics_doc)]
pub async fn listen(&mut self) {
while let Some(req) = self.rx.recv().await {
let client = self.pool.get().await;
let tx = self.tx.clone();
tokio::spawn(async move {
let resp = client.check(req).await.expect("Invalid URI");
// Client::check() may fail only because Request::try_from() may fail
// here request is already Request, so it never fails
let resp = client.check(req).await.unwrap();
tx.send(resp)
.await
.expect("Cannot send response to channel");

View file

@ -1,32 +1,42 @@
use crate::{
extract::{extract_links, FileType},
Request,
use std::{
collections::HashSet,
fmt::Display,
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use std::path::Path;
use std::path::PathBuf;
use std::{collections::HashSet, fmt::Display};
use tokio::fs::read_to_string;
use tokio::io::{stdin, AsyncReadExt};
use tokio::{
fs::read_to_string,
io::{stdin, AsyncReadExt},
};
use crate::{
extract::{extract_links, FileType},
Request, Result,
};
const STDIN: &str = "-";
/// Links which need to be validated.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Input {
RemoteUrl(Url),
/// URL (of HTTP/HTTPS scheme).
RemoteUrl(Box<Url>),
/// Unix shell style glob pattern.
FsGlob { pattern: String, ignore_case: bool },
/// File path.
FsPath(PathBuf),
/// Standard Input.
Stdin,
/// Raw string input.
String(String),
}
impl Serialize for Input {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
@ -36,26 +46,13 @@ impl Serialize for Input {
impl Display for Input {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Input::RemoteUrl(url) => {
write!(f, "{}", url)
}
Input::FsGlob {
pattern,
ignore_case: _,
} => {
write!(f, "{}", pattern)
}
Input::FsPath(path) => {
write!(f, "{}", path.to_str().unwrap_or_default())
}
Input::Stdin => {
write!(f, "stdin")
}
Input::String(_) => {
write!(f, "raw input string")
}
}
f.write_str(match self {
Input::RemoteUrl(url) => url.as_str(),
Input::FsGlob { pattern, .. } => pattern,
Input::FsPath(path) => path.to_str().unwrap_or_default(),
Input::Stdin => "stdin",
Input::String(_) => "raw input string",
})
}
}
@ -67,6 +64,7 @@ pub struct InputContent {
}
impl InputContent {
#[must_use]
pub fn from_string(s: &str, file_type: FileType) -> Self {
// TODO: consider using Cow (to avoid one .clone() for String types)
Self {
@ -78,58 +76,50 @@ impl InputContent {
}
impl Input {
#[must_use]
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
if value == STDIN {
Self::Stdin
} else if let Ok(url) = Url::parse(&value) {
Self::RemoteUrl(Box::new(url))
} else {
match Url::parse(&value) {
Ok(url) => Self::RemoteUrl(url),
Err(_) => {
// this seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(value) != value;
// this seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(value) != value;
if is_glob {
Self::FsGlob {
pattern: value.to_owned(),
ignore_case: glob_ignore_case,
}
} else {
Self::FsPath(value.into())
}
if is_glob {
Self::FsGlob {
pattern: value.to_owned(),
ignore_case: glob_ignore_case,
}
} else {
Self::FsPath(value.into())
}
}
}
#[allow(clippy::missing_panics_doc, clippy::missing_errors_doc)]
pub async fn get_contents(
&self,
file_type_hint: Option<FileType>,
skip_missing: bool,
) -> Result<Vec<InputContent>> {
use Input::*;
match self {
match *self {
// TODO: should skip_missing also affect URLs?
RemoteUrl(url) => Ok(vec![Self::url_contents(url).await?]),
FsGlob {
pattern,
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
Input::FsGlob {
ref pattern,
ignore_case,
} => Ok(Self::glob_contents(pattern, *ignore_case).await?),
FsPath(path) => {
let content = Self::path_content(&path).await.with_context(|| {
format!(
"Failed to read file: `{}`",
path.to_str().unwrap_or("<MALFORMED PATH>")
)
});
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
Input::FsPath(ref path) => {
let content = Self::path_content(path).await;
match content {
Ok(input_content) => Ok(vec![input_content]),
Err(_) if skip_missing => Ok(vec![]),
Err(arg) => Err(anyhow!(arg)),
Err(e) => Err(e),
}
}
Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
String(s) => Ok(vec![Self::string_content(s, file_type_hint)]),
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
}
}
@ -142,11 +132,10 @@ impl Input {
};
let res = reqwest::get(url.clone()).await?;
let content = res.text().await?;
let input_content = InputContent {
input: Input::RemoteUrl(url.clone()),
input: Input::RemoteUrl(Box::new(url.clone())),
file_type,
content,
content: res.text().await?,
};
Ok(input_content)
@ -172,10 +161,13 @@ impl Input {
Ok(contents)
}
async fn path_content<P: Into<PathBuf> + AsRef<Path>>(path: P) -> Result<InputContent> {
async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
let content = read_to_string(&path)
.await
.map_err(|e| (path.clone().into(), e))?;
let input_content = InputContent {
file_type: FileType::from(path.as_ref()),
content: read_to_string(&path).await?,
content,
input: Input::FsPath(path.into()),
};
@ -203,15 +195,17 @@ impl Input {
/// Fetch all unique links from a slice of inputs
/// All relative URLs get prefixed with `base_url` if given.
#[allow(clippy::missing_errors_doc)]
pub async fn collect_links(
inputs: &[Input],
base_url: Option<String>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Result<HashSet<Request>> {
let base_url = match base_url {
Some(url) => Some(Url::parse(&url)?),
_ => None,
let base_url = if let Some(url) = base_url {
Some(Url::parse(&url).map_err(|e| (url, e))?)
} else {
None
};
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(max_concurrency);
@ -236,7 +230,7 @@ pub async fn collect_links(
for input_content in result? {
let base_url = base_url.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, base_url));
tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
extract_links_handles.push(handle);
}
}
@ -257,23 +251,32 @@ pub async fn collect_links(
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
use std::{fs::File, io::Write};
use http::StatusCode;
use pretty_assertions::assert_eq;
use reqwest::Url;
use super::{collect_links, Input};
use crate::{
test_utils::{get_mock_server_with_content, website},
Uri,
extract::FileType,
mock_server,
test_utils::{mail, website},
Result, Uri,
};
use std::fs::File;
use std::io::Write;
use std::str::FromStr;
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
const TEST_GLOB_1: &str = "https://test-glob-1.io";
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
#[tokio::test]
#[ignore]
async fn test_file_without_extension_is_plaintext() -> Result<()> {
let dir = tempfile::tempdir()?;
let temp_dir = tempfile::tempdir()?;
// Treat as plaintext file (no extension)
let file_path = dir.path().join("README");
let file_path = temp_dir.path().join("README");
let _file = File::create(&file_path)?;
let input = Input::new(&file_path.as_path().display().to_string(), true);
let contents = input.get_contents(None, true).await?;
@ -295,16 +298,12 @@ mod test {
#[tokio::test]
async fn test_collect_links() -> Result<()> {
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
const TEST_GLOB_1: &str = "https://test-glob-1.io";
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
let temp_dir = tempfile::tempdir()?;
let temp_dir_path = temp_dir.path();
let dir = tempfile::tempdir()?;
let file_path = dir.path().join("f");
let file_glob_1_path = dir.path().join("glob-1");
let file_glob_2_path = dir.path().join("glob-2");
let file_path = temp_dir_path.join("f");
let file_glob_1_path = temp_dir_path.join("glob-1");
let file_glob_2_path = temp_dir_path.join("glob-2");
let mut file = File::create(&file_path)?;
let mut file_glob_1 = File::create(file_glob_1_path)?;
@ -314,14 +313,16 @@ mod test {
writeln!(file_glob_1, "{}", TEST_GLOB_1)?;
writeln!(file_glob_2, "{}", TEST_GLOB_2_MAIL)?;
let mock_server = get_mock_server_with_content(http::StatusCode::OK, Some(TEST_URL)).await;
let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
let inputs = vec![
Input::String(TEST_STRING.to_string()),
Input::RemoteUrl(Url::from_str(&mock_server.uri())?),
Input::String(TEST_STRING.to_owned()),
Input::RemoteUrl(Box::new(
Url::parse(&mock_server.uri()).map_err(|e| (mock_server.uri(), e))?,
)),
Input::FsPath(file_path),
Input::FsGlob {
pattern: dir.path().join("glob*").to_str().unwrap().to_string(),
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
ignore_case: true,
},
];
@ -329,12 +330,12 @@ mod test {
let responses = collect_links(&inputs, None, false, 8).await?;
let mut links = responses.into_iter().map(|r| r.uri).collect::<Vec<Uri>>();
let mut expected_links: Vec<Uri> = vec![
let mut expected_links = vec![
website(TEST_STRING),
website(TEST_URL),
website(TEST_FILE),
website(TEST_GLOB_1),
Uri::Mail(TEST_GLOB_2_MAIL.to_string()),
mail(TEST_GLOB_2_MAIL),
];
links.sort();

View file

@ -1,15 +1,17 @@
use crate::uri::Uri;
use crate::{collector::InputContent, Request};
use html5ever::parse_document;
use html5ever::tendril::{StrTendril, TendrilSink};
use std::{collections::HashSet, convert::TryFrom, path::Path};
use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use std::path::Path;
use std::{collections::HashSet, convert::TryFrom};
use url::Url;
#[derive(Clone, Debug, PartialEq, Eq)]
use crate::{collector::InputContent, Request, Uri};
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FileType {
Html,
Markdown,
@ -26,20 +28,17 @@ impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn from(p: P) -> FileType {
let path = p.as_ref();
match path.extension() {
Some(ext) => match ext {
_ if (ext == "md" || ext == "markdown") => FileType::Markdown,
_ if (ext == "htm" || ext == "html") => FileType::Html,
_ => FileType::Plaintext,
},
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
None => FileType::Html,
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
Some("md") | Some("markdown") => FileType::Markdown,
Some("htm") | Some("html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
}
@ -55,10 +54,9 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(tag) => match tag {
Tag::Link(_, url, _) | Tag::Image(_, url, _) => vec![url.to_string()],
_ => vec![],
},
MDEvent::Start(Tag::Link(_, url, _)) | MDEvent::Start(Tag::Image(_, url, _)) => {
vec![url.to_string()]
}
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
@ -69,7 +67,7 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
/// Extract unparsed URL strings from a HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), Default::default()).one(tendril);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
@ -84,15 +82,11 @@ fn extract_links_from_html(input: &str) -> Vec<String> {
fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
for link in extract_links_from_plaintext(&contents.borrow()) {
urls.push(link);
}
urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
}
NodeData::Comment { ref contents } => {
for link in extract_links_from_plaintext(contents) {
urls.push(link);
}
urls.append(&mut extract_links_from_plaintext(contents));
}
NodeData::Element {
@ -106,9 +100,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
for link in extract_links_from_plaintext(&attr_value) {
urls.push(link);
}
urls.append(&mut extract_links_from_plaintext(&attr_value));
}
}
}
@ -148,7 +140,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: Option<Url>,
base_url: &Option<Url>,
) -> HashSet<Request> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
@ -160,21 +152,14 @@ pub(crate) fn extract_links(
// Silently ignore the parse failures for now.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
match Uri::try_from(link.as_str()) {
Ok(uri) => {
requests.insert(Request::new(uri, input_content.input.clone()));
}
Err(_) => {
if !Path::new(&link).exists() {
if let Some(base_url) = &base_url {
if let Ok(new_url) = base_url.join(&link) {
requests.insert(Request::new(
Uri::Website(new_url),
input_content.input.clone(),
));
}
}
}
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
} else if !Path::new(&link).exists() {
if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
requests.insert(Request::new(
Uri { url: new_url },
input_content.input.clone(),
));
}
};
}
@ -183,15 +168,29 @@ pub(crate) fn extract_links(
#[cfg(test)]
mod test {
use crate::test_utils::website;
use std::{
array,
collections::HashSet,
fs::File,
io::{BufReader, Read},
path::Path,
};
use super::*;
use pretty_assertions::assert_eq;
use std::fs::File;
use std::io::{BufReader, Read};
use url::Url;
use super::{
extract_links, extract_links_from_html, extract_links_from_markdown,
extract_links_from_plaintext, find_links, FileType,
};
use crate::{
collector::InputContent,
test_utils::{mail, website},
Uri,
};
fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(module_path!())
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.join("fixtures")
@ -208,106 +207,92 @@ mod test {
content
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
extract_links(
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect()
}
#[test]
fn test_file_type() {
// FIXME: Assume plaintext in case a path has no extension
// assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown);
assert_eq!(FileType::from("test.md"), FileType::Markdown);
assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
assert_eq!(FileType::from("test.html"), FileType::Html);
assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
assert_eq!(FileType::from("test.something"), FileType::Plaintext);
assert_eq!(
FileType::from(Path::new("test.markdown")),
FileType::Markdown
);
assert_eq!(FileType::from(Path::new("test.html")), FileType::Html);
assert_eq!(FileType::from(Path::new("test.txt")), FileType::Plaintext);
assert_eq!(
FileType::from(Path::new("test.something")),
FileType::Plaintext
);
assert_eq!(
FileType::from(Path::new("/absolute/path/to/test.something")),
FileType::from("/absolute/path/to/test.something"),
FileType::Plaintext
);
}
#[test]
fn test_extract_link_at_end_of_line() {
let link = "http://www.apache.org/licenses/LICENSE-2.0";
let input = format!("{}\n", link);
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
let found = extract_links_from_markdown(&input);
assert_eq!(vec![link], found);
let found = extract_links_from_plaintext(&input);
assert_eq!(vec![link], found);
let found = extract_links_from_html(&input);
assert_eq!(vec![link], found);
assert_eq!(vec![link], extract_links_from_markdown(&input));
assert_eq!(vec![link], extract_links_from_plaintext(&input));
assert_eq!(vec![link], extract_links_from_html(&input));
}
#[test]
fn test_extract_markdown_links() {
let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::Markdown),
Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect();
assert_eq!(
links,
[
website("https://endler.dev"),
website("https://github.com/hello-rust/lychee/relative_link"),
]
.iter()
.cloned()
.collect()
)
let links = extract_uris(
"This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)",
FileType::Markdown,
Some("https://github.com/hello-rust/lychee/"),
);
let expected_links = array::IntoIter::new([
website("https://endler.dev"),
website("https://github.com/hello-rust/lychee/relative_link"),
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links)
}
#[test]
fn test_extract_html_links() {
let input = r#"<html>
let links = extract_uris(
r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">
<a href="blob/master/README.md">README</a>
</div>
</html>"#;
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::Html),
Some(Url::parse("https://github.com/lycheeverse/").unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect();
assert_eq!(
links,
[
website("https://github.com/lycheeverse/lychee/"),
website("https://github.com/lycheeverse/blob/master/README.md"),
]
.iter()
.cloned()
.collect::<HashSet<Uri>>(),
</html>"#,
FileType::Html,
Some("https://github.com/lycheeverse/"),
);
let expected_links = array::IntoIter::new([
website("https://github.com/lycheeverse/lychee/"),
website("https://github.com/lycheeverse/blob/master/README.md"),
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
#[test]
fn test_skip_markdown_anchors() {
let input = "This is [a test](#lol).";
let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None);
assert_eq!(links, HashSet::new())
let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None);
assert!(links.is_empty())
}
#[test]
fn test_skip_markdown_internal_urls() {
let input = "This is [a test](./internal).";
let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None);
assert_eq!(links, HashSet::new())
let links = extract_uris("This is [a test](./internal).", FileType::Markdown, None);
assert!(links.is_empty())
}
#[test]
@ -317,23 +302,16 @@ mod test {
This is [an internal url](@/internal.markdown) \
This is [an internal url](@/internal.markdown#example) \
This is [an internal url](@/internal.md#example)";
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::Markdown),
Some(Url::parse(base_url).unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect();
let expected = [
let links = extract_uris(input, FileType::Markdown, Some(base_url));
let expected = array::IntoIter::new([
website("https://localhost.com/@/internal.md"),
website("https://localhost.com/@/internal.markdown"),
website("https://localhost.com/@/internal.md#example"),
website("https://localhost.com/@/internal.markdown#example"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected)
}
@ -341,15 +319,9 @@ mod test {
#[test]
fn test_skip_markdown_email() {
let input = "Get in touch - [Contact Us](mailto:test@test.com)";
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(input, FileType::Markdown), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected: HashSet<Uri> = [Uri::Mail("test@test.com".to_string())]
.iter()
.cloned()
.collect();
let links = extract_uris(input, FileType::Markdown, None);
let expected = array::IntoIter::new([mail("test@test.com")]).collect::<HashSet<Uri>>();
assert_eq!(links, expected)
}
@ -357,55 +329,40 @@ mod test {
fn test_non_markdown_links() {
let input =
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org";
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(input, FileType::Plaintext), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext, None);
let expected = [
let expected = array::IntoIter::new([
website("https://endler.dev"),
website("https://hello-rust.show/foo/bar?lol=1"),
Uri::Mail("test@example.org".to_string()),
]
.iter()
.cloned()
.collect();
mail("test@example.org"),
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected)
}
#[test]
#[ignore]
// TODO: Does this escaping need to work properly?
// See https://github.com/tcort/markdown-link-check/issues/37
fn test_md_escape() {
let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
let links = find_links(input);
let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
matches!(&links[..], [link] if link.as_str() == expected);
}
#[test]
fn test_extract_html5_not_valid_xml() {
let input = load_fixture("TEST_HTML5.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, None);
let expected_links = [
let expected_links = array::IntoIter::new([
website("https://example.org/head/home"),
website("https://example.org/css/style_full_url.css"),
// the body links wouldn't be present if the file was parsed strictly as XML
website("https://example.org/body/a"),
website("https://example.org/body/div_empty_a"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -413,15 +370,9 @@ mod test {
#[test]
fn test_extract_html5_not_valid_xml_relative_links() {
let input = load_fixture("TEST_HTML5.html");
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(&input, FileType::Html),
Some(Url::parse("https://example.org").unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, Some("https://example.org"));
let expected_links = [
let expected_links = array::IntoIter::new([
website("https://example.org/head/home"),
website("https://example.org/images/icon.png"),
website("https://example.org/css/style_relative_url.css"),
@ -430,10 +381,8 @@ mod test {
// the body links wouldn't be present if the file was parsed strictly as XML
website("https://example.org/body/a"),
website("https://example.org/body/div_empty_a"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -442,16 +391,10 @@ mod test {
fn test_extract_html5_lowercase_doctype() {
// this has been problematic with previous XML based parser
let input = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, None);
let expected_links = [website("https://example.org/body/a")]
.iter()
.cloned()
.collect();
let expected_links =
array::IntoIter::new([website("https://example.org/body/a")]).collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -460,22 +403,16 @@ mod test {
fn test_extract_html5_minified() {
// minified HTML with some quirky elements such as href attribute values specified without quotes
let input = load_fixture("TEST_HTML5_MINIFIED.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, None);
let expected_links = [
let expected_links = array::IntoIter::new([
website("https://example.org/"),
website("https://example.org/favicon.ico"),
website("https://fonts.externalsite.com"),
website("https://example.org/docs/"),
website("https://example.org/forum"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -484,18 +421,10 @@ mod test {
fn test_extract_html5_malformed() {
// malformed links shouldn't stop the parser from further parsing
let input = load_fixture("TEST_HTML5_MALFORMED_LINKS.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, None);
let expected_links = [Uri::Website(
Url::parse("https://example.org/valid").unwrap(),
)]
.iter()
.cloned()
.collect();
let expected_links =
array::IntoIter::new([website("https://example.org/valid")]).collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -504,21 +433,15 @@ mod test {
fn test_extract_html5_custom_elements() {
// the element name shouldn't matter for attributes like href, src, cite etc
let input = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Html, None);
let expected_links = [
let expected_links = array::IntoIter::new([
website("https://example.org/some-weird-element"),
website("https://example.org/even-weirder-src"),
website("https://example.org/even-weirder-href"),
website("https://example.org/citations"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
@ -527,21 +450,13 @@ mod test {
fn test_extract_urls_with_at_sign_properly() {
// note that these used to parse as emails
let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(&input, FileType::Plaintext),
None,
)
.into_iter()
.map(|r| r.uri)
.collect();
let links = extract_uris(&input, FileType::Plaintext, None);
let expected_links = [
let expected_links = array::IntoIter::new([
website("https://example.com/@test/test"),
website("http://otherdomain.com/test/@test"),
]
.iter()
.cloned()
.collect();
])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}

View file

@ -1,14 +1,14 @@
use lazy_static::lazy_static;
use regex::RegexSet;
use std::net::IpAddr;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_REGEX: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
/// Exclude configuration for the link checker.
/// You can ignore links based on regex patterns or pre-defined IP ranges.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct Excludes {
/// User-defined set of excluded regex patterns
@ -37,54 +37,40 @@ impl Default for Excludes {
}
impl Excludes {
#[inline]
#[must_use]
pub fn regex(&self, input: &str) -> bool {
if let Some(excludes) = &self.regex {
if excludes.is_match(input) {
return true;
}
}
false
self.regex.as_ref().map_or(false, |re| re.is_match(input))
}
pub fn false_positive(&self, input: &str) -> bool {
lazy_static! {
static ref FALSE_POSITIVES: RegexSet = RegexSet::new(FALSE_POSITIVE_REGEX).unwrap();
}
FALSE_POSITIVES.is_match(input)
#[must_use]
pub fn is_false_positive(input: &str) -> bool {
input == FALSE_POSITIVE_PAT[0]
}
#[must_use]
pub fn ip(&self, uri: &Uri) -> bool {
if let Some(ipaddr) = uri.host_ip() {
if self.loopback_ips && ipaddr.is_loopback() {
return true;
}
match uri.host_ip() {
Some(ip_addr) if self.loopback_ips && ip_addr.is_loopback() => true,
// Note: in a pathological case, an IPv6 address can be IPv4-mapped
// (IPv4 address embedded in a IPv6). We purposefully
// don't deal with it here, and assume if an address is IPv6,
// we shouldn't attempt to map it to IPv4.
// See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2
if let IpAddr::V4(v4addr) = ipaddr {
if self.private_ips && v4addr.is_private() {
return true;
}
if self.link_local_ips && v4addr.is_link_local() {
return true;
}
}
Some(IpAddr::V4(v4_addr)) if self.private_ips && v4_addr.is_private() => true,
Some(IpAddr::V4(v4_addr)) if self.link_local_ips && v4_addr.is_link_local() => true,
_ => false,
}
false
}
pub fn is_mail_excluded(&self) -> bool {
#[inline]
#[must_use]
pub const fn is_mail_excluded(&self) -> bool {
self.mail
}
#[inline]
pub fn is_empty(&self) -> bool {
match &self.regex {
None => true,
Some(regex_set) => regex_set.is_empty(),
}
self.regex.as_ref().map_or(true, RegexSet::is_empty)
}
}

View file

@ -0,0 +1,21 @@
use regex::RegexSet;
/// Include configuration for the link checker.
/// You can include links based on regex patterns
#[derive(Clone, Debug, Default)]
pub struct Includes {
pub regex: Option<RegexSet>,
}
impl Includes {
#[inline]
#[must_use]
pub fn regex(&self, input: &str) -> bool {
self.regex.as_ref().map_or(false, |re| re.is_match(input))
}
#[inline]
pub fn is_empty(&self) -> bool {
self.regex.as_ref().map_or(true, RegexSet::is_empty)
}
}

View file

@ -0,0 +1,309 @@
mod excludes;
mod includes;
pub use excludes::Excludes;
pub use includes::Includes;
use crate::uri::Uri;
/// A generic URI filter
/// Used to decide if a given URI should be checked or skipped
#[derive(Clone, Debug, Default)]
pub struct Filter {
    // User-supplied include patterns; these take precedence over excludes
    pub(crate) includes: Includes,
    // User-supplied exclude patterns plus IP/mail/false-positive rules
    pub(crate) excludes: Excludes,
    // If set, URIs whose scheme differs from this value are excluded
    pub(crate) scheme: Option<String>,
}
impl Filter {
    /// Create a new filter from optional include/exclude configurations
    /// and an optional scheme restriction. `None` falls back to the
    /// default (empty) configuration for that part.
    #[must_use]
    pub fn new(
        includes: Option<Includes>,
        excludes: Option<Excludes>,
        scheme: Option<String>,
    ) -> Self {
        Filter {
            includes: includes.unwrap_or_default(),
            excludes: excludes.unwrap_or_default(),
            scheme,
        }
    }

    /// Decide whether `uri` should be skipped.
    ///
    /// Precedence: mail/IP exclusions first, then include patterns
    /// (which override both excludes and known false positives), then
    /// exclude patterns, and finally the scheme restriction.
    #[must_use]
    pub fn is_excluded(&self, uri: &Uri) -> bool {
        // Skip mail?
        if self.excludes.is_mail_excluded() && uri.scheme() == "mailto" {
            return true;
        }
        // Skip specific IP address?
        // (`uri` is already a reference; no extra borrow needed)
        if self.excludes.ip(uri) {
            return true;
        }

        let input = uri.as_str();

        if self.includes.is_empty() {
            if self.excludes.is_empty() {
                // No regex includes/excludes at all?
                // Not excluded unless it's a known false positive.
                // NOTE(review): this early return also skips the scheme
                // check below — confirm that an empty regex config is
                // meant to bypass scheme filtering.
                return Excludes::is_false_positive(input);
            }
        } else if self.includes.regex(input) {
            // Included explicitly (Includes take precedence over excludes)
            return false;
        }

        // Exclude well-known false-positives.
        // This is done after checking includes to allow for user-overwrites.
        if Excludes::is_false_positive(input) {
            return true;
        }

        if self.excludes.is_empty() {
            if !self.includes.is_empty() {
                // In case we have includes and no excludes,
                // skip everything that was not included
                return true;
            }
        } else if self.excludes.regex(input) {
            // Excluded explicitly
            return true;
        }

        // URI scheme excluded?
        matches!(self.scheme, Some(ref scheme) if scheme != uri.scheme())
    }
}
#[cfg(test)]
mod test {
    use regex::RegexSet;
    use reqwest::Url;
    use url::Host;

    use super::{Excludes, Filter, Includes};
    use crate::test_utils::{mail, website};

    // Note: the standard library as of Rust stable 1.47.0 does not expose
    // "link-local" or "private" IPv6 checks. However, one might argue
    // that these concepts do exist in IPv6, albeit the naming is different.
    // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
    // See: https://en.wikipedia.org/wiki/Private_network#IPv6
    // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
    const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
    const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
    const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
    const V4_LOOPBACK: &str = "http://127.0.0.1";
    const V6_LOOPBACK: &str = "http://[::1]";
    const V4_LINK_LOCAL: &str = "http://169.254.0.1";

    // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
    const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
    const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";

    // Asserts that the given URL string parses to an IP host for which
    // `$predicate` (e.g. `is_private`, `is_loopback`) holds. Used to
    // sanity-check the address constants above.
    macro_rules! assert_ip_address {
        (v4: $ip:expr, $predicate:tt) => {{
            let res = if let Host::Ipv4(ipv4) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv4.$predicate()
            } else {
                false
            };
            std::assert!(res);
        }};
        (v6: $ip:expr, $predicate:tt) => {
            let res = if let Host::Ipv6(ipv6) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv6.$predicate()
            } else {
                false
            };
            std::assert!(res);
        };
    }

    // Verify that each IP constant actually has the property its name
    // advertises, so later tests test the filter and not the fixtures.
    #[allow(clippy::shadow_unrelated)]
    #[test]
    fn test_const_sanity() -> Result<(), ()> {
        assert_ip_address!(v4: V4_PRIVATE_CLASS_A, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_B, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_C, is_private);
        assert_ip_address!(v4: V4_LOOPBACK, is_loopback);
        assert_ip_address!(v6: V6_LOOPBACK, is_loopback);
        assert_ip_address!(v4: V4_LINK_LOCAL, is_link_local);
        Ok(())
    }

    #[test]
    fn test_includes_and_excludes_empty() {
        // This is the pre-configured, empty set of excludes for a client
        // In this case, only the requests matching the include set will be checked
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website("https://example.org")));
    }

    #[test]
    fn test_false_positives() {
        let filter = Filter::default();

        // The XHTML namespace URL is a known false positive
        assert!(filter.is_excluded(&website("http://www.w3.org/1999/xhtml")));
        assert!(!filter.is_excluded(&website("https://example.org")));
    }

    #[test]
    fn test_overwrite_false_positives() {
        // An explicit include pattern overrides the built-in false positive
        let includes = Includes {
            regex: Some(RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap()),
        };
        let filter = Filter {
            includes,
            ..Filter::default()
        };

        assert!(!filter.is_excluded(&website("http://www.w3.org/1999/xhtml")));
    }

    #[test]
    fn test_include_regex() {
        let includes = Includes {
            regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
        };
        let filter = Filter {
            includes,
            ..Filter::default()
        };

        // Only the requests matching the include set will be checked
        assert!(!filter.is_excluded(&website("https://foo.example.org")));
        assert!(filter.is_excluded(&website("https://bar.example.org")));
        assert!(filter.is_excluded(&website("https://example.org")));
    }

    #[test]
    fn test_exclude_mail() {
        let excludes = Excludes {
            mail: true,
            ..Excludes::default()
        };
        let filter = Filter {
            excludes,
            ..Filter::default()
        };

        // All mailto URIs are skipped; websites are unaffected
        assert!(filter.is_excluded(&mail("mail@example.org")));
        assert!(filter.is_excluded(&mail("foo@bar.dev")));
        assert!(!filter.is_excluded(&website("http://bar.dev")));
    }

    #[test]
    fn test_exclude_regex() {
        let excludes = Excludes {
            regex: Some(
                RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(),
            ),
            ..Excludes::default()
        };
        let filter = Filter {
            excludes,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website("http://github.com")));
        assert!(filter.is_excluded(&website("http://exclude.org")));
        assert!(filter.is_excluded(&mail("mail@example.org")));

        assert!(!filter.is_excluded(&website("http://bar.dev")));
        assert!(!filter.is_excluded(&mail("foo@bar.dev")));
    }

    #[test]
    fn test_exclude_include_regex() {
        let includes = Includes {
            regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
        };
        let excludes = Excludes {
            regex: Some(RegexSet::new(&[r"example.org"]).unwrap()),
            ..Excludes::default()
        };
        let filter = Filter {
            includes,
            excludes,
            ..Filter::default()
        };

        // Includes take preference over excludes
        assert!(!filter.is_excluded(&website("https://foo.example.org")),);

        assert!(filter.is_excluded(&website("https://example.org")));
        assert!(filter.is_excluded(&website("https://bar.example.org")));
    }

    #[test]
    fn test_excludes_no_private_ips_by_default() {
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_B)));
        assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_C)));
        assert!(!filter.is_excluded(&website(V4_LINK_LOCAL)));
        assert!(!filter.is_excluded(&website(V4_LOOPBACK)));
        assert!(!filter.is_excluded(&website(V6_LOOPBACK)));
    }

    #[test]
    fn test_exclude_private_ips() {
        let filter = Filter {
            excludes: Excludes {
                private_ips: true,
                ..Excludes::default()
            },
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_A)));
        assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_B)));
        assert!(filter.is_excluded(&website(V4_PRIVATE_CLASS_C)));
    }

    #[test]
    fn test_exclude_link_local() {
        let filter = Filter {
            excludes: Excludes {
                link_local_ips: true,
                ..Excludes::default()
            },
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website(V4_LINK_LOCAL)));
    }

    #[test]
    fn test_exclude_loopback() {
        let filter = Filter {
            excludes: Excludes {
                loopback_ips: true,
                ..Excludes::default()
            },
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website(V4_LOOPBACK)));
        assert!(filter.is_excluded(&website(V6_LOOPBACK)));
    }

    #[test]
    fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
        let filter = Filter {
            excludes: Excludes {
                private_ips: true,
                link_local_ips: true,
                ..Excludes::default()
            },
            ..Filter::default()
        };

        // if these were pure IPv4, we would exclude
        assert!(!filter.is_excluded(&website(V6_MAPPED_V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website(V6_MAPPED_V4_LINK_LOCAL)));
    }
}

73
lychee-lib/src/lib.rs Normal file
View file

@ -0,0 +1,73 @@
//! `lychee` is a library for checking links.
//! "Hello world" example:
//! ```
//! use lychee_lib::Result;
//!
//! #[tokio::main]
//! async fn main() -> Result<()> {
//! let response = lychee_lib::check("https://github.com/lycheeverse/lychee").await?;
//! println!("{}", response);
//! Ok(())
//! }
//! ```
//!
//! For more specific use-cases you can build a lychee client yourself,
//! using the `ClientBuilder` which can be used to
//! configure and run your own link checker and grants full flexibility:
//!
//! ```
//! use lychee_lib::{ClientBuilder, Result, Status};
//!
//! #[tokio::main]
//! async fn main() -> Result<()> {
//! let client = ClientBuilder::default().build()?;
//! let response = client.check("https://github.com/lycheeverse/lychee").await?;
//! assert!(response.status().is_success());
//! Ok(())
//! }
//! ```
#![warn(clippy::all, clippy::pedantic)]
#![warn(
absolute_paths_not_starting_with_crate,
invalid_html_tags,
missing_copy_implementations,
missing_debug_implementations,
semicolon_in_expressions_from_macros,
unreachable_pub,
unused_crate_dependencies,
unused_extern_crates,
variant_size_differences,
clippy::missing_const_for_fn
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
// #![deny(missing_docs)]
#[cfg(doctest)]
doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
mod quirks;
mod types;
mod uri;
pub mod collector;
pub mod extract;
pub mod filter;
#[cfg(test)]
#[macro_use]
pub mod test_utils;
#[cfg(test)]
use doc_comment as _; // required for doctest
use openssl_sys as _; // required for vendored-openssl feature
use ring as _; // required for apple silicon
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
collector::Input,
filter::{Excludes, Filter, Includes},
types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
uri::Uri,
};

View file

@ -6,13 +6,13 @@ use reqwest::{Request, Url};
const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
#[derive(Debug, Clone)]
pub struct Quirk {
pub pattern: Regex,
pub rewrite: fn(Request) -> Request,
pub(crate) struct Quirk {
pub(crate) pattern: Regex,
pub(crate) rewrite: fn(Request) -> Request,
}
#[derive(Debug, Clone)]
pub struct Quirks {
pub(crate) struct Quirks {
quirks: Vec<Quirk>,
}
@ -62,7 +62,7 @@ impl Quirks {
/// Apply quirks to a given request. Only the first quirk regex pattern
/// matching the URL will be applied. The rest will be discarded for
/// simplicity reasons. This limitation might be lifted in the future.
pub fn apply(&self, request: Request) -> Request {
pub(crate) fn apply(&self, request: Request) -> Request {
for quirk in &self.quirks {
if quirk.pattern.is_match(request.url().as_str()) {
return (quirk.rewrite)(request);
@ -75,51 +75,68 @@ impl Quirks {
#[cfg(test)]
mod tests {
use super::*;
use http::{header, Method};
use pretty_assertions::assert_eq;
use reqwest::{Request, Url};
use super::{Quirks, GOOGLEBOT};
// Comparison wrapper around `reqwest::Request` that checks only URL and
// method, so test assertions can compare requests structurally.
#[derive(Debug)]
struct MockRequest(Request);

impl MockRequest {
    // Build the expected request for an assertion
    fn new(method: Method, url: Url) -> Self {
        Self(Request::new(method, url))
    }
}

impl PartialEq for MockRequest {
    fn eq(&self, other: &Self) -> bool {
        self.0.url() == other.0.url() && self.0.method() == other.0.method()
    }
}
#[test]
fn test_twitter_request() {
let orig = Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap();
let request = Request::new(Method::GET, orig.clone());
let quirks = Quirks::default();
let modified = quirks.apply(request);
assert_eq!(modified.url(), &orig);
assert_eq!(modified.method(), Method::HEAD);
let url = Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap();
let request = Request::new(Method::GET, url.clone());
let modified = Quirks::default().apply(request);
assert_eq!(
modified.headers().get(header::USER_AGENT).unwrap(),
&GOOGLEBOT
);
assert_eq!(MockRequest(modified), MockRequest::new(Method::HEAD, url));
}
#[test]
fn test_youtube_video_request() {
let orig = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap();
let request = Request::new(Method::GET, orig);
let quirks = Quirks::default();
let modified = quirks.apply(request);
let url = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap();
let request = Request::new(Method::GET, url);
let modified = Quirks::default().apply(request);
let expected_url = Url::parse("https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DNlKuICiT470%26list%3DPLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ%26index%3D7").unwrap();
assert_eq!(modified.url(), &expected_url);
assert_eq!(modified.method(), Method::GET);
assert_eq!(
MockRequest(modified),
MockRequest::new(Method::GET, expected_url)
);
}
#[test]
fn test_non_video_youtube_url_untouched() {
let orig = Url::parse("https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA").unwrap();
let request = Request::new(Method::GET, orig.clone());
let quirks = Quirks::default();
let modified = quirks.apply(request);
assert_eq!(modified.url(), &orig);
assert_eq!(modified.method(), Method::GET);
let url = Url::parse("https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA").unwrap();
let request = Request::new(Method::GET, url.clone());
let modified = Quirks::default().apply(request);
assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url));
}
#[test]
fn test_no_quirk_applied() {
let orig = Url::parse("https://endler.dev").unwrap();
let request = Request::new(Method::GET, orig.clone());
let quirks = Quirks::default();
let modified = quirks.apply(request);
assert_eq!(modified.url(), &orig);
assert_eq!(modified.method(), Method::GET);
let url = Url::parse("https://endler.dev").unwrap();
let request = Request::new(Method::GET, url.clone());
let modified = Quirks::default().apply(request);
assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url));
}
}

View file

@ -0,0 +1,45 @@
use std::convert::TryFrom;
use reqwest::Url;
use crate::{ClientBuilder, ErrorKind, Request, Uri};
// Spins up a `wiremock` server that answers every GET request with the
// given status code. Any additional `method(args...)` pairs are chained
// onto the `ResponseTemplate`, so tests can customize the response
// (headers, delays, ...). Must be invoked inside an async context.
#[macro_export]
macro_rules! mock_server {
    ($status:expr $(, $func:tt ($($arg:expr),*))*) => {{
        let mock_server = wiremock::MockServer::start().await;
        let template = wiremock::ResponseTemplate::new(http::StatusCode::from($status));
        let template = template$(.$func($($arg),*))*;
        wiremock::Mock::given(wiremock::matchers::method("GET")).respond_with(template).mount(&mock_server).await;
        mock_server
    }};
}
// Check a single request with a default-configured client and unwrap
// the result. Panics on build/check failure, so it is only suitable
// for tests.
pub(crate) async fn get_mock_client_response<T, E>(request: T) -> crate::Response
where
    Request: TryFrom<T, Error = E>,
    ErrorKind: From<E>,
{
    ClientBuilder::default()
        .build()
        .unwrap()
        .check(request)
        .await
        .unwrap()
}
/// Helper method to convert a string into a URI
/// Note: This panics on error, so it should only be used for testing
pub(crate) fn website(url: &str) -> Uri {
    let parsed = Url::parse(url).expect("Expected valid Website URI");
    parsed.into()
}
/// Helper method to convert a mail address into a `mailto:` URI.
/// Note: This panics on error, so it should only be used for testing
pub(crate) fn mail(address: &str) -> Uri {
    let url = if address.starts_with("mailto:") {
        // Already has the scheme; parse as-is
        Url::parse(address)
    } else {
        // Prepend the mailto scheme before parsing
        Url::parse(&format!("mailto:{}", address))
    };
    url.expect("Expected valid Mail Address").into()
}

View file

@ -0,0 +1,163 @@
use std::{any::Any, convert::Infallible, fmt::Display, hash::Hash, path::PathBuf};
use http::header::InvalidHeaderValue;
use serde::{Serialize, Serializer};
use crate::Uri;
/// Kinds of status errors.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
    // TODO: maybe need to be splitted; currently first slot is Some only for reading files
    /// I/O error; the optional path is the file being read, if any
    IoError(Option<PathBuf>, std::io::Error),
    /// Network error from the underlying `reqwest` client
    ReqwestError(reqwest::Error),
    /// Error from the `hubcaps` GitHub client
    HubcapsError(hubcaps::Error),
    /// The given string could not be parsed as a URL; the second slot
    /// additionally carries the mail-parse error when the string was
    /// also tried (and failed) as a mail address
    UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
    /// A mail address that could not be reached
    UnreachableEmailAddress(Uri),
    /// An HTTP header value was invalid
    InvalidHeader(InvalidHeaderValue),
    /// A user-supplied glob pattern failed to compile
    InvalidGlobPattern(glob::PatternError),
    /// GitHub links can't be checked reliably without a token
    MissingGitHubToken,
}
// Manual `PartialEq`: several wrapped error types don't implement
// `PartialEq` themselves, so variants are compared by a cheap proxy
// (`io::ErrorKind`, rendered message, or structural fields).
impl PartialEq for ErrorKind {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            // I/O errors compare by path and error kind, not message
            (Self::IoError(p1, e1), Self::IoError(p2, e2)) => p1 == p2 && e1.kind() == e2.kind(),
            // Compare by rendered message, as these types aren't `PartialEq`
            (Self::ReqwestError(e1), Self::ReqwestError(e2)) => e1.to_string() == e2.to_string(),
            (Self::HubcapsError(e1), Self::HubcapsError(e2)) => e1.to_string() == e2.to_string(),
            (Self::UrlParseError(s1, e1), Self::UrlParseError(s2, e2)) => s1 == s2 && e1 == e2,
            (Self::UnreachableEmailAddress(u1), Self::UnreachableEmailAddress(u2)) => u1 == u2,
            (Self::InvalidGlobPattern(e1), Self::InvalidGlobPattern(e2)) => {
                e1.msg == e2.msg && e1.pos == e2.pos
            }
            // Any two values of these variants are considered equal,
            // regardless of the wrapped payload
            (Self::InvalidHeader(_), Self::InvalidHeader(_))
            | (Self::MissingGitHubToken, Self::MissingGitHubToken) => true,
            _ => false,
        }
    }
}

impl Eq for ErrorKind {}
// Manual `Hash` mirroring the manual `PartialEq` above: each variant
// hashes the same proxy value that `eq` compares, so that equal values
// produce equal hashes (the `Hash`/`Eq` contract).
impl Hash for ErrorKind {
    fn hash<H>(&self, state: &mut H)
    where
        H: std::hash::Hasher,
    {
        match self {
            Self::IoError(p, e) => (p, e.kind()).hash(state),
            Self::ReqwestError(e) => e.to_string().hash(state),
            Self::HubcapsError(e) => e.to_string().hash(state),
            Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
            Self::UnreachableEmailAddress(u) => u.hash(state),
            // `PartialEq` treats any two `InvalidHeader` values as equal,
            // so hashing the message would break the `Hash`/`Eq` contract
            // (equal values must hash equally). Hash the discriminant only.
            Self::InvalidHeader(_) => std::mem::discriminant(self).hash(state),
            Self::InvalidGlobPattern(e) => e.to_string().hash(state),
            Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
        }
    }
}
// Human-readable error messages; this is what ends up in user-facing
// output. Formatting is deliberately deferred until display time.
impl Display for ErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::IoError(Some(p), e) => write!(
                f,
                "Failed to read file: `{}`, reason: {}",
                // Paths aren't guaranteed to be valid UTF-8
                p.to_str().unwrap_or("<MALFORMED PATH>"),
                e
            ),
            // Without a path, just delegate to the wrapped error
            Self::IoError(None, e) => e.fmt(f),
            Self::ReqwestError(e) => e.fmt(f),
            Self::HubcapsError(e) => e.fmt(f),
            Self::UrlParseError(s, (url_err, Some(mail_err))) => {
                write!(
                    f,
                    "Cannot parse {} as website url ({}) or mail address ({})",
                    s, url_err, mail_err
                )
            }
            Self::UrlParseError(s, (url_err, None)) => {
                write!(f, "Cannot parse {} as website url ({})", s, url_err)
            }
            Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
            Self::InvalidHeader(e) => e.fmt(f),
            Self::InvalidGlobPattern(e) => e.fmt(f),
            Self::MissingGitHubToken => f.write_str(
                "GitHub token not specified. To check GitHub links reliably, \
                 use `--github-token` flag / `GITHUB_TOKEN` env var.",
            ),
        }
    }
}
// Serialize as the `Display` string rather than exposing the internal
// error structure, keeping serialized output stable and readable.
impl Serialize for ErrorKind {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.collect_str(self)
    }
}
// Reading a specific file failed; keep the path for the error message.
impl From<(PathBuf, std::io::Error)> for ErrorKind {
    fn from(value: (PathBuf, std::io::Error)) -> Self {
        Self::IoError(Some(value.0), value.1)
    }
}

// Generic I/O failure without an associated file path.
impl From<std::io::Error> for ErrorKind {
    fn from(e: std::io::Error) -> Self {
        Self::IoError(None, e)
    }
}

// A failed/cancelled tokio task is surfaced as a generic I/O error.
impl From<tokio::task::JoinError> for ErrorKind {
    fn from(e: tokio::task::JoinError) -> Self {
        Self::IoError(None, e.into())
    }
}

impl From<reqwest::Error> for ErrorKind {
    fn from(e: reqwest::Error) -> Self {
        Self::ReqwestError(e)
    }
}

impl From<hubcaps::errors::Error> for ErrorKind {
    fn from(e: hubcaps::Error) -> Self {
        Self::HubcapsError(e)
    }
}

// (input, URL parse error): the input failed to parse as a URL.
impl From<(String, url::ParseError)> for ErrorKind {
    fn from(value: (String, url::ParseError)) -> Self {
        Self::UrlParseError(value.0, (value.1, None))
    }
}

// (input, URL error, mail error): the input is neither a valid URL
// nor a valid mail address.
impl From<(String, url::ParseError, fast_chemail::ParseError)> for ErrorKind {
    fn from(value: (String, url::ParseError, fast_chemail::ParseError)) -> Self {
        Self::UrlParseError(value.0, (value.1, Some(value.2)))
    }
}

impl From<InvalidHeaderValue> for ErrorKind {
    fn from(e: InvalidHeaderValue) -> Self {
        Self::InvalidHeader(e)
    }
}

impl From<glob::PatternError> for ErrorKind {
    fn from(e: glob::PatternError) -> Self {
        Self::InvalidGlobPattern(e)
    }
}

// `Infallible` can never be constructed, so this conversion can never
// run; it exists only to satisfy generic trait bounds.
impl From<Infallible> for ErrorKind {
    fn from(_: Infallible) -> Self {
        // tautological
        unreachable!()
    }
}

View file

@ -0,0 +1,13 @@
#![allow(unreachable_pub)]
mod error;
mod request;
mod response;
mod status;
pub use error::ErrorKind;
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
/// Convenience alias for results whose error type is [`crate::ErrorKind`].
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;

View file

@ -0,0 +1,41 @@
use std::{convert::TryFrom, fmt::Display};
use crate::{ErrorKind, Input, Uri};
/// A checking request: the URI to check together with the input
/// (source) it was extracted from.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct Request {
    /// The URI to be checked
    pub uri: Uri,
    /// Where the URI was found
    pub source: Input,
}
impl Request {
    /// Create a new request from a URI and the input it originated from.
    #[inline]
    #[must_use]
    pub const fn new(uri: Uri, source: Input) -> Self {
        Request { uri, source }
    }
}
// Renders as `<uri> (<source>)`.
impl Display for Request {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.uri, self.source)
    }
}
// Build a request directly from a string; the string itself becomes
// the `Input::String` source. Fails if it isn't a valid URI.
impl TryFrom<String> for Request {
    type Error = ErrorKind;

    fn try_from(s: String) -> Result<Self, Self::Error> {
        // Parse first, then move the string into the source
        Ok(Request::new(Uri::try_from(s.as_str())?, Input::String(s)))
    }
}

impl TryFrom<&str> for Request {
    type Error = ErrorKind;

    fn try_from(s: &str) -> Result<Self, Self::Error> {
        Ok(Request::new(Uri::try_from(s)?, Input::String(s.to_owned())))
    }
}

View file

@ -0,0 +1,65 @@
use std::fmt::Display;
use serde::Serialize;
use crate::{Input, Status, Uri};
/// The result of checking a URI, pairing the input it came from with
/// the per-URI response body.
#[derive(Debug)]
pub struct Response(pub Input, pub ResponseBody);

impl Response {
    /// Create a response from a URI, its check status, and the input
    /// the URI originated from.
    #[inline]
    #[must_use]
    pub const fn new(uri: Uri, status: Status, source: Input) -> Self {
        Response(source, ResponseBody { uri, status })
    }

    /// The status of the check for this URI.
    #[inline]
    #[must_use]
    pub const fn status(&self) -> &Status {
        &self.1.status
    }
}
impl Display for Response {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
<ResponseBody as Display>::fmt(&self.1, f)
}
}
impl Serialize for Response {
    /// Serializes only the body; the input source is deliberately omitted.
    fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.1.serialize(s)
    }
}
/// The per-URI part of a [`Response`]: which URI was checked and what the
/// result was.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Serialize, Hash, PartialEq, Eq)]
pub struct ResponseBody {
    // Flattened so the URI serializes inline rather than as a nested object.
    #[serde(flatten)]
    pub uri: Uri,
    pub status: Status,
}
impl Display for ResponseBody {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ResponseBody {
ref uri,
ref status,
} = self;
// TODO: Other errors?
let metadata = match status {
Status::Ok(code) | Status::Redirected(code) => {
format!(" [{}]", code)
}
Status::Timeout(Some(code)) => format!(" [{}]", code),
Status::Error(e) => format!(" ({})", e),
_ => "".to_owned(),
};
write!(f, "{} {}{}", status.icon(), uri, metadata)
}
}

View file

@ -0,0 +1,127 @@
use std::{collections::HashSet, fmt::Display};
use http::StatusCode;
use reqwest::Response;
use serde::{Serialize, Serializer};
use crate::ErrorKind;
// Icons used when rendering a `Status` in human-readable output.
const ICON_OK: &str = "\u{2714}"; // ✔
const ICON_REDIRECTED: &str = "\u{21c4}"; // ⇄
const ICON_EXCLUDED: &str = "\u{003f}"; // ?
const ICON_ERROR: &str = "\u{2717}"; // ✗
const ICON_TIMEOUT: &str = "\u{29d6}"; // ⧖
/// Response status of the request.
#[allow(variant_size_differences)]
#[derive(Debug, Hash, PartialEq, Eq)]
pub enum Status {
    /// Request was successful
    Ok(StatusCode),
    /// Failed request
    // Boxed to keep the variant (and thus `Status`) small; `ErrorKind`
    // is a large enum.
    Error(Box<ErrorKind>),
    /// Request timed out
    Timeout(Option<StatusCode>),
    /// Got redirected to different resource
    Redirected(StatusCode),
    /// Resource was excluded from checking
    Excluded,
}
impl Display for Status {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Status::Ok(c) => write!(f, "OK ({})", c),
Status::Redirected(c) => write!(f, "Redirect ({})", c),
Status::Excluded => f.write_str("Excluded"),
Status::Error(e) => write!(f, "Failed: {}", e),
Status::Timeout(Some(c)) => write!(f, "Timeout ({})", c),
Status::Timeout(None) => f.write_str("Timeout"),
}
}
}
impl Serialize for Status {
    /// Serializes as the `Display` string (e.g. `"OK (200 OK)"`)
    /// rather than as a structured enum.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.collect_str(self)
    }
}
impl Status {
    /// Classify an HTTP response into a `Status`.
    ///
    /// If `accepted` is given and contains the response's status code, the
    /// response counts as `Ok` regardless of the code's class.
    ///
    /// NOTE(review): `error_for_status_ref` only returns `Err` for 4xx/5xx,
    /// and the `Ok(_)` guards cover 2xx/3xx — so a 1xx (informational)
    /// response would hit the `unreachable!()` and panic. Confirm such codes
    /// cannot reach this point in practice.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn new(response: &Response, accepted: Option<HashSet<StatusCode>>) -> Self {
        let code = response.status();
        // A user-supplied accept-list overrides normal classification.
        if let Some(true) = accepted.map(|a| a.contains(&code)) {
            Self::Ok(code)
        } else {
            match response.error_for_status_ref() {
                Ok(_) if code.is_success() => Self::Ok(code),
                Ok(_) if code.is_redirection() => Self::Redirected(code),
                Err(e) => e.into(),
                Ok(_) => unreachable!(),
            }
        }
    }
    /// True for `Status::Ok`.
    #[inline]
    #[must_use]
    pub const fn is_success(&self) -> bool {
        matches!(self, Status::Ok(_))
    }
    /// True for `Status::Error`.
    #[inline]
    #[must_use]
    pub const fn is_failure(&self) -> bool {
        matches!(self, Status::Error(_))
    }
    /// True for `Status::Excluded`.
    #[inline]
    #[must_use]
    pub const fn is_excluded(&self) -> bool {
        matches!(self, Status::Excluded)
    }
    /// True for `Status::Timeout`.
    #[inline]
    #[must_use]
    pub const fn is_timeout(&self) -> bool {
        matches!(self, Status::Timeout(_))
    }
    /// Icon for terminal output; see the `ICON_*` constants above.
    #[must_use]
    pub const fn icon(&self) -> &str {
        match self {
            Status::Ok(_) => ICON_OK,
            Status::Redirected(_) => ICON_REDIRECTED,
            Status::Excluded => ICON_EXCLUDED,
            Status::Error(_) => ICON_ERROR,
            Status::Timeout(_) => ICON_TIMEOUT,
        }
    }
}
impl From<ErrorKind> for Status {
fn from(e: ErrorKind) -> Self {
Self::Error(Box::new(e))
}
}
impl From<reqwest::Error> for Status {
    /// Timeouts become `Status::Timeout` (keeping any status code the
    /// error carries); everything else becomes a boxed `ReqwestError`.
    fn from(e: reqwest::Error) -> Self {
        if e.is_timeout() {
            return Self::Timeout(e.status());
        }
        ErrorKind::ReqwestError(e).into()
    }
}
impl From<hubcaps::Error> for Status {
fn from(e: hubcaps::Error) -> Self {
Self::Error(Box::new(e.into()))
}
}

214
lychee-lib/src/uri.rs Normal file
View file

@ -0,0 +1,214 @@
use std::{convert::TryFrom, fmt::Display, net::IpAddr};
use fast_chemail::parse_email;
use serde::{Deserialize, Serialize};
use url::Url;
use crate::{ErrorKind, Result};
/// Lychee's own representation of a URI, which encapsulates all support formats.
///
/// If the scheme is `mailto`, it's a mail address.
/// Otherwise it's treated as a website URL.
#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Uri {
    /// Website URL or mail address
    pub(crate) url: Url,
}
impl Uri {
    /// Returns the string representation of the `Uri`.
    ///
    /// If it's an email address, returns the string with scheme stripped.
    /// Otherwise returns the string as-is.
    #[inline]
    #[must_use]
    pub fn as_str(&self) -> &str {
        self.url.as_ref().trim_start_matches("mailto:")
    }

    /// The scheme of the underlying URL (e.g. `https` or `mailto`).
    #[inline]
    #[must_use]
    pub fn scheme(&self) -> &str {
        self.url.scheme()
    }

    /// The domain of the underlying URL, if it has one.
    #[inline]
    #[must_use]
    pub fn domain(&self) -> Option<&str> {
        self.url.domain()
    }

    /// The path segments of the underlying URL, if any.
    #[inline]
    #[must_use]
    pub fn path_segments(&self) -> Option<std::str::Split<char>> {
        self.url.path_segments()
    }

    /// The host as an IP address, or `None` when the host is a domain name
    /// (or absent).
    #[must_use]
    pub fn host_ip(&self) -> Option<IpAddr> {
        match self.url.host()? {
            url::Host::Domain(_) => None,
            url::Host::Ipv4(addr) => Some(IpAddr::V4(addr)),
            url::Host::Ipv6(addr) => Some(IpAddr::V6(addr)),
        }
    }

    // TODO: Support GitLab etc.
    /// Extract `(owner, repo)` when this URI points at a GitHub host.
    pub(crate) fn extract_github(&self) -> Option<(&str, &str)> {
        debug_assert!(
            !matches!(self.scheme(), "mailto"),
            "Should only be called on a Website type!"
        );
        // TODO: Support more patterns
        let domain = self.domain()?;
        if domain != "github.com"
            && domain != "www.github.com"
            && domain != "raw.githubusercontent.com"
        {
            return None;
        }
        // The first two path segments are owner and repository name.
        let mut segments = self.path_segments()?;
        match (segments.next(), segments.next()) {
            (Some(owner), Some(repo)) => Some((owner, repo)),
            _ => None,
        }
    }
}
impl AsRef<str> for Uri {
    /// Same stripping behavior as [`Uri::as_str`].
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}
impl From<Url> for Uri {
fn from(url: Url) -> Self {
Self { url }
}
}
impl TryFrom<String> for Uri {
type Error = ErrorKind;
fn try_from(s: String) -> Result<Self> {
let s = s.trim_start_matches("mailto:");
if let Err(mail_err) = parse_email(s) {
match Url::parse(s) {
Ok(uri) => Ok(uri.into()),
Err(url_err) => Err((s.to_owned(), url_err, mail_err).into()),
}
} else {
Ok(Url::parse(&(String::from("mailto:") + s)).unwrap().into())
}
}
}
impl TryFrom<&str> for Uri {
type Error = ErrorKind;
fn try_from(s: &str) -> Result<Self> {
let s = s.trim_start_matches("mailto:");
if let Err(mail_err) = parse_email(s) {
match Url::parse(s) {
Ok(uri) => Ok(uri.into()),
Err(url_err) => Err((s.to_owned(), url_err, mail_err).into()),
}
} else {
Ok(Url::parse(&(String::from("mailto:") + s)).unwrap().into())
}
}
}
impl Display for Uri {
    /// Uses [`Uri::as_str`], so `mailto:` prefixes are stripped.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
#[cfg(test)]
mod test {
    use std::{
        convert::TryFrom,
        net::{IpAddr, Ipv4Addr, Ipv6Addr},
    };
    use pretty_assertions::assert_eq;
    use super::Uri;
    use crate::test_utils::{mail, website};

    /// Parsing round-trips for websites and mail addresses
    /// (with and without `mailto:`).
    #[test]
    fn test_uri_from_str() {
        assert!(Uri::try_from("").is_err());
        assert_eq!(
            Uri::try_from("http://example.org"),
            Ok(website("http://example.org"))
        );
        assert_eq!(
            Uri::try_from("http://example.org/@test/testing"),
            Ok(website("http://example.org/@test/testing"))
        );
        assert_eq!(
            Uri::try_from("mail@example.org"),
            Ok(mail("mail@example.org"))
        );
        assert_eq!(
            Uri::try_from("mailto:mail@example.org"),
            Ok(mail("mail@example.org"))
        );
    }
    #[test]
    fn test_uri_host_ip_v4() {
        assert_eq!(
            website("http://127.0.0.1").host_ip(),
            Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)))
        );
    }
    #[test]
    fn test_uri_host_ip_v6() {
        assert_eq!(
            website("https://[2020::0010]").host_ip(),
            Some(IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10)))
        );
    }
    #[test]
    fn test_uri_host_ip_no_ip() {
        assert!(website("https://some.cryptic/url").host_ip().is_none());
    }
    // TODO(review): this duplicates `test_uri_host_ip_v4` — looks like a
    // copy-paste; it should probably assert mail-specific behavior instead.
    #[test]
    fn test_mail() {
        assert_eq!(
            website("http://127.0.0.1").host_ip(),
            Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)))
        );
    }
    /// `extract_github` accepts github.com / www.github.com hosts and
    /// rejects URLs that merely contain "github.com" in the path.
    #[test]
    fn test_is_github() {
        assert_eq!(
            website("http://github.com/lycheeverse/lychee").extract_github(),
            Some(("lycheeverse", "lychee"))
        );
        assert_eq!(
            website("http://www.github.com/lycheeverse/lychee").extract_github(),
            Some(("lycheeverse", "lychee"))
        );
        assert_eq!(
            website("https://github.com/lycheeverse/lychee").extract_github(),
            Some(("lycheeverse", "lychee"))
        );
        assert!(
            website("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath")
                .extract_github()
                .is_none()
        );
    }
}

1
rust-toolchain Normal file
View file

@ -0,0 +1 @@
stable

View file

@ -1,182 +0,0 @@
use console::style;
use pad::{Alignment, PadStr};
use serde::Serialize;
use std::{
collections::{HashMap, HashSet},
fmt::{self, Display},
};
use lychee::{self, collector::Input, Response, Status};
// Maximum padding for each entry in the final statistics output
const MAX_PADDING: usize = 20;
/// Render a response with a terminal color matching its status
/// (green = ok, yellow = timeout, red = error, dim = excluded).
pub fn color_response(response: &Response) -> String {
    let styled = match response.status {
        Status::Ok(_) => style(response).green().bright(),
        Status::Timeout(_) => style(response).yellow().bright(),
        Status::Error(_, _) => style(response).red().bright(),
        Status::Excluded => style(response).dim(),
        Status::Redirected(_) => style(response),
    };
    format!("{}", styled)
}
/// Aggregated counters over all checked links, plus the failing responses
/// grouped by the input they came from.
#[derive(Serialize)]
pub struct ResponseStats {
    total: usize,
    successful: usize,
    failures: usize,
    timeouts: usize,
    redirects: usize,
    // NOTE(review): `errors` is never incremented in this file; it is only
    // added to `failures` for display. Verify whether it is set elsewhere.
    excludes: usize,
    errors: usize,
    // Failed/timeout/redirected responses keyed by their source input.
    fail_map: HashMap<Input, HashSet<Response>>,
}
impl ResponseStats {
    /// Create an empty statistics accumulator.
    pub fn new() -> Self {
        ResponseStats {
            total: 0,
            successful: 0,
            failures: 0,
            timeouts: 0,
            redirects: 0,
            excludes: 0,
            errors: 0,
            fail_map: HashMap::new(),
        }
    }

    /// Record one response in the counters; errors, timeouts and redirects
    /// are additionally kept in `fail_map` for the final report.
    pub fn add(&mut self, response: Response) {
        self.total += 1;
        let needs_report = matches!(
            response.status,
            Status::Error(_, _) | Status::Timeout(_) | Status::Redirected(_)
        );
        match response.status {
            Status::Error(_, _) => self.failures += 1,
            Status::Timeout(_) => self.timeouts += 1,
            Status::Redirected(_) => self.redirects += 1,
            Status::Excluded => self.excludes += 1,
            _ => self.successful += 1,
        }
        if needs_report {
            self.fail_map
                .entry(response.source.clone())
                .or_default()
                .insert(response);
        }
    }

    /// True when every non-excluded response was successful.
    pub fn is_success(&self) -> bool {
        self.total == self.successful + self.excludes
    }

    /// True when nothing has been recorded yet.
    pub fn is_empty(&self) -> bool {
        self.total == 0
    }
}
/// Write one `title....value` summary row, padding the value with dots up
/// to `MAX_PADDING` columns (minus the title width).
fn write_stat(f: &mut fmt::Formatter, title: &str, stat: usize, newline: bool) -> fmt::Result {
    let fill = title.chars().count();
    let padded = stat
        .to_string()
        .pad(MAX_PADDING - fill, '.', Alignment::Right, false);
    f.write_str(title)?;
    f.write_str(&padded)?;
    if newline {
        f.write_str("\n")?;
    }
    Ok(())
}
impl Display for ResponseStats {
    /// Render the summary table followed by per-input failure listings.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let separator = "-".repeat(MAX_PADDING + 1);
        writeln!(f, "📝 Summary")?;
        writeln!(f, "{}", separator)?;
        write_stat(f, "🔍 Total", self.total, true)?;
        write_stat(f, "✅ Successful", self.successful, true)?;
        write_stat(f, "⏳ Timeouts", self.timeouts, true)?;
        write_stat(f, "🔀 Redirected", self.redirects, true)?;
        write_stat(f, "👻 Excluded", self.excludes, true)?;
        write_stat(f, "🚫 Errors", self.errors + self.failures, false)?;
        for (input, responses) in &self.fail_map {
            // Using leading newlines over trailing ones (e.g. `writeln!`)
            // lets us avoid extra newlines without any additional logic.
            write!(f, "\n\nErrors in {}", input)?;
            for response in responses {
                write!(f, "\n{}", color_response(response))?
            }
        }
        Ok(())
    }
}
#[cfg(test)]
mod test_super {
    use lychee::{test_utils::website, Status};
    use super::*;
    use pretty_assertions::assert_eq;

    /// `is_empty` flips to false after the first recorded response.
    #[test]
    fn test_stats_is_empty() {
        let mut stats = ResponseStats::new();
        assert!(stats.is_empty());
        stats.add(Response {
            uri: website("http://example.org/ok"),
            status: Status::Ok(http::StatusCode::OK),
            source: Input::Stdin,
        });
        assert!(!stats.is_empty());
    }

    /// Failed and redirected responses end up in `fail_map`,
    /// grouped under their source input; successes do not.
    #[test]
    fn test_stats() {
        let mut stats = ResponseStats::new();
        stats.add(Response {
            uri: website("http://example.org/ok"),
            status: Status::Ok(http::StatusCode::OK),
            source: Input::Stdin,
        });
        stats.add(Response {
            uri: website("http://example.org/failed"),
            status: Status::Error("".to_string(), Some(http::StatusCode::BAD_GATEWAY)),
            source: Input::Stdin,
        });
        stats.add(Response {
            uri: website("http://example.org/redirect"),
            status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
            source: Input::Stdin,
        });
        let mut expected_map = HashMap::new();
        expected_map.insert(
            Input::Stdin,
            vec![
                Response {
                    uri: website("http://example.org/failed"),
                    status: Status::Error("".to_string(), Some(http::StatusCode::BAD_GATEWAY)),
                    source: Input::Stdin,
                },
                Response {
                    uri: website("http://example.org/redirect"),
                    status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
                    source: Input::Stdin,
                },
            ]
            .into_iter()
            .collect::<HashSet<_>>(),
        );
        assert_eq!(stats.fail_map, expected_map);
    }
}

View file

@ -1,519 +0,0 @@
use anyhow::{anyhow, bail, Context, Result};
use check_if_email_exists::{check_email, CheckEmailInput};
use derive_builder::Builder;
use headers::{HeaderMap, HeaderValue};
use hubcaps::{Credentials, Github};
use regex::{Regex, RegexSet};
use reqwest::header;
use std::convert::TryInto;
use std::{collections::HashSet, time::Duration};
use tokio::time::sleep;
use url::Url;
use crate::filter::Excludes;
use crate::filter::Filter;
use crate::filter::Includes;
use crate::quirks::Quirks;
use crate::types::{Response, Status};
use crate::uri::Uri;
use crate::Request;
// Crate version, baked in at compile time; used for the default user agent.
const VERSION: &str = env!("CARGO_PKG_VERSION");
// Fallback redirect limit when the builder doesn't specify one.
const DEFAULT_MAX_REDIRECTS: usize = 5;
/// The configured link checker; build one via `ClientBuilder`.
#[derive(Debug, Clone)]
pub struct Client {
    /// The underlying reqwest client instance that handles the HTTP requests
    reqwest_client: reqwest::Client,
    /// Github API client
    github: Option<Github>,
    /// Filtered domain handling
    filter: Filter,
    /// The default request HTTP method to use
    method: reqwest::Method,
    /// The set of accepted HTTP status codes for valid URIs
    accepted: Option<HashSet<reqwest::StatusCode>>,
    /// Override behavior for certain known issues with URIs
    quirks: Quirks,
}
/// A link checker using an API token for Github links
/// otherwise a normal HTTP client.
// `derive_builder` generates the public `ClientBuilder` type from this
// struct; every field is wrapped in `Option` on the builder side.
#[derive(Builder, Debug)]
#[builder(build_fn(skip))]
#[builder(setter(into))]
#[builder(name = "ClientBuilder")]
pub struct ClientBuilderInternal {
    /// Set an optional Github token.
    /// This allows for more requests before
    /// getting rate-limited.
    github_token: Option<String>,
    /// Check links matching this set of regular expressions
    includes: Option<RegexSet>,
    /// Exclude links matching this set of regular expressions
    excludes: Option<RegexSet>,
    /// Exclude all private network addresses
    exclude_all_private: bool,
    /// Exclude private IP addresses
    exclude_private_ips: bool,
    /// Exclude link-local IPs
    exclude_link_local_ips: bool,
    /// Exclude loopback IP addresses (e.g. 127.0.0.1)
    exclude_loopback_ips: bool,
    /// Don't check mail addresses
    exclude_mail: bool,
    /// Maximum number of redirects before returning error
    max_redirects: usize,
    /// User agent used for checking links
    user_agent: String,
    /// Ignore SSL errors
    allow_insecure: bool,
    /// Allowed URI scheme (e.g. https, http).
    /// This excludes all links from checking, which
    /// don't specify that scheme in the URL.
    scheme: Option<String>,
    /// Map of headers to send to each resource.
    /// This allows working around validation issues
    /// on some websites.
    custom_headers: HeaderMap,
    /// Request method (e.g. `GET` or `HEAD`)
    method: reqwest::Method,
    /// Set of accepted return codes / status codes
    accepted: Option<HashSet<http::StatusCode>>,
    /// Response timeout per request
    timeout: Option<Duration>,
}
impl ClientBuilder {
    /// Assemble the exclude rules from the individual builder flags.
    ///
    /// Note: `derive_builder` wraps every field in an extra `Option`,
    /// hence the `unwrap_or_default()` calls on the raw builder fields.
    fn build_excludes(&mut self) -> Excludes {
        // exclude_all_private option turns on all "private" excludes,
        // including private IPs, link-local IPs and loopback IPs
        let enable_exclude = |opt| opt || self.exclude_all_private.unwrap_or_default();
        Excludes {
            regex: self.excludes.clone().unwrap_or_default(),
            private_ips: enable_exclude(self.exclude_private_ips.unwrap_or_default()),
            link_local_ips: enable_exclude(self.exclude_link_local_ips.unwrap_or_default()),
            loopback_ips: enable_exclude(self.exclude_loopback_ips.unwrap_or_default()),
            mail: self.exclude_mail.unwrap_or_default(),
        }
    }

    /// Assemble the include rules from the builder's regex set.
    fn build_includes(&mut self) -> Includes {
        Includes {
            regex: self.includes.clone().unwrap_or_default(),
        }
    }

    /// The build method instantiates the client.
    pub fn build(&mut self) -> Result<Client> {
        let mut headers = HeaderMap::new();
        // Faking the user agent is necessary for some websites, unfortunately.
        // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
        let user_agent = self
            .user_agent
            .clone()
            .unwrap_or_else(|| format!("lychee/{}", VERSION));
        headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?);
        headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
        if let Some(custom) = &self.custom_headers {
            headers.extend(custom.clone());
        }
        let allow_insecure = self.allow_insecure.unwrap_or(false);
        let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS);
        let builder = reqwest::ClientBuilder::new()
            .gzip(true)
            .default_headers(headers)
            .danger_accept_invalid_certs(allow_insecure)
            .redirect(reqwest::redirect::Policy::limited(max_redirects));
        // `timeout` is doubly wrapped (builder `Option` + field `Option`);
        // only apply it when the user actually set one.
        let builder = match self.timeout {
            Some(t) => builder
                .timeout(t.ok_or_else(|| anyhow!("cannot parse timeout: {:?}", self.timeout))?),
            None => builder,
        };
        let reqwest_client = builder.build()?;
        // An empty token string is treated the same as no token at all.
        let token: Option<String> = self.github_token.clone().unwrap_or_default();
        let github = match token {
            Some(token) => {
                if token.is_empty() {
                    None
                } else {
                    let github = Github::new(user_agent, Credentials::Token(token))?;
                    Some(github)
                }
            }
            None => None,
        };
        // Scheme comparison downstream is case-sensitive, so normalize once.
        let scheme = self.scheme.clone().unwrap_or(None);
        let scheme = scheme.map(|s| s.to_lowercase());
        let includes = self.build_includes();
        let excludes = self.build_excludes();
        let filter = Filter::new(Some(includes), Some(excludes), scheme);
        let quirks = Quirks::default();
        Ok(Client {
            reqwest_client,
            github,
            filter,
            quirks,
            method: self.method.clone().unwrap_or(reqwest::Method::GET),
            accepted: self.accepted.clone().unwrap_or(None),
        })
    }
}
impl Client {
    /// Check anything convertible into a `Request`.
    ///
    /// Excluded URIs short-circuit with `Status::Excluded` and never hit
    /// the network.
    pub async fn check<T: TryInto<Request>>(&self, request: T) -> Result<Response> {
        let request: Request = match request.try_into() {
            Ok(request) => request,
            Err(_e) => bail!("Invalid URI"),
        };
        if self.filter.excluded(&request) {
            return Ok(Response::new(request.uri, Status::Excluded, request.source));
        }
        let status = self.check_main(&request).await?;
        Ok(Response::new(request.uri, status, request.source))
    }

    /// Dispatch on URI kind: websites over HTTP, mail via the mail checker.
    async fn check_main(&self, request: &Request) -> Result<Status> {
        Ok(match request.uri {
            Uri::Website(ref url) => self.check_website(&url).await,
            Uri::Mail(ref address) => {
                // TODO: We should not be using a HTTP status code for mail
                match self.check_mail(&address).await {
                    true => Status::Ok(http::StatusCode::OK),
                    false => Status::Error(format!("Invalid mail address: {}", address), None),
                }
            }
        })
    }

    /// Check a website URL with up to 3 retries and exponential backoff
    /// (1s, 2s, 4s). Failed GitHub URLs get one extra API-based attempt.
    pub async fn check_website(&self, url: &Url) -> Status {
        let mut retries: i64 = 3;
        let mut wait: u64 = 1;
        let status = loop {
            let res = self.check_default(&url).await;
            match res.is_success() {
                true => return res,
                false => {
                    if retries > 0 {
                        retries -= 1;
                        sleep(Duration::from_secs(wait)).await;
                        wait *= 2;
                    } else {
                        break res;
                    }
                }
            }
        };
        // Pull out the heavy weapons in case of a failed normal request.
        // This could be a Github URL and we run into the rate limiter.
        if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
            return self.check_github(owner, repo).await;
        }
        status
    }

    /// Check a repository through the GitHub API; requires a configured token.
    async fn check_github(&self, owner: String, repo: String) -> Status {
        match &self.github {
            Some(github) => {
                let repo = github.repo(owner, repo).get().await;
                match repo {
                    Err(e) => Status::Error(e.to_string(), None),
                    Ok(_) => Status::Ok(http::StatusCode::OK),
                }
            }
            None => Status::Error(
                "GitHub token not specified. To check GitHub links reliably, \
                 use `--github-token` flag / `GITHUB_TOKEN` env var."
                    .to_string(),
                None,
            ),
        }
    }

    /// Fire a single HTTP request (no retries) and classify the outcome.
    async fn check_default(&self, url: &Url) -> Status {
        let request = match self
            .reqwest_client
            .request(self.method.clone(), url.to_owned())
            .build()
        {
            Ok(r) => r,
            Err(e) => return e.into(),
        };
        // Apply workarounds for known-quirky hosts before sending.
        let request = self.quirks.apply(request);
        match self.reqwest_client.execute(request).await {
            Ok(response) => Status::new(response.status(), self.accepted.clone()),
            Err(e) => e.into(),
        }
    }

    /// Extract `(owner, repo)` from a GitHub URL via a regex match.
    // NOTE(review): the regex is recompiled on every call; consider
    // hoisting it (the library refactor replaces this with URL parsing).
    fn extract_github(&self, url: &str) -> Result<(String, String)> {
        let re = Regex::new(r#"^(https?://)?(www.)?github.com/(?P<owner>[^/]*)/(?P<repo>[^/]*)"#)?;
        let caps = re.captures(&url).context("Invalid capture")?;
        let owner = caps.name("owner").context("Cannot capture owner")?;
        let repo = caps.name("repo").context("Cannot capture repo")?;
        Ok((owner.as_str().into(), repo.as_str().into()))
    }

    /// Verify a mail address; everything not explicitly invalid is accepted.
    pub async fn check_mail(&self, address: &str) -> bool {
        let input = CheckEmailInput::new(vec![address.to_string()]);
        let results = check_email(&input).await;
        let result = results.get(0);
        match result {
            None => false,
            Some(result) => {
                // Accept everything that is not invalid
                !matches!(
                    result.is_reachable,
                    check_if_email_exists::Reachable::Invalid
                )
            }
        }
    }
}
/// A convenience function to check a single URI.
///
/// This is the most simple link check and avoids having to create a client
/// manually. For more complex scenarios, look into using the `ClientBuilder`
/// instead.
///
/// # Errors
///
/// Returns an error when the default client cannot be built or the check
/// itself fails.
pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {
    let client = ClientBuilder::default().build()?;
    // The result type already matches — no need to re-wrap with `Ok(..?)`.
    client.check(request).await
}
#[cfg(test)]
mod test {
    use super::*;
    use pretty_assertions::assert_eq;
    use std::time::{Duration, Instant};
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// A 404 from a local mock server is reported as a failure.
    #[tokio::test]
    async fn test_nonexistent() {
        let template = ResponseTemplate::new(404);
        let mock_server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&mock_server)
            .await;
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check(mock_server.uri())
            .await
            .unwrap();
        assert!(res.status.is_failure());
    }

    /// An unreachable local path is reported as a failure.
    #[tokio::test]
    async fn test_nonexistent_with_path() {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check("http://127.0.0.1/invalid")
            .await
            .unwrap();
        assert!(res.status.is_failure());
    }

    /// With 3 retries and waits of 1+2+4s, a persistent 404 should take
    /// roughly 7 seconds before giving up.
    #[tokio::test]
    async fn test_exponential_backoff() {
        let template = ResponseTemplate::new(404);
        let mock_server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&mock_server)
            .await;
        let start = Instant::now();
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check(mock_server.uri())
            .await
            .unwrap();
        let end = start.elapsed();
        assert!(matches!(res.status, Status::Error(_, _)));
        // on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs)
        assert!(end.as_secs() >= 7);
        assert!(end.as_secs() <= 8);
    }

    /// The GitHub regex matches with/without scheme and `www.`, and must
    /// not match URLs that merely contain "github.com" in the path.
    #[test]
    fn test_is_github() {
        assert_eq!(
            ClientBuilder::default()
                .build()
                .unwrap()
                .extract_github("github.com/lycheeverse/lychee")
                .unwrap(),
            ("lycheeverse".into(), "lychee".into())
        );
        assert_eq!(
            ClientBuilder::default()
                .build()
                .unwrap()
                .extract_github("www.github.com/lycheeverse/lychee")
                .unwrap(),
            ("lycheeverse".into(), "lychee".into())
        );
        assert_eq!(
            ClientBuilder::default()
                .build()
                .unwrap()
                .extract_github("https://github.com/lycheeverse/lychee")
                .unwrap(),
            ("lycheeverse".into(), "lychee".into())
        );
        assert!(ClientBuilder::default()
            .build()
            .unwrap()
            .extract_github("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath")
            .is_err());
    }

    // The following tests hit the real network and may be flaky offline.
    #[tokio::test]
    async fn test_github() {
        assert!(ClientBuilder::default()
            .build()
            .unwrap()
            .check("https://github.com/lycheeverse/lychee")
            .await
            .unwrap()
            .status
            .is_success());
    }

    #[tokio::test]
    async fn test_github_nonexistent() {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check("https://github.com/lycheeverse/not-lychee")
            .await
            .unwrap()
            .status;
        assert!(res.is_failure());
    }

    #[tokio::test]
    async fn test_youtube() {
        // This is applying a quirk. See the quirks module.
        let client: Client = ClientBuilder::default().build().unwrap();
        assert!(client.check("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7")
            .await
            .unwrap()
            .status.is_success());
        assert!(client.check("https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7")
            .await
            .unwrap()
            .status.is_failure());
    }

    /// A plain 200 from a non-GitHub host succeeds without the API path.
    #[tokio::test]
    async fn test_non_github() {
        let template = ResponseTemplate::new(200);
        let mock_server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&mock_server)
            .await;
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check(mock_server.uri())
            .await
            .unwrap()
            .status;
        assert!(res.is_success());
    }

    /// Expired certs fail by default but pass with `allow_insecure`.
    #[tokio::test]
    async fn test_invalid_ssl() {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check("https://expired.badssl.com/")
            .await
            .unwrap();
        assert!(res.status.is_failure());
        // Same, but ignore certificate error
        let res = ClientBuilder::default()
            .allow_insecure(true)
            .build()
            .unwrap()
            .check("https://expired.badssl.com/")
            .await
            .unwrap();
        assert!(res.status.is_success());
    }

    #[tokio::test]
    async fn test_custom_headers() {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
            .check("https://crates.io/crates/lychee")
            .await
            .unwrap();
        assert!(res.status.is_failure());
        // Try again, but with a custom header.
        // For example, crates.io requires a custom accept header.
        // See https://github.com/rust-lang/crates.io/issues/788
        let mut custom = HeaderMap::new();
        custom.insert(header::ACCEPT, "text/html".parse().unwrap());
        let res = ClientBuilder::default()
            .custom_headers(custom)
            .build()
            .unwrap()
            .check("https://crates.io/crates/lychee")
            .await
            .unwrap();
        assert!(res.status.is_success());
    }

    #[tokio::test]
    async fn test_timeout() {
        // Note: this checks response timeout, not connect timeout.
        // To check connect timeout, we'd have to do something more involved,
        // see: https://github.com/LukeMathWalker/wiremock-rs/issues/19
        let mock_delay = Duration::from_millis(20);
        let checker_timeout = Duration::from_millis(10);
        assert!(mock_delay > checker_timeout);
        let template = ResponseTemplate::new(200).set_delay(mock_delay);
        let mock_server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&mock_server)
            .await;
        let client = ClientBuilder::default()
            .timeout(checker_timeout)
            .build()
            .unwrap();
        let resp = client.check(mock_server.uri()).await.unwrap();
        assert!(matches!(resp.status, Status::Timeout(_)));
    }
}

View file

@ -1,32 +0,0 @@
use regex::RegexSet;
/// Include configuration for the link checker.
/// You can include links based on regex patterns.
// `Default` is derived instead of hand-written: the derived impl produces
// `regex: None`, exactly what the manual `impl Default` did.
#[derive(Clone, Debug, Default)]
pub struct Includes {
    /// Optional set of include patterns; `None` means no include rules.
    pub regex: Option<RegexSet>,
}
impl Includes {
pub fn regex(&self, input: &str) -> bool {
if let Some(includes) = &self.regex {
if includes.is_match(input) {
return true;
}
}
false
}
pub fn is_empty(&self) -> bool {
match &self.regex {
None => true,
Some(regex_set) => regex_set.is_empty(),
}
}
}

View file

@ -1,315 +0,0 @@
mod excludes;
mod includes;
pub use excludes::Excludes;
pub use includes::Includes;
use crate::uri::Uri;
use crate::Request;
/// A generic URI filter
/// Used to decide if a given URI should be checked or skipped
#[derive(Clone, Debug)]
pub struct Filter {
    // Regex patterns that force-include matching URIs.
    includes: Includes,
    // Regex/IP/mail rules that exclude matching URIs.
    excludes: Excludes,
    // When set, only URIs with exactly this scheme are checked.
    scheme: Option<String>,
}
impl Filter {
    /// Create a new filter from optional include/exclude rules and an
    /// optional allowed URI scheme.
    pub fn new(
        includes: Option<Includes>,
        excludes: Option<Excludes>,
        scheme: Option<String>,
    ) -> Self {
        // `unwrap_or_default` replaces the previous `match Some/None`
        // blocks; both `Includes` and `Excludes` implement `Default`.
        Filter {
            includes: includes.unwrap_or_default(),
            excludes: excludes.unwrap_or_default(),
            scheme,
        }
    }

    /// Decide whether `request` should be skipped.
    ///
    /// Precedence: mail/IP excludes, then include patterns (which override
    /// the built-in false-positive list and regex excludes), then the
    /// false-positive list, regex excludes, and finally the scheme filter.
    pub fn excluded(&self, request: &Request) -> bool {
        // Skip mail?
        if self.excludes.is_mail_excluded() && matches!(request.uri, Uri::Mail(_)) {
            return true;
        }
        // Skip specific IP address?
        if self.excludes.ip(&request.uri) {
            return true;
        }
        // No regex includes/excludes at all?
        if self.includes.is_empty() && self.excludes.is_empty() {
            // Not excluded unless it's a known false positive
            return self.excludes.false_positive(request.uri.as_str());
        }
        // Includes take precedence over excludes
        if self.includes.regex(request.uri.as_str()) {
            return false;
        }
        // Exclude well-known false-positives.
        // This is done after checking includes to allow for user-overwrites.
        if self.excludes.false_positive(request.uri.as_str()) {
            return true;
        }
        // In case we have includes and no excludes,
        // skip everything that was not included
        if !self.includes.is_empty() && self.excludes.is_empty() {
            return true;
        }
        // We have no includes. Check regex excludes
        if self.excludes.regex(request.uri.as_str()) {
            return true;
        }
        // URI scheme excluded?
        if self.scheme.is_none() {
            return false;
        }
        request.uri.scheme() != self.scheme
    }
}
#[cfg(test)]
mod test {
// Note: the standard library as of Rust stable 1.47.0 does not expose
// "link-local" or "private" IPv6 checks. However, one might argue
// that these concepts do exist in IPv6, albeit the naming is different.
// See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
// See: https://en.wikipedia.org/wiki/Private_network#IPv6
// See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
const V4_LOOPBACK: &str = "http://127.0.0.1";
const V6_LOOPBACK: &str = "http://[::1]";
const V4_LINK_LOCAL: &str = "http://169.254.0.1";
// IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
use regex::RegexSet;
use reqwest::Url;
use super::*;
use pretty_assertions::assert_eq;
use crate::{test_utils::website, Input};
/// Helper method to convert a string into a Request
/// Note: This panics on error, so it should only be used for testing
pub fn request(url: &str) -> Request {
Request::new(website(url), Input::Stdin)
}
#[test]
fn test_const_sanity() {
let get_host = |s| {
Url::parse(s)
.expect("Expected valid URL")
.host()
.expect("Expected host address")
.to_owned()
};
let into_v4 = |host| match host {
url::Host::Ipv4(ipv4) => ipv4,
_ => panic!("Not IPv4"),
};
let into_v6 = |host| match host {
url::Host::Ipv6(ipv6) => ipv6,
_ => panic!("Not IPv6"),
};
assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private());
assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private());
assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private());
assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback());
assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback());
assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local());
}
#[test]
fn test_includes_and_excludes_empty() {
// This is the pre-configured, empty set of excludes for a client
// In this case, only the requests matching the include set will be checked
let includes = Some(Includes::default());
let excludes = Some(Excludes::default());
let filter = Filter::new(includes, excludes, None);
assert_eq!(filter.excluded(&request("https://example.org")), false);
}
#[test]
fn test_false_positives() {
let includes = Some(Includes::default());
let excludes = Some(Excludes::default());
let filter = Filter::new(includes, excludes, None);
assert_eq!(
filter.excluded(&request("http://www.w3.org/1999/xhtml")),
true
);
assert_eq!(filter.excluded(&request("https://example.org")), false);
}
#[test]
fn test_overwrite_false_positives() {
let includes = Some(Includes {
regex: Some(RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap()),
});
let excludes = Some(Excludes::default());
let filter = Filter::new(includes, excludes, None);
assert_eq!(
filter.excluded(&request("http://www.w3.org/1999/xhtml")),
false
);
}
#[test]
fn test_include_regex() {
let includes = Some(Includes {
regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
});
let filter = Filter::new(includes, None, None);
// Only the requests matching the include set will be checked
assert_eq!(filter.excluded(&request("https://foo.example.org")), false);
assert_eq!(filter.excluded(&request("https://bar.example.org")), true);
assert_eq!(filter.excluded(&request("https://example.org")), true);
}
#[test]
fn test_exclude_mail() {
    // With `mail: true`, all mail addresses are excluded while regular
    // website URLs are still checked.
    let excludes = Excludes {
        mail: true,
        ..Default::default()
    };
    let filter = Filter::new(None, Some(excludes), None);
    assert!(filter.excluded(&Request::new(
        Uri::Mail("mail@example.org".to_string()),
        Input::Stdin,
    )));
    assert!(filter.excluded(&Request::new(
        Uri::Mail("foo@bar.dev".to_string()),
        Input::Stdin,
    )));
    assert!(!filter.excluded(&request("http://bar.dev")));
}
#[test]
fn test_exclude_regex() {
    // Exclude patterns apply to websites and mail addresses alike.
    let excludes = Excludes {
        regex: Some(
            RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(),
        ),
        ..Default::default()
    };
    let filter = Filter::new(None, Some(excludes), None);
    assert!(filter.excluded(&request("http://github.com")));
    assert!(filter.excluded(&request("http://exclude.org")));
    assert!(filter.excluded(&Request::new(
        Uri::Mail("mail@example.org".to_string()),
        Input::Stdin,
    )));
    assert!(!filter.excluded(&request("http://bar.dev")));
    assert!(!filter.excluded(&Request::new(
        Uri::Mail("foo@bar.dev".to_string()),
        Input::Stdin,
    )));
}
#[test]
fn test_exclude_include_regex() {
    let includes = Some(Includes {
        regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
    });
    let excludes = Excludes {
        regex: Some(RegexSet::new(&[r"example.org"]).unwrap()),
        ..Default::default()
    };
    let filter = Filter::new(includes, Some(excludes), None);
    // Includes take preference over excludes
    assert!(!filter.excluded(&request("https://foo.example.org")));
    assert!(filter.excluded(&request("https://example.org")));
    assert!(filter.excluded(&request("https://bar.example.org")));
}
#[test]
fn test_excludes_no_private_ips_by_default() {
    // Without any configuration, private/link-local/loopback addresses
    // are NOT excluded.
    let filter = Filter::new(None, None, None);
    assert!(!filter.excluded(&request(V4_PRIVATE_CLASS_A)));
    assert!(!filter.excluded(&request(V4_PRIVATE_CLASS_B)));
    assert!(!filter.excluded(&request(V4_PRIVATE_CLASS_C)));
    assert!(!filter.excluded(&request(V4_LINK_LOCAL)));
    assert!(!filter.excluded(&request(V4_LOOPBACK)));
    assert!(!filter.excluded(&request(V6_LOOPBACK)));
}
#[test]
fn test_exclude_private_ips() {
    let mut filter = Filter::new(None, None, None);
    filter.excludes.private_ips = true;
    assert!(filter.excluded(&request(V4_PRIVATE_CLASS_A)));
    assert!(filter.excluded(&request(V4_PRIVATE_CLASS_B)));
    assert!(filter.excluded(&request(V4_PRIVATE_CLASS_C)));
}
#[test]
fn test_exclude_link_local() {
    let mut filter = Filter::new(None, None, None);
    filter.excludes.link_local_ips = true;
    assert!(filter.excluded(&request(V4_LINK_LOCAL)));
}
#[test]
fn test_exclude_loopback() {
    let mut filter = Filter::new(None, None, None);
    filter.excludes.loopback_ips = true;
    assert!(filter.excluded(&request(V4_LOOPBACK)));
    assert!(filter.excluded(&request(V6_LOOPBACK)));
}
#[test]
fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
    let mut filter = Filter::new(None, None, None);
    filter.excludes.private_ips = true;
    filter.excludes.link_local_ips = true;
    // if these were pure IPv4, we would exclude
    assert!(!filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)));
    assert!(!filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)));
}
}

View file

@ -1,58 +0,0 @@
//! `lychee` is a library for checking links.
//!
//! "Hello world" example:
//! ```
//! use std::error::Error;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn Error>> {
//!     let response = lychee::check("https://github.com/lycheeverse/lychee").await?;
//!     println!("{}", response);
//!     Ok(())
//! }
//! ```
//!
//! For more specific use-cases you can build a lychee client yourself,
//! using the `ClientBuilder` which can be used to
//! configure and run your own link checker and grants full flexibility:
//!
//! ```
//! use lychee::{ClientBuilder, Status};
//! use std::error::Error;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn Error>> {
//!     let client = ClientBuilder::default().build()?;
//!     let response = client.check("https://github.com/lycheeverse/lychee").await?;
//!     assert!(matches!(response.status, Status::Ok(_)));
//!     Ok(())
//! }
//! ```
// BUG FIX: the crate documentation was written as an *outer* `/** … */`
// comment after `#[deny(missing_docs)]`, so both attached to the next item
// (the `cfg(doctest)`-gated `extern crate`) instead of the crate itself —
// i.e. the crate had no docs in a regular build. Converted to inner `//!`
// doc comments above.
//
// NOTE(review): `#[deny(missing_docs)]` was almost certainly meant to be
// the crate-level `#![deny(missing_docs)]`. Enabling that requires first
// documenting every public item in the crate, so the (effectively inert)
// item-scoped form is kept here; promote it once all pub items have docs.
#[deny(missing_docs)]
#[cfg(doctest)]
#[macro_use]
extern crate doc_comment;

#[cfg(doctest)]
doctest!("../README.md");

mod client;
mod client_pool;
mod filter;
mod quirks;
mod types;
mod uri;

pub mod collector;
pub mod extract;
pub mod test_utils;

pub use client::check;
pub use client::ClientBuilder;
pub use client_pool::ClientPool;
pub use collector::Input;
pub use types::*;
pub use uri::Uri;

View file

@ -1,41 +0,0 @@
use http::StatusCode;
use reqwest::Url;
use wiremock::matchers::path;
use wiremock::{Mock, MockServer, ResponseTemplate};
use crate::Uri;
/// Start a mock HTTP server which answers requests to `/` with the given
/// status code and an empty body.
///
/// Thin wrapper around [`get_mock_server_with_content`] with no content.
#[allow(unused)]
pub async fn get_mock_server<S>(response_code: S) -> MockServer
where
    S: Into<StatusCode>,
{
    get_mock_server_with_content(response_code, None).await
}
/// Start a mock HTTP server which answers requests to `/` with the given
/// status code and, optionally, the given body content.
pub async fn get_mock_server_with_content<S>(response_code: S, content: Option<&str>) -> MockServer
where
    S: Into<StatusCode>,
{
    let mock_server = MockServer::start().await;

    // Build the canned response; attach a body only when one was provided.
    let mut template = ResponseTemplate::new(response_code.into());
    if let Some(body) = content {
        template = template.set_body_string(body);
    }

    Mock::given(path("/"))
        .respond_with(template)
        .mount(&mock_server)
        .await;

    mock_server
}
/// Helper method to convert a string into a URI
/// Note: This panics on error, so it should only be used for testing
pub fn website(url: &str) -> Uri {
    let parsed = Url::parse(url).expect("Expected valid Website URI");
    Uri::Website(parsed)
}

View file

@ -1,216 +0,0 @@
use crate::{collector::Input, uri::Uri};
use anyhow::anyhow;
use serde::{Serialize, Serializer};
use std::{collections::HashSet, convert::TryFrom, fmt::Display};
/// A single link-check request: a URI together with the input it came from.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct Request {
    /// The URI to be checked
    pub uri: Uri,
    /// The input source the URI was found in (e.g. `Input::Stdin`)
    pub source: Input,
}
impl Request {
pub fn new(uri: Uri, source: Input) -> Self {
Request { uri, source }
}
}
impl Display for Request {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.uri, self.source)
}
}
impl TryFrom<String> for Request {
    type Error = anyhow::Error;

    /// Parse the string as a URI; the string itself becomes the source.
    fn try_from(s: String) -> Result<Self, Self::Error> {
        Uri::try_from(s.as_str()).map(|uri| Request::new(uri, Input::String(s)))
    }
}
impl TryFrom<&str> for Request {
    type Error = anyhow::Error;

    /// Parse the string as a URI; an owned copy becomes the source.
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        Uri::try_from(s).map(|uri| Request::new(uri, Input::String(s.to_owned())))
    }
}
/// Specifies how requests to websites will be made
pub(crate) enum RequestMethod {
    /// HTTP GET
    Get,
    /// HTTP HEAD
    Head,
}
impl TryFrom<String> for RequestMethod {
    type Error = anyhow::Error;

    /// Case-insensitively parse `"get"` or `"head"`; anything else errors.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        let normalized = value.to_lowercase();
        if normalized == "get" {
            Ok(RequestMethod::Get)
        } else if normalized == "head" {
            Ok(RequestMethod::Head)
        } else {
            Err(anyhow!("Only `get` and `head` allowed, got {}", value))
        }
    }
}
/// The result of checking a single URI
#[derive(Debug, PartialEq, Eq, Hash, Serialize)]
pub struct Response {
    /// The checked URI (flattened into the serialized output)
    #[serde(flatten)]
    pub uri: Uri,
    /// Outcome of the check
    pub status: Status,
    /// The input the URI came from; omitted from serialization
    #[serde(skip)]
    pub source: Input,
}
impl Response {
pub fn new(uri: Uri, status: Status, source: Input) -> Self {
Response {
uri,
status,
source,
}
}
}
impl Display for Response {
    /// Render as `<icon> <uri><metadata>`, where the metadata is a status
    /// code in brackets or an error in parentheses, when available.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let metadata = match &self.status {
            Status::Ok(code) | Status::Redirected(code) => {
                format!(" [{}]", code)
            }
            // Match the inner `Some` directly instead of the original
            // `if code.is_some()` guard followed by `code.unwrap()`.
            Status::Timeout(Some(code)) => format!(" [{}]", code),
            Status::Error(e, code) => {
                // Prefer the HTTP status code over the error message
                // when both are present.
                if let Some(code) = code {
                    format!(" ({})", code)
                } else {
                    format!(" ({})", e)
                }
            }
            // Timeout(None) and Excluded carry no metadata.
            _ => "".to_string(),
        };
        write!(f, "{} {}{}", self.status.icon(), self.uri, metadata)
    }
}
/// Response status of the request
#[derive(Debug, Hash, PartialEq, Eq)]
pub enum Status {
    /// Request was successful
    Ok(http::StatusCode),
    /// Request failed with HTTP error code
    /// (an error message, plus the status code if one was received)
    Error(String, Option<http::StatusCode>),
    /// Request timed out (with the status code, if any was received)
    Timeout(Option<http::StatusCode>),
    /// Got redirected to different resource
    Redirected(http::StatusCode),
    /// Resource was excluded from checking
    Excluded,
}
impl Display for Status {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let out = match self {
Status::Ok(c) => format!("OK ({})", c),
Status::Redirected(c) => format!("Redirect ({})", c),
Status::Excluded => "Excluded".to_string(),
Status::Error(err, code) => {
if let Some(code) = code {
format!("Failed: {} ({})", err, code)
} else {
format!("Failed: {}", err)
}
}
Status::Timeout(Some(c)) => format!("Timeout ({})", c),
Status::Timeout(None) => "Timeout".to_string(),
};
write!(f, "{}", out)
}
}
impl Serialize for Status {
    /// Serialize the status as its `Display` string (e.g. `"OK (200)"`).
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.collect_str(self)
    }
}
impl Status {
    /// Classify an HTTP status code.
    ///
    /// Codes in the `accepted` set count as `Ok`, in addition to all 2xx
    /// codes. 3xx codes become `Redirected`; everything else is an `Error`
    /// carrying the code.
    pub fn new(statuscode: http::StatusCode, accepted: Option<HashSet<http::StatusCode>>) -> Self {
        // Collapse the original's two identical `Ok` branches
        // (`if let Some(true) = accepted.map(…)` followed by
        // `is_success()`) into a single condition.
        let explicitly_accepted = accepted.map_or(false, |a| a.contains(&statuscode));
        if explicitly_accepted || statuscode.is_success() {
            Status::Ok(statuscode)
        } else if statuscode.is_redirection() {
            Status::Redirected(statuscode)
        } else {
            // No error message is available here; only the code is kept.
            Status::Error("".into(), Some(statuscode))
        }
    }

    /// Whether the request succeeded.
    pub fn is_success(&self) -> bool {
        matches!(self, Status::Ok(_))
    }

    /// Whether the request failed.
    pub fn is_failure(&self) -> bool {
        matches!(self, Status::Error(_, _))
    }

    /// Whether the resource was excluded from checking.
    pub fn is_excluded(&self) -> bool {
        matches!(self, Status::Excluded)
    }

    /// Short icon representing this status in terminal output.
    pub fn icon(&self) -> &str {
        match self {
            Status::Ok(_) => "",
            Status::Redirected(_) => "⇄️",
            Status::Excluded => "?",
            Status::Error(_, _) => "",
            Status::Timeout(_) => "",
        }
    }
}
impl From<reqwest::Error> for Status {
    /// Convert a `reqwest` error into a status, distinguishing timeouts
    /// from other failures and keeping the HTTP status code if present.
    fn from(e: reqwest::Error) -> Self {
        if e.is_timeout() {
            Status::Timeout(e.status())
        } else {
            Status::Error(e.to_string(), e.status())
        }
    }
}
#[cfg(test)]
mod test {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use crate::test_utils::website;

    #[test]
    fn test_uri_host_ip_v4() {
        let uri = website("http://127.0.0.1");
        assert_eq!(
            uri.host_ip().expect("Expected a valid IPv4"),
            IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))
        );
    }

    #[test]
    fn test_uri_host_ip_v6() {
        let uri = website("https://[2020::0010]");
        assert_eq!(
            uri.host_ip().expect("Expected a valid IPv6"),
            IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10))
        );
    }

    #[test]
    fn test_uri_host_ip_no_ip() {
        // Domain-based hosts have no IP address.
        assert!(website("https://some.cryptic/url").host_ip().is_none());
    }
}

View file

@ -1,135 +0,0 @@
use anyhow::{bail, Result};
use fast_chemail::is_valid_email;
use serde::{Deserialize, Serialize};
use std::net::IpAddr;
use std::{convert::TryFrom, fmt::Display};
use url::Url;
/// Lychee's own representation of a URI, which encapsulates all support formats
#[derive(Clone, PartialOrd, Ord, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Uri {
    /// Website URL
    Website(Url),
    /// Mail address (stored without the `mailto:` scheme)
    Mail(String),
}
impl Uri {
    /// The underlying string representation of the URI.
    pub fn as_str(&self) -> &str {
        match self {
            Uri::Website(url) => url.as_str(),
            Uri::Mail(address) => address.as_str(),
        }
    }

    /// The scheme of a website URL (e.g. `http`); `None` for mail addresses.
    pub fn scheme(&self) -> Option<String> {
        match self {
            Uri::Website(url) => Some(url.scheme().to_string()),
            Uri::Mail(_address) => None,
        }
    }

    /// The host of a website URL as an IP address, if the host is one;
    /// `None` for domain hosts and mail addresses.
    pub fn host_ip(&self) -> Option<IpAddr> {
        match self {
            Self::Website(url) => match url.host()? {
                url::Host::Ipv4(v4_addr) => Some(v4_addr.into()),
                url::Host::Ipv6(v6_addr) => Some(v6_addr.into()),
                // Name the variant explicitly (the original used a
                // catch-all `_`) so a future `url::Host` variant would
                // be a compile error here instead of silently ignored.
                url::Host::Domain(_) => None,
            },
            Self::Mail(_) => None,
        }
    }
}
/// Whether `link` points at a local Markdown file (`.md`/`.markdown`),
/// ignoring any `#anchor` fragment.
fn is_internal_link(link: &str) -> bool {
    // The first element should contain the Markdown file link
    // @see https://www.markdownguide.org/basic-syntax/#links
    let anchor_links = link.split('#').next().unwrap_or("");
    // BUG FIX: the original used bitwise `|`, which always evaluates both
    // sides; `||` short-circuits and is the idiomatic boolean operator.
    anchor_links.ends_with(".md") || anchor_links.ends_with(".markdown")
}
impl TryFrom<&str> for Uri {
    type Error = anyhow::Error;

    /// Try to parse the string as a website URL first; fall back to a mail
    /// address (unless it looks like an internal Markdown link).
    fn try_from(s: &str) -> Result<Self> {
        // Check for internal Markdown links
        // (computed before the `mailto:` prefix is stripped)
        let is_link_internal = is_internal_link(s);
        // Remove the `mailto` scheme if it exists
        // to avoid parsing it as a website URL.
        let s = s.trim_start_matches("mailto:");
        if let Ok(uri) = Url::parse(s) {
            return Ok(Uri::Website(uri));
        }
        // FIX: `s` is already a `&str`; the original passed `&s` (a `&&str`),
        // which only worked through deref coercion (clippy: needless_borrow).
        if !is_link_internal && is_valid_email(s) {
            return Ok(Uri::Mail(s.to_string()));
        }
        bail!("Cannot convert to Uri")
    }
}
impl Display for Uri {
    /// Delegate to the underlying string representation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
#[cfg(test)]
mod test {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use pretty_assertions::assert_eq;

    use super::*;
    use crate::test_utils::website;

    #[test]
    fn test_uri_from_str() {
        assert!(matches!(Uri::try_from(""), Err(_)));
        assert_eq!(
            Uri::try_from("http://example.org").unwrap(),
            website("http://example.org")
        );
        assert_eq!(
            Uri::try_from("http://example.org/@test/testing").unwrap(),
            website("http://example.org/@test/testing")
        );
        assert_eq!(
            Uri::try_from("mail@example.org").unwrap(),
            Uri::Mail("mail@example.org".to_string())
        );
        assert_eq!(
            Uri::try_from("mailto:mail@example.org").unwrap(),
            Uri::Mail("mail@example.org".to_string())
        );
    }

    #[test]
    fn test_uri_host_ip_v4() {
        let uri = website("http://127.0.0.1");
        let ip = uri.host_ip().expect("Expected a valid IPv4");
        assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
    }

    #[test]
    fn test_uri_host_ip_v6() {
        let uri = website("https://[2020::0010]");
        let ip = uri.host_ip().expect("Expected a valid IPv6");
        assert_eq!(
            ip,
            IpAddr::V6(Ipv6Addr::new(0x2020, 0, 0, 0, 0, 0, 0, 0x10))
        );
    }

    #[test]
    fn test_uri_host_ip_no_ip() {
        let uri = website("https://some.cryptic/url");
        let ip = uri.host_ip();
        assert!(ip.is_none());
    }

    // BUG FIX: the original `test_mail` was a copy-paste of
    // `test_uri_host_ip_v4` and never exercised mail handling at all.
    // Now it asserts mail-specific behavior.
    #[test]
    fn test_mail() {
        let uri = Uri::try_from("mail@example.org").unwrap();
        assert_eq!(uri, Uri::Mail("mail@example.org".to_string()));
        assert!(uri.scheme().is_none());
        assert!(uri.host_ip().is_none());
    }
}