Add support for local files #262

This commit is contained in:
Matthias 2021-09-09 19:50:37 +02:00 committed by GitHub
commit 9b5fc399ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 966 additions and 374 deletions

View file

@ -43,17 +43,17 @@ jobs:
fail-fast: false
steps:
- name: Install musl tools
if: contains(matrix.target, 'musl')
if: ${{ contains(matrix.target, 'musl') }}
run: sudo apt-get install -y musl-tools
- name: Install arm tools
if: contains(matrix.target, 'arm')
if: ${{ contains(matrix.target, 'arm') }}
run: |
echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV
sudo apt-get install -y binutils-arm-linux-gnueabihf
- name: Install aarch64 tools
if: contains(matrix.target, 'aarch64')
if: ${{ contains(matrix.target, 'aarch64') }}
run: |
echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV
sudo apt-get install -y binutils-aarch64-linux-gnu

View file

@ -56,7 +56,7 @@ jobs:
- run: cargo-publish-all --dry-run
publish:
if: startsWith(github.ref, 'refs/tags/')
if: ${{ startsWith(github.ref, 'refs/tags/') }}
needs:
- test
- lint

9
Cargo.lock generated
View file

@ -1406,8 +1406,11 @@ dependencies = [
"http",
"hubcaps",
"linkify",
"log",
"markup5ever_rcdom",
"openssl-sys",
"path-clean",
"percent-encoding",
"pretty_assertions",
"pulldown-cmark",
"regex",
@ -1732,6 +1735,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "path-clean"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd"
[[package]]
name = "pem"
version = "0.8.3"

View file

@ -18,10 +18,14 @@ docker-run: ## Run Docker image
docker-push: ## Push image to Docker Hub
docker push $(IMAGE_NAME)
.PHONY: build-local
.PHONY: build
build: ## Build Rust code locally
cargo build
.PHONY: install
install: ## Install project locally
cargo install --path lychee-bin
.PHONY: run
run: ## Run Rust code locally
cargo run

View file

@ -161,11 +161,15 @@ lychee ~/projects/*/README.md
# check links in local files (lychee supports advanced globbing and ~ expansion):
lychee "~/projects/big_project/**/README.*"
# ignore case when globbing and check result for each link:
lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*"
# check links from epub file (requires atool: https://www.nongnu.org/atool)
acat -F zip {file.epub} "*.xhtml" "*.html" | lychee -
# check links in directory; block network requests
lychee --offline path/to/directory
```
### GitHub token
@ -202,6 +206,7 @@ FLAGS:
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)
--offline Only check local files and block network requests
--require-https When HTTPS is available, treat HTTP links as errors
--skip-missing Skip missing input files (default is to error if they don't exist)
-V, --version Prints version information
@ -209,7 +214,8 @@ FLAGS:
OPTIONS:
-a, --accept <accept> Comma-separated list of accepted status codes for valid links
-b, --base-url <base-url> Base URL to check relative URLs
-b, --base <base> Base URL or website root directory to check relative URLs e.g.
https://example.org or `/path/to/public`
--basic-auth <basic-auth> Basic authentication support. E.g. `username:password`
-c, --config <config-file> Configuration file to use [default: ./lychee.toml]
--exclude <exclude>... Exclude URLs from checking (supports regex)
@ -310,7 +316,8 @@ Try one of these links to get started:
- [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
- [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22)
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors.
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started.
Begin my making sure the following commands succeed without errors.
```bash
cargo test # runs tests

View file

@ -14,12 +14,12 @@ async fn main() -> Result<()> {
];
let links = Collector::new(
None, // base_url
None, // base
false, // don't skip missing inputs
10, // max concurrency
)
.collect_links(
inputs, // base_url
inputs, // base url or directory
)
.await?;

View file

@ -1,5 +1,5 @@
This link should be ignored as it is not a fully qualified URL.
![Logo](awesome.png)
Check file link
![Logo](../assets/banner.svg)
![Anchors should be ignored](#awesome)

View file

@ -1,3 +1,3 @@
slack://channel?id=123
file://foo/bar
file:///test_folder/test_file
https://example.org

View file

View file

@ -0,0 +1,21 @@
<html>
<head>
<title>About</title>
</head>
<body>
<h1>About</h1>
<p>
<ul>
<li>
<a href="https://example.org">example</a>
</li>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post1">Post 1</a>
</li>
</ul>
</p>
</body>
</html>

View file

View file

@ -0,0 +1,21 @@
<html>
<head>
<title>Post 2</title>
</head>
<body>
<h1>Post 2 Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post1">Post 1</a>
</li>
<li>
<a href="../about">Relative</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -0,0 +1,18 @@
<html>
<head>
<title>Post 1</title>
</head>
<body>
<h1>Post 1 Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post2">Post 2</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -0,0 +1,27 @@
<html>
<head>
<title>Index</title>
</head>
<body>
<h1>Index Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/about">About</a>
</li>
<li>
<a href="/about#fragment">About</a>
</li>
<li>
<a href="/another page">About</a>
</li>
<li>
<a href="/another%20page">About</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result};
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
use http::StatusCode;
use indicatif::{ProgressBar, ProgressStyle};
use lychee_lib::{
collector::{Collector, Input},
ClientBuilder, ClientPool, Response,
};
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
@ -178,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
// Offline mode overrides the scheme
let schemes = if cfg.offline {
vec!["file".to_string()]
} else {
cfg.scheme.clone()
};
let client = ClientBuilder::builder()
.includes(include)
.excludes(exclude)
@ -193,14 +197,14 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.method(method)
.timeout(timeout)
.github_token(cfg.github_token.clone())
.schemes(HashSet::from_iter(cfg.scheme.clone()))
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.build()
.client()
.map_err(|e| anyhow!(e))?;
let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency)
let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency)
.collect_links(&inputs)
.await
.map_err(|e| anyhow!(e))?;

View file

@ -1,9 +1,8 @@
use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
use lychee_lib::collector::Input;
use reqwest::Url;
use lychee_lib::{Base, Input};
use serde::Deserialize;
use structopt::{clap::crate_version, StructOpt};
@ -76,6 +75,10 @@ macro_rules! fold_in {
};
}
fn parse_base(src: &str) -> Result<Base, lychee_lib::ErrorKind> {
Base::try_from(src)
}
#[derive(Debug, StructOpt)]
#[structopt(
name = "lychee",
@ -161,6 +164,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec<String>,
/// Only check local files and block network requests.
#[structopt(long)]
#[serde(default)]
pub(crate) offline: bool,
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]
@ -223,10 +231,11 @@ pub(crate) struct Config {
#[serde(default = "method")]
pub(crate) method: String,
/// Base URL to check relative URLs
#[structopt(short, long, parse(try_from_str))]
/// Base URL or website root directory to check relative URLs
/// e.g. https://example.org or `/path/to/public`
#[structopt(short, long, parse(try_from_str = parse_base))]
#[serde(default)]
pub(crate) base_url: Option<Url>,
pub(crate) base: Option<Base>,
/// Basic authentication support. E.g. `username:password`
#[structopt(long)]
@ -311,7 +320,7 @@ impl Config {
accept: None;
timeout: TIMEOUT;
method: METHOD;
base_url: None;
base: None;
basic_auth: None;
github_token: None;
skip_missing: false;

View file

@ -133,15 +133,38 @@ mod cli {
/// Test unsupported URI schemes
#[test]
fn test_unsupported_uri_schemes() -> Result<()> {
test_json_output!(
"TEST_SCHEMES.txt",
MockResponseStats {
total: 1,
successful: 1,
..MockResponseStats::default()
}
)
fn test_unsupported_uri_schemes() {
let mut cmd = main_command();
let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt");
// Exclude file link because it doesn't exist on the filesystem.
// (File URIs are absolute paths, which we don't have.)
// Nevertheless, the `file` scheme should be recognized.
cmd.arg(test_schemes_path)
.arg("--exclude")
.arg("file://")
.env_clear()
.assert()
.success()
.stdout(contains("Total............2"))
.stdout(contains("Successful.......1"))
.stdout(contains("Excluded.........1"));
}
#[test]
fn test_resolve_paths() {
let mut cmd = main_command();
let offline_dir = fixtures_path().join("offline");
cmd.arg("--offline")
.arg("--base")
.arg(&offline_dir)
.arg(&offline_dir.join("index.html"))
.env_clear()
.assert()
.success()
.stdout(contains("Total............3"))
.stdout(contains("Successful.......3"));
}
#[test]
@ -367,7 +390,7 @@ mod cli {
.assert()
.success();
let expected = r#"{"total":10,"successful":10,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let expected = r#"{"total":11,"successful":11,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;
@ -385,7 +408,7 @@ mod cli {
.arg(".*")
.assert()
.success()
.stdout(contains("Excluded........10"));
.stdout(contains("Excluded........11"));
Ok(())
}

View file

@ -0,0 +1,36 @@
#[cfg(test)]
mod cli {
use std::{fs::File, io::Write};
use assert_cmd::Command;
use lychee_lib::Result;
use predicates::str::contains;
fn main_command() -> Command {
// this gets the "main" binary name (e.g. `lychee`)
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
}
#[tokio::test]
async fn test_local_file() -> Result<()> {
let dir = tempfile::tempdir()?;
let index_path = dir.path().join("index.html");
let mut index = File::create(&index_path)?;
writeln!(index, r#"<a href="./foo.html">Foo</a>"#)?;
let foo_path = dir.path().join("foo.html");
File::create(&foo_path)?;
let mut cmd = main_command();
cmd.arg(index_path)
.arg("--no-progress")
.arg("--verbose")
.env_clear()
.assert()
.success()
.stdout(contains("Total............1"))
.stdout(contains("foo.html"));
Ok(())
}
}

View file

@ -40,6 +40,9 @@ shellexpand = "2.1.0"
tokio = { version = "1.6.0", features = ["full"] }
typed-builder = "0.9.1"
url = { version = "2.2.2", features = ["serde"] }
log = "0.4.14"
path-clean = "0.1.0"
percent-encoding = "2.1.0"
[dev-dependencies]
doc-comment = "0.3.3"

View file

@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
uri::Uri,
ErrorKind, Request, Response, Result, Status,
ErrorKind, Request, Response, Result, Status, Uri,
};
const DEFAULT_MAX_REDIRECTS: usize = 5;
@ -178,6 +177,8 @@ impl Client {
let Request { uri, source } = Request::try_from(request)?;
let status = if self.filter.is_excluded(&uri) {
Status::Excluded
} else if uri.is_file() {
self.check_file(&uri).await
} else if uri.is_mail() {
self.check_mail(&uri).await
} else {
@ -255,6 +256,15 @@ impl Client {
}
}
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
}
}
ErrorKind::InvalidFilePath(uri.clone()).into()
}
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];
@ -284,11 +294,13 @@ where
mod test {
use std::{
convert::TryInto,
fs::File,
time::{Duration, Instant},
};
use http::{header::HeaderMap, StatusCode};
use reqwest::header;
use tempfile::tempdir;
use super::ClientBuilder;
use crate::{mock_server, test_utils::get_mock_client_response, Uri};
@ -373,6 +385,17 @@ mod test {
assert!(res.status().is_success());
}
#[tokio::test]
async fn test_file() {
let dir = tempdir().unwrap();
let file = dir.path().join("temp");
File::create(file).unwrap();
let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap());
let res = get_mock_client_response(uri).await;
assert!(res.status().is_success());
}
#[tokio::test]
async fn test_custom_headers() {
// See https://github.com/rust-lang/crates.io/issues/788

View file

@ -1,223 +1,10 @@
use std::{
collections::HashSet,
fmt::Display,
path::{Path, PathBuf},
};
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use tokio::{
fs::read_to_string,
io::{stdin, AsyncReadExt},
};
use crate::{
extract::{extract_links, FileType},
uri::Uri,
Request, Result,
};
const STDIN: &str = "-";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// An exhaustive list of input sources, which lychee accepts
pub enum Input {
/// URL (of HTTP/HTTPS scheme).
RemoteUrl(Box<Url>),
/// Unix shell-style glob pattern.
FsGlob {
/// The glob pattern matching all input files
pattern: String,
/// Don't be case sensitive when matching files against a glob
ignore_case: bool,
},
/// File path.
FsPath(PathBuf),
/// Standard Input.
Stdin,
/// Raw string input.
String(String),
}
impl Serialize for Input {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl Display for Input {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
Input::RemoteUrl(url) => url.as_str(),
Input::FsGlob { pattern, .. } => pattern,
Input::FsPath(path) => path.to_str().unwrap_or_default(),
Input::Stdin => "stdin",
Input::String(_) => "raw input string",
})
}
}
#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
/// Input source
pub input: Input,
/// File type of given input
pub file_type: FileType,
/// Raw UTF-8 string content
pub content: String,
}
impl InputContent {
#[must_use]
/// Create an instance of `InputContent` from an input string
pub fn from_string(s: &str, file_type: FileType) -> Self {
// TODO: consider using Cow (to avoid one .clone() for String types)
Self {
input: Input::String(s.to_owned()),
file_type,
content: s.to_owned(),
}
}
}
impl Input {
#[must_use]
/// Construct a new `Input` source. In case the input is a `glob` pattern,
/// `glob_ignore_case` decides whether matching files against the `glob` is
/// case-insensitive or not
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
if value == STDIN {
Self::Stdin
} else if let Ok(url) = Url::parse(value) {
Self::RemoteUrl(Box::new(url))
} else {
// this seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(value) != value;
if is_glob {
Self::FsGlob {
pattern: value.to_owned(),
ignore_case: glob_ignore_case,
}
} else {
Self::FsPath(value.into())
}
}
}
#[allow(clippy::missing_panics_doc)]
/// Retrieve the contents from the input
///
/// # Errors
///
/// Returns an error if the contents can not be retrieved
/// because of an underlying I/O error (e.g. an error while making a
/// network request or retrieving the contents from the file system)
pub async fn get_contents(
&self,
file_type_hint: Option<FileType>,
skip_missing: bool,
) -> Result<Vec<InputContent>> {
match *self {
// TODO: should skip_missing also affect URLs?
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
Input::FsGlob {
ref pattern,
ignore_case,
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
Input::FsPath(ref path) => {
let content = Self::path_content(path).await;
match content {
Ok(input_content) => Ok(vec![input_content]),
Err(_) if skip_missing => Ok(vec![]),
Err(e) => Err(e),
}
}
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
}
}
async fn url_contents(url: &Url) -> Result<InputContent> {
// Assume HTML for default paths
let file_type = if url.path().is_empty() || url.path() == "/" {
FileType::Html
} else {
FileType::from(url.as_str())
};
let res = reqwest::get(url.clone()).await?;
let input_content = InputContent {
input: Input::RemoteUrl(Box::new(url.clone())),
file_type,
content: res.text().await?,
};
Ok(input_content)
}
async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
let mut contents = vec![];
let glob_expanded = tilde(&path_glob);
let mut match_opts = glob::MatchOptions::new();
match_opts.case_sensitive = !ignore_case;
for entry in glob_with(&glob_expanded, match_opts)? {
match entry {
Ok(path) => {
let content = Self::path_content(&path).await?;
contents.push(content);
}
Err(e) => println!("{:?}", e),
}
}
Ok(contents)
}
async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
let content = read_to_string(&path)
.await
.map_err(|e| (path.clone().into(), e))?;
let input_content = InputContent {
file_type: FileType::from(path.as_ref()),
content,
input: Input::FsPath(path.into()),
};
Ok(input_content)
}
async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
let mut content = String::new();
let mut stdin = stdin();
stdin.read_to_string(&mut content).await?;
let input_content = InputContent {
input: Input::Stdin,
file_type: file_type_hint.unwrap_or_default(),
content,
};
Ok(input_content)
}
fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
InputContent::from_string(s, file_type_hint.unwrap_or_default())
}
}
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
pub struct Collector {
base_url: Option<Url>,
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet<Uri>,
@ -226,9 +13,9 @@ pub struct Collector {
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub fn new(base_url: Option<Url>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
Collector {
base_url,
base,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
@ -236,7 +23,8 @@ impl Collector {
}
/// Fetch all unique links from a slice of inputs
/// All relative URLs get prefixed with `base_url` if given.
/// All relative URLs get prefixed with `base` if given.
/// (This can be a directory or a base URL)
///
/// # Errors
///
@ -263,9 +51,9 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base_url = self.base_url.clone();
let base = self.base.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
extract_links_handles.push(handle);
}
}
@ -278,7 +66,7 @@ impl Collector {
for handle in extract_links_handles {
let new_links = handle.await?;
links.extend(new_links);
links.extend(new_links?);
}
// Filter out already cached links (duplicates)
@ -304,9 +92,9 @@ mod test {
use super::*;
use crate::{
extract::FileType,
mock_server,
test_utils::{mail, website},
types::{FileType, Input},
Result, Uri,
};

View file

@ -1,66 +1,62 @@
use std::{collections::HashSet, convert::TryFrom, path::Path};
use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use percent_encoding::percent_decode_str;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use url::Url;
use reqwest::Url;
use crate::{collector::InputContent, Request, Uri};
use crate::{
helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// `FileType` defines which file types lychee can handle
pub enum FileType {
/// File in HTML format
Html,
/// File in Markdown format
Markdown,
/// Generic text file without syntax-specific parsing
Plaintext,
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
impl Default for FileType {
fn default() -> Self {
Self::Plaintext
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
let url = create_uri_from_path(root, base, &link)?;
Request::new(Uri { url }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
}
Ok(requests)
}
impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn from(p: P) -> FileType {
let path = p.as_ref();
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
Some("md" | "markdown") => FileType::Markdown,
Some("htm" | "html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
}
// Use LinkFinder here to offload the actual link searching in plaintext.
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
finder.links(input).collect()
}
/// Extract unparsed URL strings from a markdown string.
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
vec![url.to_string()]
}
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()],
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
@ -68,15 +64,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
.collect()
}
/// Extract unparsed URL strings from a HTML string.
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// we pass mutable urls reference to avoid extra allocations in each
// recursive descent
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@ -101,7 +97,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@ -113,56 +109,34 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions because the document is a tree
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
/// Determine if element's attribute contains a link / URL.
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
// See a comprehensive list of attributes that might contain URLs/URIs
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
matches!(
(attr_name, elem_name),
("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
)
}
/// Extract unparsed URL strings from a plaintext.
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
find_links(input)
url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: &Option<Url>,
) -> HashSet<Request> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
} else if !Path::new(&link).exists() {
if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
requests.insert(Request::new(
Uri { url: new_url },
input_content.input.clone(),
));
}
};
}
requests
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
let path = path::resolve(src, &PathBuf::from(decoded), base)?;
Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
}
#[cfg(test)]
@ -176,17 +150,24 @@ mod test {
};
use pretty_assertions::assert_eq;
use url::Url;
use super::{
extract_links, extract_links_from_html, extract_links_from_markdown,
extract_links_from_plaintext, find_links, FileType,
};
use super::*;
use crate::{
collector::InputContent,
helpers::url::find_links,
test_utils::{mail, website},
Uri,
};
use crate::{
types::{FileType, InputContent},
Base,
};
#[test]
fn test_create_uri_from_path() {
let result =
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
assert_eq!(result.as_str(), "file:///test+encoding");
}
fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
@ -207,13 +188,13 @@ mod test {
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
extract_links(
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect()
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
extract_links(&InputContent::from_string(input, file_type), &base)
// unwrap is fine here as this helper function is only used in tests
.unwrap()
.into_iter()
.map(|r| r.uri)
.collect()
}
#[test]

View file

@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
pub use excludes::Excludes;
pub use includes::Includes;
use crate::uri::Uri;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];

View file

@ -0,0 +1,2 @@
pub(crate) mod path;
pub(crate) mod url;

View file

@ -0,0 +1,141 @@
use crate::{Base, ErrorKind, Result};
use path_clean::PathClean;
use std::env;
use std::path::{Path, PathBuf};
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
base.as_ref().and_then(Base::dir)
}
// https://stackoverflow.com/a/54817755/270334
pub(crate) fn absolute_path(path: impl AsRef<Path>) -> Result<PathBuf> {
let path = path.as_ref();
let absolute_path = if path.is_absolute() {
path.to_path_buf()
} else {
env::current_dir()?.join(path)
}
.clean();
Ok(absolute_path)
}
// Get the parent directory of a given `Path`.
fn dirname(src: &Path) -> PathBuf {
if src.is_file() {
src.to_path_buf()
.parent()
.map_or(PathBuf::new(), Path::to_path_buf)
} else {
src.to_path_buf()
}
}
// Resolve `dst` that was linked to from within `src`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
if let Some(parent) = src.parent() {
let rel_path = parent.join(dst.to_path_buf());
return absolute_path(&rel_path);
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the `base_url` to
// define the document root.
let base = get_base_dir(base).ok_or_else(|| {
ErrorKind::InvalidBase(
"<empty>".to_string(),
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
)
})?;
let abs_path = join(dirname(&base), dst);
return absolute_path(&abs_path);
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
// A cumbersome way to concatenate paths without checking their
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
fn join(base: PathBuf, dst: &Path) -> PathBuf {
let mut abs = base.into_os_string();
let target_str = dst.as_os_str();
abs.push(target_str);
PathBuf::from(abs)
}
// Tests for resolving links found in local documents to absolute paths.
#[cfg(test)]
mod test_path {
    use super::*;
    use crate::Result;
    // Relative link `./foo.html` inside `index.html` (located in the
    // current working directory) resolves relative to the CWD.
    #[test]
    fn test_resolve_relative() -> Result<()> {
        let dummy = PathBuf::from("index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }
    // Same as above, but the source document itself is given with an
    // explicit `./` prefix: `./index.html` linking to `./foo.html`.
    #[test]
    fn test_resolve_relative_index() -> Result<()> {
        let dummy = PathBuf::from("./index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }
    // Relative link `./foo.html` inside an absolute source document
    // `/path/to/index.html` resolves next to the source.
    #[test]
    fn test_resolve_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&abs_index, &abs_path, &None)?,
            PathBuf::from("/path/to/foo.html")
        );
        Ok(())
    }
    // Absolute link `/foo.html` with a valid base dir resolves under the
    // base directory; the (empty) source document is irrelevant here.
    #[test]
    fn test_resolve_absolute_from_base_dir() -> Result<()> {
        let dummy = PathBuf::new();
        let abs_path = PathBuf::from("/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&dummy, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/foo.html")
        );
        Ok(())
    }
    // Absolute link `/other/path/to/foo.html` inside an absolute source
    // document still resolves under the base directory, not the source.
    #[test]
    fn test_resolve_absolute_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("/other/path/to/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&abs_index, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html")
        );
        Ok(())
    }
}

View file

@ -0,0 +1,93 @@
use linkify::LinkFinder;
/// Remove all GET parameters from a URL.
/// The link is not a URL but a String as it may not have a base domain.
///
/// The fragment is stripped first, then the query string, so inputs like
/// `a?b#c` and `a#b?c` both reduce to `a`.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
    let without_fragment = url.split_once('#').map_or(url, |(before, _)| before);
    without_fragment
        .split_once('?')
        .map_or(without_fragment, |(before, _)| before)
}
/// Determine if an element's attribute contains a link / URL.
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    let always_link = matches!(attr_name, "href" | "src" | "srcset" | "cite");
    always_link
        || (attr_name == "data" && elem_name == "object")
        || (attr_name == "onhashchange" && elem_name == "body")
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
pub(crate) fn is_anchor(url: &str) -> bool {
    // `#` is ASCII, so inspecting the first byte is equivalent to
    // `starts_with('#')` (no UTF-8 continuation byte can equal 0x23).
    matches!(url.as_bytes().first(), Some(b'#'))
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
    LinkFinder::new().links(input).collect()
}
// Tests for the plaintext/URL helper functions above.
#[cfg(test)]
mod test_fs_tree {
    use super::*;
    // Only a leading `#` marks an in-page anchor.
    #[test]
    fn test_is_anchor() {
        assert!(is_anchor("#anchor"));
        assert!(!is_anchor("notan#anchor"));
    }
    // Query strings and fragments are stripped in all combinations and
    // orderings; bare paths are returned unchanged.
    #[test]
    fn test_remove_get_params_and_fragment() {
        assert_eq!(remove_get_params_and_fragment("/"), "/");
        assert_eq!(
            remove_get_params_and_fragment("index.html?foo=bar"),
            "index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png#anchor?anchor!?"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
            "test.png"
        );
    }
}

View file

@ -41,18 +41,18 @@
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
#![deny(missing_docs)]
#![allow(clippy::module_name_repetitions)]
#[cfg(doctest)]
doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
mod quirks;
mod types;
mod uri;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod helpers;
mod quirks;
mod types;
/// Functionality to extract URIs from inputs
pub mod extract;
@ -75,8 +75,7 @@ use ring as _; // required for apple silicon
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
collector::{Collector, Input},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
uri::Uri,
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
};

View file

@ -0,0 +1,83 @@
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::{convert::TryFrom, path::PathBuf};
use crate::ErrorKind;
/// When encountering links without a full domain in a document,
/// the base determines where this resource can be found.
/// Both, local and remote targets are supported.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
// A `PathBuf` and a `Url` differ in size; the lint is acknowledged on purpose.
#[allow(variant_size_differences)]
pub enum Base {
    /// Local file path pointing to the root directory used to resolve
    /// absolute local links
    Local(PathBuf),
    /// Remote URL pointing to a website homepage, used to resolve
    /// relative remote links
    Remote(Url),
}
impl Base {
    /// Join the given link with the base URL.
    ///
    /// Returns `None` for a local base or if the join fails.
    #[must_use]
    pub fn join(&self, link: &str) -> Option<Url> {
        if let Self::Remote(url) = self {
            url.join(link).ok()
        } else {
            None
        }
    }

    /// Return the root directory if the base is local, `None` otherwise.
    #[must_use]
    pub fn dir(&self) -> Option<PathBuf> {
        if let Self::Local(path) = self {
            Some(path.clone())
        } else {
            None
        }
    }
}
impl TryFrom<&str> for Base {
    type Error = ErrorKind;

    /// Parse the value as a remote URL first; anything that is not a valid
    /// URL is treated as a local directory path.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        match Url::parse(value) {
            // URLs like `data:` or `mailto:` cannot serve as a base.
            Ok(url) if url.cannot_be_a_base() => Err(ErrorKind::InvalidBase(
                value.to_string(),
                "The given URL cannot be a base".to_string(),
            )),
            Ok(url) => Ok(Self::Remote(url)),
            Err(_) => Ok(Self::Local(PathBuf::from(value))),
        }
    }
}
// Tests for converting strings into a `Base`.
#[cfg(test)]
mod test_base {
    use crate::Result;
    use super::*;
    // A parseable http(s) URL becomes a remote base.
    #[test]
    fn test_valid_remote() -> Result<()> {
        let base = Base::try_from("https://endler.dev")?;
        assert_eq!(
            base,
            Base::Remote(Url::parse("https://endler.dev").unwrap())
        );
        Ok(())
    }
    // `data:` URLs parse as URLs but cannot be a base, so conversion fails.
    #[test]
    fn test_invalid_url() {
        assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
    }
    // An existing directory path becomes a local base.
    #[test]
    fn test_valid_local() -> Result<()> {
        let dir = tempfile::tempdir()?;
        Base::try_from(dir.as_ref().to_str().unwrap())?;
        Ok(())
    }
}

View file

@ -10,21 +10,32 @@ use crate::Uri;
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
// TODO: maybe need to be splitted; currently first slot is Some only for reading files
// TODO: maybe needs to be split; currently first element is `Some` only for
// reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
/// Errors which can occur when attempting to interpret a sequence of u8 as a string
Utf8Error(std::str::Utf8Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
HubcapsError(hubcaps::Error),
/// The given string can not be parsed into a valid URL or e-mail address
/// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
/// The given URI cannot be converted to a file path
InvalidFilePath(Uri),
/// The given path cannot be converted to a URI
InvalidUrl(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
/// A possible error when converting a `HeaderValue` from a string or byte
/// slice.
InvalidHeader(InvalidHeaderValue),
/// The given string can not be parsed into a valid base URL or base directory
InvalidBase(String, String),
/// Cannot find local file
FileNotFound(PathBuf),
/// The given UNIX glob pattern is invalid
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
@ -63,8 +74,14 @@ impl Hash for ErrorKind {
Self::IoError(p, e) => (p, e.kind()).hash(state),
Self::ReqwestError(e) => e.to_string().hash(state),
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidUrl(p) => p.hash(state),
Self::Utf8Error(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
u.hash(state);
}
Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
@ -84,6 +101,7 @@ impl Display for ErrorKind {
Self::IoError(None, e) => e.fmt(f),
Self::ReqwestError(e) => e.fmt(f),
Self::HubcapsError(e) => e.fmt(f),
Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
write!(
f,
@ -94,6 +112,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@ -106,6 +126,8 @@ impl Display for ErrorKind {
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
uri
),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
Self::Utf8Error(e) => e.fmt(f),
}
}
}
@ -125,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind {
}
}
impl From<std::str::Utf8Error> for ErrorKind {
fn from(e: std::str::Utf8Error) -> Self {
Self::Utf8Error(e)
}
}
impl From<std::io::Error> for ErrorKind {
fn from(e: std::io::Error) -> Self {
Self::IoError(None, e)
@ -149,6 +177,12 @@ impl From<hubcaps::errors::Error> for ErrorKind {
}
}
impl From<url::ParseError> for ErrorKind {
fn from(e: url::ParseError) -> Self {
Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
}
}
impl From<(String, url::ParseError)> for ErrorKind {
fn from(value: (String, url::ParseError)) -> Self {
Self::UrlParseError(value.0, (value.1, None))

View file

@ -0,0 +1,37 @@
use std::path::Path;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// `FileType` defines which file types lychee can handle
pub enum FileType {
/// File in HTML format
Html,
/// File in Markdown format
Markdown,
/// Generic text file without syntax-specific parsing
Plaintext,
}
impl Default for FileType {
fn default() -> Self {
Self::Plaintext
}
}
impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn from(p: P) -> FileType {
let path = p.as_ref();
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
Some("md" | "markdown") => FileType::Markdown,
Some("htm" | "html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
}

View file

@ -0,0 +1,214 @@
use crate::types::FileType;
use crate::Result;
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use std::path::{Path, PathBuf};
use std::{fmt::Display, fs::read_to_string};
use tokio::io::{stdin, AsyncReadExt};
// Sentinel value used on the command line to request reading from stdin.
const STDIN: &str = "-";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// An exhaustive list of input sources, which lychee accepts
pub enum Input {
    /// URL (of HTTP/HTTPS scheme).
    RemoteUrl(Box<Url>),
    /// Unix shell-style glob pattern.
    FsGlob {
        /// The glob pattern matching all input files
        pattern: String,
        /// Don't be case sensitive when matching files against a glob
        ignore_case: bool,
    },
    /// File path.
    FsPath(PathBuf),
    /// Standard Input.
    Stdin,
    /// Raw string input.
    String(String),
}
impl Serialize for Input {
    // Serialize the input as its `Display` string (URL, glob pattern, path,
    // "stdin", or "raw input string") rather than as a structured enum.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(self)
    }
}
impl Display for Input {
    /// Render a short, human-readable label for the input source.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Input::RemoteUrl(url) => url.as_str(),
            Input::FsGlob { pattern, .. } => pattern,
            // A path that is not valid UTF-8 is rendered as an empty string.
            Input::FsPath(path) => path.to_str().unwrap_or_default(),
            Input::Stdin => "stdin",
            Input::String(_) => "raw input string",
        };
        f.write_str(label)
    }
}
#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
    /// Input source the content was read from
    pub input: Input,
    /// File type of given input, used to pick the right link extractor
    pub file_type: FileType,
    /// Raw UTF-8 string content
    pub content: String,
}
impl InputContent {
    #[must_use]
    /// Create an instance of `InputContent` from an input string
    pub fn from_string(s: &str, file_type: FileType) -> Self {
        // TODO: consider using Cow (to avoid one .clone() for String types)
        let content = s.to_owned();
        Self {
            input: Input::String(content.clone()),
            file_type,
            content,
        }
    }
}
impl Input {
    #[must_use]
    /// Construct a new `Input` source. In case the input is a `glob` pattern,
    /// `glob_ignore_case` decides whether matching files against the `glob` is
    /// case-insensitive or not
    pub fn new(value: &str, glob_ignore_case: bool) -> Self {
        if value == STDIN {
            Self::Stdin
        } else if let Ok(url) = Url::parse(value) {
            Self::RemoteUrl(Box::new(url))
        } else {
            // this seems to be the only way to determine if this is a glob pattern:
            // escaping is a no-op exactly when no glob metacharacters are present
            let is_glob = glob::Pattern::escape(value) != value;
            if is_glob {
                Self::FsGlob {
                    pattern: value.to_owned(),
                    ignore_case: glob_ignore_case,
                }
            } else {
                // Plain filesystem path; existence is only checked later,
                // when the contents are read.
                Self::FsPath(value.into())
            }
        }
    }
    #[allow(clippy::missing_panics_doc)]
    /// Retrieve the contents from the input
    ///
    /// # Errors
    ///
    /// Returns an error if the contents can not be retrieved
    /// because of an underlying I/O error (e.g. an error while making a
    /// network request or retrieving the contents from the file system)
    pub async fn get_contents(
        &self,
        file_type_hint: Option<FileType>,
        skip_missing: bool,
    ) -> Result<Vec<InputContent>> {
        match *self {
            // TODO: should skip_missing also affect URLs?
            Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
            Input::FsGlob {
                ref pattern,
                ignore_case,
            } => Ok(Self::glob_contents(pattern, ignore_case).await?),
            Input::FsPath(ref path) => {
                let content = Self::path_content(path);
                match content {
                    Ok(input_content) => Ok(vec![input_content]),
                    // With `skip_missing`, unreadable or missing files are
                    // silently skipped instead of aborting the whole run.
                    Err(_) if skip_missing => Ok(vec![]),
                    Err(e) => Err(e),
                }
            }
            Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
            Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
        }
    }
    // Fetch a remote URL and wrap the response body as `InputContent`.
    async fn url_contents(url: &Url) -> Result<InputContent> {
        // Assume HTML for default paths
        let file_type = if url.path().is_empty() || url.path() == "/" {
            FileType::Html
        } else {
            FileType::from(url.as_str())
        };
        let res = reqwest::get(url.clone()).await?;
        let input_content = InputContent {
            input: Input::RemoteUrl(Box::new(url.clone())),
            file_type,
            content: res.text().await?,
        };
        Ok(input_content)
    }
    // Expand the glob pattern (including `~`) and read every matching file.
    async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
        let mut contents = vec![];
        let glob_expanded = tilde(&path_glob);
        let mut match_opts = glob::MatchOptions::new();
        // `ignore_case` inverts glob's default case-sensitive matching.
        match_opts.case_sensitive = !ignore_case;
        for entry in glob_with(&glob_expanded, match_opts)? {
            match entry {
                Ok(path) => {
                    if path.is_dir() {
                        // Directories can still have a suffix which looks like
                        // a file extension like `foo.html`. This can lead to
                        // unexpected behavior with glob patterns like
                        // `**/*.html`. Therefore filter these out.
                        // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
                        continue;
                    }
                    let content = Self::path_content(&path)?;
                    contents.push(content);
                }
                // TODO(review): glob iteration errors are printed to stdout
                // instead of being propagated or logged — consider `log`/`Err`.
                Err(e) => println!("{:?}", e),
            }
        }
        Ok(contents)
    }
    /// Get the input content of a given path
    /// # Errors
    ///
    /// Will return `Err` if file contents can't be read
    pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
        // Attach the offending path to any I/O error for better diagnostics.
        let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
        let input_content = InputContent {
            file_type: FileType::from(path.as_ref()),
            content,
            input: Input::FsPath(path.into()),
        };
        Ok(input_content)
    }
    // Read all of stdin; the file type falls back to plaintext if no hint given.
    async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
        let mut content = String::new();
        let mut stdin = stdin();
        stdin.read_to_string(&mut content).await?;
        let input_content = InputContent {
            input: Input::Stdin,
            file_type: file_type_hint.unwrap_or_default(),
            content,
        };
        Ok(input_content)
    }
    // Wrap a raw string; the file type falls back to plaintext if no hint given.
    fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
        InputContent::from_string(s, file_type_hint.unwrap_or_default())
    }
}

View file

@ -1,14 +1,22 @@
#![allow(unreachable_pub)]
mod base;
mod error;
mod file;
mod input;
mod request;
mod response;
mod status;
mod uri;
pub use base::Base;
pub use error::ErrorKind;
pub use file::FileType;
pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use uri::Uri;
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;

View file

@ -82,9 +82,16 @@ impl Uri {
}
#[inline]
/// Check if the URI uses the `mailto` scheme.
///
/// Note: this only inspects the scheme; it does not validate the address.
pub(crate) fn is_mail(&self) -> bool {
    self.scheme() == "mailto"
}
#[inline]
/// Check if the URI uses the `file` scheme, i.e. points to a local file.
pub(crate) fn is_file(&self) -> bool {
    self.scheme() == "file"
}
}
impl AsRef<str> for Uri {