Add support for base_dir

This commit is contained in:
Matthias 2021-06-20 18:58:20 +02:00
parent d5bb7ee7d7
commit f9bf52ef10
6 changed files with 78 additions and 29 deletions

View file

@ -14,8 +14,8 @@ async fn main() -> Result<()> {
];
let links = Collector::new(
None, // base_url
false, // don't skip missing inputs
None, // base_url
None, false, // don't skip missing inputs
10, // max concurrency
)
.collect_links(

View file

@ -197,10 +197,15 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.client()
.map_err(|e| anyhow!(e))?;
let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency)
.collect_links(&inputs)
.await
.map_err(|e| anyhow!(e))?;
let links = Collector::new(
cfg.base_url.clone(),
cfg.base_dir.clone(),
cfg.skip_missing,
max_concurrency,
)
.collect_links(&inputs)
.await
.map_err(|e| anyhow!(e))?;
let pb = if cfg.no_progress {
None

View file

@ -218,7 +218,12 @@ pub(crate) struct Config {
pub(crate) method: String,
/// Base directory used to resolve absolute local links (paths with a leading slash)
#[structopt(short, long, parse(try_from_str))]
#[structopt(long, parse(try_from_str))]
#[serde(default)]
pub(crate) base_dir: Option<PathBuf>,
/// Base URL to check relative URLs
#[structopt(long, parse(try_from_str))]
#[serde(default)]
pub(crate) base_url: Option<Url>,

View file

@ -1,11 +1,12 @@
use crate::{extract::extract_links, uri::Uri, Input, Request, Result};
use reqwest::Url;
use std::collections::HashSet;
use std::{collections::HashSet, path::PathBuf};
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
pub struct Collector {
base_url: Option<Url>,
base_dir: Option<PathBuf>,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet<Uri>,
@ -14,9 +15,15 @@ pub struct Collector {
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub fn new(base_url: Option<Url>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
pub fn new(
base_url: Option<Url>,
base_dir: Option<PathBuf>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Self {
Collector {
base_url,
base_dir,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
@ -52,8 +59,10 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base_url = self.base_url.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
let base_dir = self.base_dir.clone();
let handle = tokio::task::spawn_blocking(move || {
extract_links(&input_content, &base_url, &base_dir)
});
extract_links_handles.push(handle);
}
}
@ -160,7 +169,7 @@ mod test {
},
];
let responses = Collector::new(None, false, 8)
let responses = Collector::new(None, None, false, 8)
.collect_links(&inputs)
.await?;
let mut links = responses.into_iter().map(|r| r.uri).collect::<Vec<Uri>>();

View file

@ -11,6 +11,7 @@ use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use url::Url;
use crate::{
fs_tree,
types::{FileType, InputContent},
Input, Request, Result, Uri,
};
@ -106,6 +107,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: &Option<Url>,
base_dir: &Option<PathBuf>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
@ -125,9 +127,9 @@ pub(crate) fn extract_links(
input_content.input.clone(),
));
} else if let Input::FsPath(root) = &input_content.input {
if let Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) {
if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base_dir) {
let input_content = Input::path_content(path)?;
requests.extend(extract_links(&input_content, base_url)?);
requests.extend(extract_links(&input_content, base_url, base_dir)?);
} else {
info!("Cannot find path to {} in filesystem", &link);
}
@ -183,6 +185,7 @@ mod test {
extract_links(
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
&None,
)
// unwrap is fine here as this helper function is only used in tests
.unwrap()

View file

@ -1,18 +1,30 @@
use crate::{ErrorKind, Result};
use std::path::{Path, PathBuf};
pub(crate) fn find(root: &Path, dst: &Path) -> Result<PathBuf> {
pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option<PathBuf>) -> Result<PathBuf> {
if dst.exists() {
return Ok(dst.to_path_buf());
}
if dst.is_dir() {
return Err(ErrorKind::FileNotFound(dst.into()));
}
// Find `dst` in the `root` path
if let Some(parent) = root.parent() {
let rel = parent.join(dst.to_path_buf());
if rel.exists() {
return Ok(rel);
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_dir to
// define the document root.
if let Some(base_dir) = base_dir {
let absolute = base_dir.join(dst.to_path_buf());
if absolute.exists() {
return Ok(absolute);
}
}
}
if dst.is_relative() {
// Find `dst` relative to the parent directory of `src`
if let Some(parent) = src.parent() {
let relative = parent.join(dst.to_path_buf());
if relative.exists() {
return Ok(relative);
}
}
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
@ -33,7 +45,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
assert_eq!(find(&dummy, &dst)?, dst);
assert_eq!(find(&dummy, &dst, &None)?, dst);
Ok(())
}
@ -45,7 +57,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("./foo.html");
File::create(&dst)?;
assert_eq!(find(&root, &dst)?, dst);
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@ -57,7 +69,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("./foo.html");
File::create(&dst)?;
assert_eq!(find(&root, &dst)?, dst);
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@ -66,7 +78,7 @@ mod test_fs_tree {
let root = PathBuf::from("index.html");
// This file does not exist
let dst = PathBuf::from("./foo.html");
assert!(find(&root, &dst).is_err());
assert!(find(&root, &dst, &None).is_err());
Ok(())
}
@ -81,7 +93,22 @@ mod test_fs_tree {
let dst = PathBuf::from("./foo.html");
let dst_absolute = dir.path().join("./foo.html");
File::create(&dst_absolute)?;
assert_eq!(find(&root, &dst)?, dst_absolute);
assert_eq!(find(&root, &dst, &None)?, dst_absolute);
Ok(())
}
// Inputs:
//   src:      dummy (empty path)
//   dst:      <tempdir>/foo.html (created on disk)
//   base_dir: <tempdir> (valid base dir)
//
// NOTE(review): `dst` is built from `dir.path()` and the file is created, so it is
// presumably already an absolute, existing path. In that case `find` returns it
// from its initial `dst.exists()` check and the `base_dir` lookup is never reached.
// Likewise `base_dir.join(dst)` is just `dst` when `dst` is absolute, because
// `Path::join` replaces the base when given an absolute argument. Confirm whether
// this test should instead use a root-relative `dst` that only exists below
// `base_dir`.
#[test]
fn test_find_absolute_from_base_dir() -> Result<()> {
// `src` is irrelevant for this scenario, so an empty path is used.
let dummy = PathBuf::new();
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
let base_dir = dir.path().to_path_buf();
// Expected result: `dst` resolved against `base_dir`.
let dst_absolute = base_dir.join(dst.to_path_buf());
assert_eq!(find(&dummy, &dst, &Some(base_dir))?, dst_absolute);
Ok(())
}
@ -94,7 +121,7 @@ mod test_fs_tree {
// We create the absolute path to foo.html,
// but we address it under its relative path
let dst = PathBuf::from("./foo.html");
assert!(find(&root, &dst).is_err());
assert!(find(&root, &dst, &None).is_err());
Ok(())
}
@ -106,7 +133,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
assert_eq!(find(&root, &dst)?, dst);
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@ -118,7 +145,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
assert_eq!(find(&root, &dst)?, dst);
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@ -129,7 +156,7 @@ mod test_fs_tree {
let root = PathBuf::from("/path/to/");
let dir = tempfile::tempdir()?;
File::create(&dir)?;
assert!(find(&root, &dir.into_path()).is_err());
assert!(find(&root, &dir.into_path(), &None).is_err());
Ok(())
}
}