mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Assume HTML in case there is no extension (e.g. for URLs) (#197)
This commit is contained in:
parent
c62a44aa30
commit
f66aaecf0f
12 changed files with 122 additions and 23 deletions
39
Cargo.lock
generated
39
Cargo.lock
generated
|
|
@ -24,6 +24,15 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.40"
|
||||
|
|
@ -416,7 +425,7 @@ version = "2.33.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"ansi_term 0.11.0",
|
||||
"atty",
|
||||
"bitflags",
|
||||
"strsim 0.8.0",
|
||||
|
|
@ -609,6 +618,12 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diff"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
|
||||
|
||||
[[package]]
|
||||
name = "difference"
|
||||
version = "2.0.0"
|
||||
|
|
@ -1399,6 +1414,7 @@ dependencies = [
|
|||
"openssl-sys",
|
||||
"pad",
|
||||
"predicates",
|
||||
"pretty_assertions",
|
||||
"pulldown-cmark",
|
||||
"regex",
|
||||
"reqwest",
|
||||
|
|
@ -1676,6 +1692,15 @@ dependencies = [
|
|||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "output_vt100"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "53cdc5b785b7a58c5aad8216b3dfa114df64b0b06ae6e1501cef91df2fbdf8f9"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pad"
|
||||
version = "0.1.6"
|
||||
|
|
@ -1863,6 +1888,18 @@ dependencies = [
|
|||
"treeline",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pretty_assertions"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f297542c27a7df8d45de2b0e620308ab883ad232d06c14b76ac3e144bda50184"
|
||||
dependencies = [
|
||||
"ansi_term 0.12.1",
|
||||
"ctor",
|
||||
"diff",
|
||||
"output_vt100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ predicates = "1.0.7"
|
|||
uuid = { version = "0.8.2", features = ["v4"] }
|
||||
tempfile = "3.2.0"
|
||||
doc-comment = "0.3.3"
|
||||
pretty_assertions = "0.7.1"
|
||||
|
||||
[features]
|
||||
vendored-openssl = ["openssl-sys/vendored"]
|
||||
|
|
|
|||
|
|
@ -240,6 +240,8 @@ fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
|
|||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use http::StatusCode;
|
||||
use reqwest::header;
|
||||
|
||||
|
|
|
|||
|
|
@ -125,6 +125,7 @@ mod test_super {
|
|||
use lychee::{test_utils::website, Status};
|
||||
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn test_stats_is_empty() {
|
||||
|
|
|
|||
|
|
@ -298,6 +298,7 @@ pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {
|
|||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::time::{Duration, Instant};
|
||||
use wiremock::matchers::method;
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
|
|
|||
|
|
@ -134,11 +134,18 @@ impl Input {
|
|||
}
|
||||
|
||||
async fn url_contents(url: &Url) -> Result<InputContent> {
|
||||
// Assume HTML for default paths
|
||||
let file_type = if url.path().is_empty() || url.path() == "/" {
|
||||
FileType::Html
|
||||
} else {
|
||||
FileType::from(url.as_str())
|
||||
};
|
||||
|
||||
let res = reqwest::get(url.clone()).await?;
|
||||
let content = res.text().await?;
|
||||
let input_content = InputContent {
|
||||
input: Input::RemoteUrl(url.clone()),
|
||||
file_type: FileType::from(url.as_str()),
|
||||
file_type,
|
||||
content,
|
||||
};
|
||||
|
||||
|
|
@ -251,6 +258,8 @@ pub async fn collect_links(
|
|||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use crate::{
|
||||
test_utils::{get_mock_server_with_content, website},
|
||||
Uri,
|
||||
|
|
@ -259,14 +268,39 @@ mod test {
|
|||
use std::io::Write;
|
||||
use std::str::FromStr;
|
||||
|
||||
const TEST_STRING: &str = "http://test-string.com";
|
||||
const TEST_URL: &str = "https://test-url.org";
|
||||
const TEST_FILE: &str = "https://test-file.io";
|
||||
const TEST_GLOB_1: &str = "https://test-glob-1.io";
|
||||
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
|
||||
#[tokio::test]
|
||||
async fn test_file_without_extension_is_plaintext() -> Result<()> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
// Treat as plaintext file (no extension)
|
||||
let file_path = dir.path().join("README");
|
||||
let _file = File::create(&file_path)?;
|
||||
let input = Input::new(&file_path.as_path().display().to_string(), true);
|
||||
let contents = input.get_contents(None, true).await?;
|
||||
|
||||
assert_eq!(contents.len(), 1);
|
||||
assert_eq!(contents[0].file_type, FileType::Plaintext);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_without_extension_is_html() -> Result<()> {
|
||||
let input = Input::new("https://example.org/", true);
|
||||
let contents = input.get_contents(None, true).await?;
|
||||
|
||||
println!("{:?}", contents);
|
||||
assert_eq!(contents.len(), 1);
|
||||
assert_eq!(contents[0].file_type, FileType::Html);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_collect_links() -> Result<()> {
|
||||
const TEST_STRING: &str = "http://test-string.com";
|
||||
const TEST_URL: &str = "https://test-url.org";
|
||||
const TEST_FILE: &str = "https://test-file.io";
|
||||
const TEST_GLOB_1: &str = "https://test-glob-1.io";
|
||||
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
|
||||
|
||||
let dir = tempfile::tempdir()?;
|
||||
let file_path = dir.path().join("f");
|
||||
let file_glob_1_path = dir.path().join("glob-1");
|
||||
|
|
@ -293,18 +327,18 @@ mod test {
|
|||
];
|
||||
|
||||
let responses = collect_links(&inputs, None, false, 8).await?;
|
||||
let links = responses
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect::<HashSet<Uri>>();
|
||||
let mut links = responses.into_iter().map(|r| r.uri).collect::<Vec<Uri>>();
|
||||
|
||||
let mut expected_links: HashSet<Uri> = HashSet::new();
|
||||
expected_links.insert(website(TEST_STRING));
|
||||
expected_links.insert(website(TEST_URL));
|
||||
expected_links.insert(website(TEST_FILE));
|
||||
expected_links.insert(website(TEST_GLOB_1));
|
||||
expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string()));
|
||||
let mut expected_links: Vec<Uri> = vec![
|
||||
website(TEST_STRING),
|
||||
website(TEST_URL),
|
||||
website(TEST_FILE),
|
||||
website(TEST_GLOB_1),
|
||||
Uri::Mail(TEST_GLOB_2_MAIL.to_string()),
|
||||
];
|
||||
|
||||
links.sort();
|
||||
expected_links.sort();
|
||||
assert_eq!(links, expected_links);
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -77,15 +77,13 @@ fn extract_links_from_html(input: &str) -> Vec<String> {
|
|||
fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
||||
match node.data {
|
||||
NodeData::Text { ref contents } => {
|
||||
// escape_default turns tab characters into "\t", newlines into "\n", etc.
|
||||
let esc_contents = contents.borrow().escape_default().to_string();
|
||||
for link in extract_links_from_plaintext(&esc_contents) {
|
||||
for link in extract_links_from_plaintext(&contents.borrow()) {
|
||||
urls.push(link);
|
||||
}
|
||||
}
|
||||
|
||||
NodeData::Comment { ref contents } => {
|
||||
for link in extract_links_from_plaintext(&contents.escape_default().to_string()) {
|
||||
for link in extract_links_from_plaintext(contents) {
|
||||
urls.push(link);
|
||||
}
|
||||
}
|
||||
|
|
@ -96,7 +94,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
|||
..
|
||||
} => {
|
||||
for attr in attrs.borrow().iter() {
|
||||
let attr_value = attr.value.escape_default().to_string();
|
||||
let attr_value = attr.value.to_string();
|
||||
|
||||
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
urls.push(attr_value);
|
||||
|
|
@ -181,6 +179,7 @@ mod test {
|
|||
use crate::test_utils::website;
|
||||
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Read};
|
||||
|
||||
|
|
@ -204,6 +203,8 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn test_file_type() {
|
||||
// Assume Plaintext in case there is no extension
|
||||
assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
|
||||
assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown);
|
||||
assert_eq!(
|
||||
FileType::from(Path::new("test.markdown")),
|
||||
|
|
@ -221,6 +222,21 @@ mod test {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_link_at_end_of_line() {
|
||||
let link = "http://www.apache.org/licenses/LICENSE-2.0";
|
||||
let input = format!("{}\n", link);
|
||||
|
||||
let found = extract_links_from_markdown(&input);
|
||||
assert_eq!(vec![link], found);
|
||||
|
||||
let found = extract_links_from_plaintext(&input);
|
||||
assert_eq!(vec![link], found);
|
||||
|
||||
let found = extract_links_from_html(&input);
|
||||
assert_eq!(vec![link], found);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_markdown_links() {
|
||||
let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
|
||||
|
|
|
|||
|
|
@ -102,6 +102,7 @@ mod test {
|
|||
use reqwest::Url;
|
||||
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use crate::{test_utils::website, Input};
|
||||
|
||||
|
|
|
|||
|
|
@ -76,6 +76,7 @@ impl Quirks {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn test_twitter_request() {
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ use std::{convert::TryFrom, fmt::Display};
|
|||
use url::Url;
|
||||
|
||||
/// Lychee's own representation of a URI, which encapsulates all support formats
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[derive(Clone, PartialOrd, Ord, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum Uri {
|
||||
/// Website URL
|
||||
Website(Url),
|
||||
|
|
@ -78,6 +78,7 @@ mod test {
|
|||
use crate::test_utils::website;
|
||||
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
#[cfg(test)]
|
||||
mod cli {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use anyhow::Result;
|
||||
use assert_cmd::Command;
|
||||
use lychee::test_utils;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
#[cfg(test)]
|
||||
mod readme {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use assert_cmd::Command;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Read};
|
||||
|
|
|
|||
Loading…
Reference in a new issue