Assume HTML in case there is no extension (e.g. for URLs) (#197)

This commit is contained in:
Matthias 2021-04-12 14:40:39 +02:00 committed by GitHub
parent c62a44aa30
commit f66aaecf0f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 122 additions and 23 deletions

39
Cargo.lock generated
View file

@ -24,6 +24,15 @@ dependencies = [
"winapi",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "anyhow"
version = "1.0.40"
@ -416,7 +425,7 @@ version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"ansi_term",
"ansi_term 0.11.0",
"atty",
"bitflags",
"strsim 0.8.0",
@ -609,6 +618,12 @@ dependencies = [
"syn",
]
[[package]]
name = "diff"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
[[package]]
name = "difference"
version = "2.0.0"
@ -1399,6 +1414,7 @@ dependencies = [
"openssl-sys",
"pad",
"predicates",
"pretty_assertions",
"pulldown-cmark",
"regex",
"reqwest",
@ -1676,6 +1692,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "output_vt100"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53cdc5b785b7a58c5aad8216b3dfa114df64b0b06ae6e1501cef91df2fbdf8f9"
dependencies = [
"winapi",
]
[[package]]
name = "pad"
version = "0.1.6"
@ -1863,6 +1888,18 @@ dependencies = [
"treeline",
]
[[package]]
name = "pretty_assertions"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f297542c27a7df8d45de2b0e620308ab883ad232d06c14b76ac3e144bda50184"
dependencies = [
"ansi_term 0.12.1",
"ctor",
"diff",
"output_vt100",
]
[[package]]
name = "proc-macro-error"
version = "1.0.4"

View file

@ -69,6 +69,7 @@ predicates = "1.0.7"
uuid = { version = "0.8.2", features = ["v4"] }
tempfile = "3.2.0"
doc-comment = "0.3.3"
pretty_assertions = "0.7.1"
[features]
vendored-openssl = ["openssl-sys/vendored"]

View file

@ -240,6 +240,8 @@ fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
use http::StatusCode;
use reqwest::header;

View file

@ -125,6 +125,7 @@ mod test_super {
use lychee::{test_utils::website, Status};
use super::*;
use pretty_assertions::assert_eq;
#[test]
fn test_stats_is_empty() {

View file

@ -298,6 +298,7 @@ pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
use std::time::{Duration, Instant};
use wiremock::matchers::method;
use wiremock::{Mock, MockServer, ResponseTemplate};

View file

@ -134,11 +134,18 @@ impl Input {
}
async fn url_contents(url: &Url) -> Result<InputContent> {
// Assume HTML for default paths
let file_type = if url.path().is_empty() || url.path() == "/" {
FileType::Html
} else {
FileType::from(url.as_str())
};
let res = reqwest::get(url.clone()).await?;
let content = res.text().await?;
let input_content = InputContent {
input: Input::RemoteUrl(url.clone()),
file_type: FileType::from(url.as_str()),
file_type,
content,
};
@ -251,6 +258,8 @@ pub async fn collect_links(
#[cfg(test)]
mod test {
use super::*;
use pretty_assertions::assert_eq;
use crate::{
test_utils::{get_mock_server_with_content, website},
Uri,
@ -259,14 +268,39 @@ mod test {
use std::io::Write;
use std::str::FromStr;
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
const TEST_GLOB_1: &str = "https://test-glob-1.io";
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
#[tokio::test]
async fn test_file_without_extension_is_plaintext() -> Result<()> {
    // A local file whose name carries no extension must be classified as plaintext.
    let temp_dir = tempfile::tempdir()?;
    let readme_path = temp_dir.path().join("README");
    // Bind the handle so the freshly created file stays open until the test ends.
    let _readme = File::create(&readme_path)?;

    let path_arg = readme_path.display().to_string();
    let input = Input::new(&path_arg, true);
    let contents = input.get_contents(None, true).await?;

    // Exactly one piece of content is expected, typed as plaintext.
    assert_eq!(contents.len(), 1);
    assert_eq!(contents[0].file_type, FileType::Plaintext);
    Ok(())
}
/// A remote URL whose path is just `/` has no extension to inspect, so its
/// content must be classified as HTML rather than plaintext.
///
/// NOTE(review): this test requests `https://example.org/` for real, so it
/// needs network access to pass — consider serving the page from a mock
/// server instead.
#[tokio::test]
async fn test_url_without_extension_is_html() -> Result<()> {
    let input = Input::new("https://example.org/", true);
    let contents = input.get_contents(None, true).await?;

    // On failure `assert_eq!` already prints both operands, so no extra
    // debug output of `contents` is needed here.
    assert_eq!(contents.len(), 1);
    assert_eq!(contents[0].file_type, FileType::Html);
    Ok(())
}
#[tokio::test]
async fn test_collect_links() -> Result<()> {
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
const TEST_GLOB_1: &str = "https://test-glob-1.io";
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
let dir = tempfile::tempdir()?;
let file_path = dir.path().join("f");
let file_glob_1_path = dir.path().join("glob-1");
@ -293,18 +327,18 @@ mod test {
];
let responses = collect_links(&inputs, None, false, 8).await?;
let links = responses
.into_iter()
.map(|r| r.uri)
.collect::<HashSet<Uri>>();
let mut links = responses.into_iter().map(|r| r.uri).collect::<Vec<Uri>>();
let mut expected_links: HashSet<Uri> = HashSet::new();
expected_links.insert(website(TEST_STRING));
expected_links.insert(website(TEST_URL));
expected_links.insert(website(TEST_FILE));
expected_links.insert(website(TEST_GLOB_1));
expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string()));
let mut expected_links: Vec<Uri> = vec![
website(TEST_STRING),
website(TEST_URL),
website(TEST_FILE),
website(TEST_GLOB_1),
Uri::Mail(TEST_GLOB_2_MAIL.to_string()),
];
links.sort();
expected_links.sort();
assert_eq!(links, expected_links);
Ok(())

View file

@ -77,15 +77,13 @@ fn extract_links_from_html(input: &str) -> Vec<String> {
fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
// escape_default turns tab characters into "\t", newlines into "\n", etc.
let esc_contents = contents.borrow().escape_default().to_string();
for link in extract_links_from_plaintext(&esc_contents) {
for link in extract_links_from_plaintext(&contents.borrow()) {
urls.push(link);
}
}
NodeData::Comment { ref contents } => {
for link in extract_links_from_plaintext(&contents.escape_default().to_string()) {
for link in extract_links_from_plaintext(contents) {
urls.push(link);
}
}
@ -96,7 +94,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
..
} => {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.escape_default().to_string();
let attr_value = attr.value.to_string();
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
@ -181,6 +179,7 @@ mod test {
use crate::test_utils::website;
use super::*;
use pretty_assertions::assert_eq;
use std::fs::File;
use std::io::{BufReader, Read};
@ -204,6 +203,8 @@ mod test {
#[test]
fn test_file_type() {
// Assume Plaintext in case there is no extension
assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown);
assert_eq!(
FileType::from(Path::new("test.markdown")),
@ -221,6 +222,21 @@ mod test {
);
}
#[test]
fn test_extract_link_at_end_of_line() {
    // A link directly followed by a line break must come back unchanged from
    // every extractor (Markdown, plaintext, and HTML).
    let expected = "http://www.apache.org/licenses/LICENSE-2.0";
    let text = format!("{}\n", expected);

    let from_markdown = extract_links_from_markdown(&text);
    assert_eq!(vec![expected], from_markdown);

    let from_plaintext = extract_links_from_plaintext(&text);
    assert_eq!(vec![expected], from_plaintext);

    let from_html = extract_links_from_html(&text);
    assert_eq!(vec![expected], from_html);
}
#[test]
fn test_extract_markdown_links() {
let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";

View file

@ -102,6 +102,7 @@ mod test {
use reqwest::Url;
use super::*;
use pretty_assertions::assert_eq;
use crate::{test_utils::website, Input};

View file

@ -76,6 +76,7 @@ impl Quirks {
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
#[test]
fn test_twitter_request() {

View file

@ -6,7 +6,7 @@ use std::{convert::TryFrom, fmt::Display};
use url::Url;
/// Lychee's own representation of a URI, which encapsulates all support formats
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Clone, PartialOrd, Ord, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Uri {
/// Website URL
Website(Url),
@ -78,6 +78,7 @@ mod test {
use crate::test_utils::website;
use super::*;
use pretty_assertions::assert_eq;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
#[test]

View file

@ -1,5 +1,7 @@
#[cfg(test)]
mod cli {
use pretty_assertions::assert_eq;
use anyhow::Result;
use assert_cmd::Command;
use lychee::test_utils;

View file

@ -1,5 +1,7 @@
#[cfg(test)]
mod readme {
use pretty_assertions::assert_eq;
use assert_cmd::Command;
use std::fs::File;
use std::io::{BufReader, Read};