use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use percent_encoding::percent_decode_str;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
use crate::{
helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option,
) -> Result> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(&*link) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
match create_uri_from_path(root, base, &link)? {
Some(url) => Request::new(Uri { url }, input_content.input.clone()),
None => {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
continue;
}
}
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
}
Ok(requests)
}
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
vec![StrTendril::from(url.as_ref())]
}
MDEvent::Text(txt) => extract_links_from_plaintext(&txt),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
})
.collect()
}
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
}
/// Recursively walk links in a HTML document, aggregating URL strings in `urls`.
fn walk_html_links(mut urls: &mut Vec, node: &Handle) {
match node.data {
NodeData::Text { ref contents } => {
urls.append(&mut extract_links_from_plaintext(&contents.borrow()));
}
NodeData::Comment { ref contents } => {
urls.append(&mut extract_links_from_plaintext(contents));
}
NodeData::Element {
ref name,
ref attrs,
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr.value.clone());
} else {
urls.append(&mut extract_links_from_plaintext(&attr.value));
}
}
}
_ => {}
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec {
url::find_links(input)
.iter()
.map(|l| StrTendril::from(l.as_str()))
.collect()
}
fn create_uri_from_path(src: &Path, base: &Option, dst: &str) -> Result