diff --git a/fixtures/TEST_HTML5.html b/fixtures/TEST_HTML5.html
new file mode 100644
index 0000000..f9ff015
--- /dev/null
+++ b/fixtures/TEST_HTML5.html
@@ -0,0 +1,23 @@
+
+
+
+
+
+ Test
+
+
+
+
+
+
+
+
+
+
+
+ Hello world.
+ Link in body
+
+
+
+
diff --git a/src/extract.rs b/src/extract.rs
index 6e7fea7..5f60a82 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec {
// Extracting unparsed URL strings from a HTML string
fn extract_links_from_html(input: &str) -> Vec {
let mut reader = Reader::from_str(input);
+
+ // allow not well-formed XML documents, which contain non-closed elements
+ // (e.g. HTML5 which has things like ``)
+ reader.check_end_names(false);
+
let mut buf = Vec::new();
let mut urls = Vec::new();
+
while let Ok(e) = reader.read_event(&mut buf) {
match e {
- HTMLEvent::Start(ref e) => {
+ HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => {
for attr in e.attributes() {
if let Ok(attr) = attr {
match (attr.key, e.name()) {
@@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option)
#[cfg(test)]
mod test {
use super::*;
+ use std::fs::File;
+ use std::io::{BufReader, Read};
#[test]
fn test_extract_markdown_links() {
@@ -248,4 +256,73 @@ mod test {
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
}
+
+ #[test]
+ fn test_extract_html5_not_valid_xml() {
+ let test_html5 = Path::new(module_path!())
+ .parent()
+ .unwrap()
+ .join("fixtures")
+ .join("TEST_HTML5.html");
+
+ let file = File::open(test_html5).expect("Unable to open test file");
+ let mut buf_reader = BufReader::new(file);
+ let mut input = String::new();
+ buf_reader
+ .read_to_string(&mut input)
+ .expect("Unable to read test file contents");
+
+ let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
+ let expected_links = [
+ Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
+ Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
+ // the body links wouldn't be present if the file was parsed strictly as XML
+ Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
+ Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
+ ]
+ .iter()
+ .cloned()
+ .collect();
+
+ assert_eq!(links, expected_links);
+ }
+
+ #[test]
+ fn test_extract_html5_not_valid_xml_relative_links() {
+ let test_html5 = Path::new(module_path!())
+ .parent()
+ .unwrap()
+ .join("fixtures")
+ .join("TEST_HTML5.html");
+
+ let file = File::open(test_html5).expect("Unable to open test file");
+ let mut buf_reader = BufReader::new(file);
+ let mut input = String::new();
+ buf_reader
+ .read_to_string(&mut input)
+ .expect("Unable to read test file contents");
+
+ let links = extract_links(
+ &InputContent::from_string(&input, FileType::HTML),
+ Some(Url::parse("https://example.com").unwrap()),
+ );
+ let expected_links = [
+ Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
+ Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
+ Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
+ Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
+ // TODO BUG: the JS link is missing because the parser can't properly deal
+ // with `` (tags that have attributes with no value)
+ // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),
+
+ // the body links wouldn't be present if the file was parsed strictly as XML
+ Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
+ Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
+ ]
+ .iter()
+ .cloned()
+ .collect();
+
+ assert_eq!(links, expected_links);
+ }
}