From cd00fa643e96afd0f68be806b9bb9b3d0275d700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Sun, 3 Jan 2021 17:32:13 +0100 Subject: [PATCH] Fix HTML parsing for non-closed elements like <link> (#92) * Fix HTML parsing for non-closed elements like <link> The XML parser we use requires all tags to be closed by default, and if they aren't (like HTML5 elements), it simply gives up on further parsing. This change makes it ignore such issues. Also uncover a bug with the current parser (it simply won't parse elements like `<script defer src="...">`) -- i.e. elements that have attributes with no value. The XML parser is an XML parser and will have to be replaced with an HTML-aware parser in the future. * Add check for empty elements * Update extract.rs Co-authored-by: Matthias --- fixtures/TEST_HTML5.html | 23 ++++++++++++ src/extract.rs | 79 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 fixtures/TEST_HTML5.html diff --git a/fixtures/TEST_HTML5.html b/fixtures/TEST_HTML5.html new file mode 100644 index 0000000..f9ff015 --- /dev/null +++ b/fixtures/TEST_HTML5.html @@ -0,0 +1,23 @@ + + + + + + Test + + + + + + + + + + + + Hello world. + Link in body + +
+ + diff --git a/src/extract.rs b/src/extract.rs index 6e7fea7..5f60a82 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec { // Extracting unparsed URL strings from a HTML string fn extract_links_from_html(input: &str) -> Vec { let mut reader = Reader::from_str(input); + + // allow not well-formed XML documents, which contain non-closed elements + // (e.g. HTML5 which has things like `<meta>`) + reader.check_end_names(false); + let mut buf = Vec::new(); let mut urls = Vec::new(); + while let Ok(e) = reader.read_event(&mut buf) { match e { - HTMLEvent::Start(ref e) => { + HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => { for attr in e.attributes() { if let Ok(attr) = attr { match (attr.key, e.name()) { @@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option) #[cfg(test)] mod test { use super::*; + use std::fs::File; + use std::io::{BufReader, Read}; #[test] fn test_extract_markdown_links() { @@ -248,4 +256,73 @@ mod test { assert!(links.len() == 1); assert_eq!(links[0].as_str(), expected); } + + #[test] + fn test_extract_html5_not_valid_xml() { + let test_html5 = Path::new(module_path!()) + .parent() + .unwrap() + .join("fixtures") + .join("TEST_HTML5.html"); + + let file = File::open(test_html5).expect("Unable to open test file"); + let mut buf_reader = BufReader::new(file); + let mut input = String::new(); + buf_reader + .read_to_string(&mut input) + .expect("Unable to read test file contents"); + + let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None); + let expected_links = [ + Uri::Website(Url::parse("https://example.com/head/home").unwrap()), + Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()), + // the body links wouldn't be present if the file was parsed strictly as XML + Uri::Website(Url::parse("https://example.com/body/a").unwrap()), + 
Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()), + ] + .iter() + .cloned() + .collect(); + + assert_eq!(links, expected_links); + } + + #[test] + fn test_extract_html5_not_valid_xml_relative_links() { + let test_html5 = Path::new(module_path!()) + .parent() + .unwrap() + .join("fixtures") + .join("TEST_HTML5.html"); + + let file = File::open(test_html5).expect("Unable to open test file"); + let mut buf_reader = BufReader::new(file); + let mut input = String::new(); + buf_reader + .read_to_string(&mut input) + .expect("Unable to read test file contents"); + + let links = extract_links( + &InputContent::from_string(&input, FileType::HTML), + Some(Url::parse("https://example.com").unwrap()), + ); + let expected_links = [ + Uri::Website(Url::parse("https://example.com/head/home").unwrap()), + Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()), + Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()), + Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()), + // TODO BUG: the JS link is missing because the parser can't properly deal + // with `<script defer src="...">` (tags that have attributes with no value) + // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()), + + // the body links wouldn't be present if the file was parsed strictly as XML + Uri::Website(Url::parse("https://example.com/body/a").unwrap()), + Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()), + ] + .iter() + .cloned() + .collect(); + + assert_eq!(links, expected_links); + } }