From cd00fa643e96afd0f68be806b9bb9b3d0275d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@pawroman.dev>
Date: Sun, 3 Jan 2021 17:32:13 +0100
Subject: [PATCH] Fix HTML parsing for non-closed elements like <link> (#92)

* Fix HTML parsing for non-closed elements like <link>

The XML parser we use requires all tags to be closed by default,
and if they aren't (like HTML5 <link> elements), it simply gives up
on further parsing.  This change makes it ignore such issues.

Also uncover a bug in the current parser: it simply won't parse
elements like `<script defer src="..."></script>`, i.e. elements
that have an attribute with no value.

The parser we use is strictly an XML parser and will have to be replaced
with an HTML-aware parser in the future.

* Add check for empty elements

* Update extract.rs

Co-authored-by: Matthias <matthias-endler@gmx.net>
---
 fixtures/TEST_HTML5.html | 23 ++++++++++++
 src/extract.rs           | 79 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 fixtures/TEST_HTML5.html
diff --git a/fixtures/TEST_HTML5.html b/fixtures/TEST_HTML5.html
new file mode 100644
index 0000000..f9ff015
--- /dev/null
+++ b/fixtures/TEST_HTML5.html
@@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <link rel="home" href="https://example.com/head/home">
+  <title>Test</title>
+  <meta name="description" content="Test HTML5 parsing (not valid XML)">
+
+  <!-- The links below have no closing tags (not valid XML) -->
+  <link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
+  <link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
+  <link rel="stylesheet" type="text/css" href="css/style_relative_url.css">
+
+  <!-- The defer attribute has no value (not valid XML) -->
+  <script defer src="js/script.js"></script>
+</head>
+<body>
+  Hello world.
+  <a href="https://example.com/body/a">Link in body</a>
+  <!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
+  <div><a href="https://example.com/body/div_empty_a"/></div>
+</body>
+</html>
diff --git a/src/extract.rs b/src/extract.rs
index 6e7fea7..5f60a82 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
 // Extracting unparsed URL strings from a HTML string
 fn extract_links_from_html(input: &str) -> Vec<String> {
     let mut reader = Reader::from_str(input);
+
+    // allow not well-formed XML documents, which contain non-closed elements
+    // (e.g. HTML5 which has things like `<link>`)
+    reader.check_end_names(false);
+
     let mut buf = Vec::new();
     let mut urls = Vec::new();
+
     while let Ok(e) = reader.read_event(&mut buf) {
         match e {
-            HTMLEvent::Start(ref e) => {
+            HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => {
                 for attr in e.attributes() {
                     if let Ok(attr) = attr {
                         match (attr.key, e.name()) {
@@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>)
 #[cfg(test)]
 mod test {
     use super::*;
+    use std::fs::File;
+    use std::io::{BufReader, Read};
 
     #[test]
     fn test_extract_markdown_links() {
@@ -248,4 +256,73 @@ mod test {
         assert!(links.len() == 1);
         assert_eq!(links[0].as_str(), expected);
     }
+
+    /// Links must still be extracted from HTML5 that is not well-formed XML; with no
+    /// base URL, only the fixture's absolute links are expected.
+    #[test]
+    fn test_extract_html5_not_valid_xml() {
+        // `module_path!()` names a module, not a directory, so anchor at the crate root.
+        let test_html5 = Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("fixtures")
+            .join("TEST_HTML5.html");
+        let file = File::open(test_html5).expect("Unable to open test file");
+        let mut buf_reader = BufReader::new(file);
+        let mut input = String::new();
+        buf_reader
+            .read_to_string(&mut input)
+            .expect("Unable to read test file contents");
+
+        let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
+        let expected_links = [
+            Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
+            Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
+            // the body links wouldn't be present if the file was parsed strictly as XML
+            Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
+            Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
+        ]
+        .iter()
+        .cloned()
+        .collect();
+
+        assert_eq!(links, expected_links);
+    }
+
+    /// With a base URL, the relative links of the not-well-formed-XML HTML5 fixture
+    /// are expected as well, resolved against that base.
+    #[test]
+    fn test_extract_html5_not_valid_xml_relative_links() {
+        // `module_path!()` names a module, not a directory, so anchor at the crate root.
+        let test_html5 = Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("fixtures")
+            .join("TEST_HTML5.html");
+        let file = File::open(test_html5).expect("Unable to open test file");
+        let mut buf_reader = BufReader::new(file);
+        let mut input = String::new();
+        buf_reader
+            .read_to_string(&mut input)
+            .expect("Unable to read test file contents");
+
+        let links = extract_links(
+            &InputContent::from_string(&input, FileType::HTML),
+            Some(Url::parse("https://example.com").unwrap()),
+        );
+        let expected_links = [
+            Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
+            Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
+            Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
+            Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
+            // TODO BUG: the JS link is missing because the parser can't properly deal
+            //           with `<script defer src="..."></script>` (tags that have attributes with no value)
+            // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),
+
+            // the body links wouldn't be present if the file was parsed strictly as XML
+            Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
+            Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
+        ]
+        .iter()
+        .cloned()
+        .collect();
+
+        assert_eq!(links, expected_links);
+    }
 }