Fix HTML parsing for non-closed elements like <link> (#92)

* Fix HTML parsing for non-closed elements like <link>

The XML parser we use requires all tags to be closed by default,
and if they aren't (like HTML5 <link> elements), it simply gives up
on further parsing.  This change makes it ignore such issues.

Also uncovered a bug in the current parser: it simply won't parse
elements like `<script defer src="..."></script>` -- i.e. elements
whose attributes have no value.

The XML parser is still fundamentally an XML parser and will have to be
replaced with an HTML-aware parser in the future.

* Add check for empty elements

* Update extract.rs

Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
Paweł Romanowski 2021-01-03 17:32:13 +01:00 committed by GitHub
parent fa9c5ea2cf
commit cd00fa643e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 101 additions and 1 deletions

23
fixtures/TEST_HTML5.html Normal file
View file

@ -0,0 +1,23 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="home" href="https://example.com/head/home">
<title>Test</title>
<meta name="description" content="Test HTML5 parsing (not valid XML)">
<!-- The links below have no closing tags (not valid XML) -->
<link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
<link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
<link rel="stylesheet" type="text/css" href="css/style_relative_url.css">
<!-- The defer attribute has no value (not valid XML) -->
<script defer src="js/script.js"></script>
</head>
<body>
Hello world.
<a href="https://example.com/body/a">Link in body</a>
<!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
<div><a href="https://example.com/body/div_empty_a"/></div>
</body>
</html>

View file

@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
// Extracting unparsed URL strings from a HTML string
fn extract_links_from_html(input: &str) -> Vec<String> {
let mut reader = Reader::from_str(input);
// allow not well-formed XML documents, which contain non-closed elements
// (e.g. HTML5 which has things like `<link>`)
reader.check_end_names(false);
let mut buf = Vec::new();
let mut urls = Vec::new();
while let Ok(e) = reader.read_event(&mut buf) {
match e {
HTMLEvent::Start(ref e) => {
HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => {
for attr in e.attributes() {
if let Ok(attr) = attr {
match (attr.key, e.name()) {
@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>)
#[cfg(test)]
mod test {
use super::*;
use std::fs::File;
use std::io::{BufReader, Read};
#[test]
fn test_extract_markdown_links() {
@ -248,4 +256,73 @@ mod test {
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
}
#[test]
fn test_extract_html5_not_valid_xml() {
let test_html5 = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_HTML5.html");
let file = File::open(test_html5).expect("Unable to open test file");
let mut buf_reader = BufReader::new(file);
let mut input = String::new();
buf_reader
.read_to_string(&mut input)
.expect("Unable to read test file contents");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let expected_links = [
Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
// the body links wouldn't be present if the file was parsed strictly as XML
Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
]
.iter()
.cloned()
.collect();
assert_eq!(links, expected_links);
}
#[test]
fn test_extract_html5_not_valid_xml_relative_links() {
    // Same fixture as above, but with a base URL supplied so that relative
    // links (icons, stylesheets with relative hrefs) are resolved too.
    //
    // `module_path!()` yields a Rust module path ("crate::…"), not a
    // filesystem path; the old `Path::new(module_path!()).parent()` only
    // worked by accident (it collapsed to "" and resolved relative to the
    // cwd). Anchor at the crate root, where `fixtures/` actually lives.
    let test_html5 = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("fixtures")
        .join("TEST_HTML5.html");
    // One call replaces the File + BufReader + read_to_string dance.
    let input =
        std::fs::read_to_string(&test_html5).expect("Unable to read test file contents");

    let links = extract_links(
        &InputContent::from_string(&input, FileType::HTML),
        Some(Url::parse("https://example.com").unwrap()),
    );

    let expected_links = [
        Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
        Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
        // TODO BUG: the JS link is missing because the parser can't properly deal
        // with `<script defer src="..."></script>` (tags that have attributes with no value)
        // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),
        // the body links wouldn't be present if the file was parsed strictly as XML
        Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
        Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
    ]
    .iter()
    .cloned()
    .collect();

    assert_eq!(links, expected_links);
}
}