Fix HTML parsing for non-closed elements like <link> (#92)

* Fix HTML parsing for non-closed elements like <link>

The XML parser we use requires all tags to be closed by default,
and if they aren't (like HTML5 <link> elements), it simply gives up
on further parsing.  This change makes it ignore such issues.

Also uncovered a bug in the current parser: it simply won't parse
elements like `<script defer src="..."></script>` -- i.e. elements
whose attributes have no value.

The XML parser is still fundamentally an XML parser and will have to be
replaced with an HTML-aware parser in the future.

* Add check for empty elements

* Update extract.rs

Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
Paweł Romanowski 2021-01-03 17:32:13 +01:00 committed by GitHub
parent fa9c5ea2cf
commit cd00fa643e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 101 additions and 1 deletions

23
fixtures/TEST_HTML5.html Normal file
View file

@ -0,0 +1,23 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="home" href="https://example.com/head/home">
<title>Test</title>
<meta name="description" content="Test HTML5 parsing (not valid XML)">
<!-- The links below have no closing tags (not valid XML) -->
<link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
<link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
<link rel="stylesheet" type="text/css" href="css/style_relative_url.css">
<!-- The defer attribute has no value (not valid XML) -->
<script defer src="js/script.js"></script>
</head>
<body>
Hello world.
<a href="https://example.com/body/a">Link in body</a>
<!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
<div><a href="https://example.com/body/div_empty_a"/></div>
</body>
</html>

View file

@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
// Extracting unparsed URL strings from a HTML string
fn extract_links_from_html(input: &str) -> Vec<String> {
let mut reader = Reader::from_str(input);
// allow not well-formed XML documents, which contain non-closed elements
// (e.g. HTML5 which has things like `<link>`)
reader.check_end_names(false);
let mut buf = Vec::new();
let mut urls = Vec::new();
while let Ok(e) = reader.read_event(&mut buf) {
match e {
HTMLEvent::Start(ref e) => {
HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => {
for attr in e.attributes() {
if let Ok(attr) = attr {
match (attr.key, e.name()) {
@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>)
#[cfg(test)]
mod test {
use super::*;
use std::fs::File;
use std::io::{BufReader, Read};
#[test]
fn test_extract_markdown_links() {
@ -248,4 +256,73 @@ mod test {
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
}
#[test]
fn test_extract_html5_not_valid_xml() {
let test_html5 = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_HTML5.html");
let file = File::open(test_html5).expect("Unable to open test file");
let mut buf_reader = BufReader::new(file);
let mut input = String::new();
buf_reader
.read_to_string(&mut input)
.expect("Unable to read test file contents");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let expected_links = [
Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
// the body links wouldn't be present if the file was parsed strictly as XML
Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
]
.iter()
.cloned()
.collect();
assert_eq!(links, expected_links);
}
#[test]
fn test_extract_html5_not_valid_xml_relative_links() {
    // Same fixture as above, but with a base URL supplied so that relative
    // links (icons, stylesheets with relative hrefs) are resolved too.
    //
    // `module_path!()` yields a Rust module path ("crate::…"), not a
    // filesystem path; the old `Path::new(module_path!()).parent()` only
    // worked by accident (it collapsed to "" and resolved relative to the
    // cwd). Anchor at the crate root, where `fixtures/` actually lives.
    let test_html5 = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("fixtures")
        .join("TEST_HTML5.html");
    // One call replaces the File + BufReader + read_to_string dance.
    let input =
        std::fs::read_to_string(&test_html5).expect("Unable to read test file contents");

    let links = extract_links(
        &InputContent::from_string(&input, FileType::HTML),
        Some(Url::parse("https://example.com").unwrap()),
    );

    let expected_links = [
        Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
        Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
        // TODO BUG: the JS link is missing because the parser can't properly deal
        // with `<script defer src="..."></script>` (tags that have attributes with no value)
        // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),
        // the body links wouldn't be present if the file was parsed strictly as XML
        Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
        Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
    ]
    .iter()
    .cloned()
    .collect();

    assert_eq!(links, expected_links);
}
}