mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-07 06:51:00 +00:00
Fix HTML parsing for non-closed elements like <link> (#92)
* Fix HTML parsing for non-closed elements like <link> The XML parser we use requires all tags to be closed by default, and if they aren't (as with HTML5 <link> elements), it simply gives up on further parsing. This change makes it ignore such issues. It also uncovers a bug in the current parser: it won't parse elements like `<script defer src="..."></script>`, i.e. elements that have an attribute with no value. The parser is an XML parser and will have to be replaced with an HTML-aware parser in the future. * Add check for empty elements * Update extract.rs Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
parent
fa9c5ea2cf
commit
cd00fa643e
2 changed files with 101 additions and 1 deletions
23
fixtures/TEST_HTML5.html
Normal file
23
fixtures/TEST_HTML5.html
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<link rel="home" href="https://example.com/head/home">
|
||||
<title>Test</title>
|
||||
<meta name="description" content="Test HTML5 parsing (not valid XML)">
|
||||
|
||||
<!-- The links below have no closing tags (not valid XML) -->
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
|
||||
<link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
|
||||
<link rel="stylesheet" type="text/css" href="css/style_relative_url.css">
|
||||
|
||||
<!-- The defer attribute has no value (not valid XML) -->
|
||||
<script defer src="js/script.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
Hello world.
|
||||
<a href="https://example.com/body/a">Link in body</a>
|
||||
<!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
|
||||
<div><a href="https://example.com/body/div_empty_a"/></div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -60,11 +60,17 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
|||
// Extract unparsed URL strings from an HTML string
|
||||
fn extract_links_from_html(input: &str) -> Vec<String> {
|
||||
let mut reader = Reader::from_str(input);
|
||||
|
||||
// allow not well-formed XML documents, which contain non-closed elements
|
||||
// (e.g. HTML5 which has things like `<link>`)
|
||||
reader.check_end_names(false);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let mut urls = Vec::new();
|
||||
|
||||
while let Ok(e) = reader.read_event(&mut buf) {
|
||||
match e {
|
||||
HTMLEvent::Start(ref e) => {
|
||||
HTMLEvent::Start(ref e) | HTMLEvent::Empty(ref e) => {
|
||||
for attr in e.attributes() {
|
||||
if let Ok(attr) = attr {
|
||||
match (attr.key, e.name()) {
|
||||
|
|
@ -161,6 +167,8 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>)
|
|||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Read};
|
||||
|
||||
#[test]
|
||||
fn test_extract_markdown_links() {
|
||||
|
|
@ -248,4 +256,73 @@ mod test {
|
|||
assert!(links.len() == 1);
|
||||
assert_eq!(links[0].as_str(), expected);
|
||||
}
|
||||
|
||||
/// Reads a file from the `fixtures` directory into a `String`.
///
/// The directory is resolved against `CARGO_MANIFEST_DIR` when Cargo provides
/// it at compile time, so the lookup no longer depends on the process working
/// directory. (`module_path!()` yields a `::`-separated module name rather than
/// a filesystem path, so deriving the location from it only worked because its
/// `parent()` is the empty path.) Without Cargo the path falls back to
/// `fixtures/<name>` relative to the current directory, as before.
///
/// # Panics
///
/// Panics if the fixture cannot be opened or read; a missing fixture is a
/// broken test setup, not a recoverable condition.
fn load_fixture(name: &str) -> String {
    let path = Path::new(option_env!("CARGO_MANIFEST_DIR").unwrap_or(""))
        .join("fixtures")
        .join(name);

    let file = File::open(&path).expect("Unable to open test file");
    let mut input = String::new();
    BufReader::new(file)
        .read_to_string(&mut input)
        .expect("Unable to read test file contents");
    input
}

/// HTML5 that is not well-formed XML (unclosed `<link>` tags) must still yield
/// the links that follow the offending elements. No base URL is given, and the
/// expected set contains only the absolute URLs from the fixture.
#[test]
fn test_extract_html5_not_valid_xml() {
    let input = load_fixture("TEST_HTML5.html");

    let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
    let expected_links = [
        Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
        // the body links wouldn't be present if the file was parsed strictly as XML
        Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
        Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
    ]
    .iter()
    .cloned()
    .collect();

    assert_eq!(links, expected_links);
}

/// Same fixture as above, but with a base URL: the relative `href`s are
/// expected to show up resolved against `https://example.com`.
#[test]
fn test_extract_html5_not_valid_xml_relative_links() {
    let input = load_fixture("TEST_HTML5.html");

    let links = extract_links(
        &InputContent::from_string(&input, FileType::HTML),
        Some(Url::parse("https://example.com").unwrap()),
    );
    let expected_links = [
        Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
        Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
        Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
        // TODO BUG: the JS link is missing because the parser can't properly deal
        // with `<script defer src="..."></script>` (tags that have attributes with no value)
        // Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),

        // the body links wouldn't be present if the file was parsed strictly as XML
        Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
        Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
    ]
    .iter()
    .cloned()
    .collect();

    assert_eq!(links, expected_links);
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue