Exclude <script> tags by default (#848)

This is a naive approach to excluding `<script>` tags from
link checking. The reason is that the contents of this tag lead to
a lot of false positives (e.g. `//unpkg.com/docsify-edit-on-github@1`
within a script block gets detected as an e-mail address).

A more thorough approach would be the use of a tree-builder in
html5gum and html5ever, but this could have a negative performance
impact.

I also did not want to add a new flag (e.g. `--include-scripts`) for this
setting because the current set of flags around exclusion/inclusion is
already quite long.

Fixes #821.
This commit is contained in:
Matthias 2022-11-29 00:38:43 +01:00 committed by GitHub
parent 982d978e47
commit 9eeea250cd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 2 deletions

View file

@ -319,7 +319,6 @@ mod tests {
website("https://example.com/css/style_relative_url.css"),
website("https://example.com/head/home"),
website("https://example.com/images/icon.png"),
website("https://example.com/js/script.js"),
]);
assert_eq!(links, expected_links);

View file

@ -228,4 +228,21 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}
/// A URL that only occurs inside a `<script>` element must not be reported,
/// while the `href` of a regular anchor in the same document still is.
#[test]
fn test_exclude_script_tags() {
    // The fixture lines stay flush-left so the raw string contents are unchanged.
    let html = r#"
<script>
var foo = "https://example.com";
</script>
<a href="https://example.org">i'm fine</a>
"#;

    // Only the anchor's link is expected; the one in the script body is dropped.
    let anchor_link = RawUri {
        text: String::from("https://example.org"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    };

    let extracted = extract_html(html, false);
    assert_eq!(extracted, vec![anchor_link]);
}
}

View file

@ -316,4 +316,21 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}
/// A URL that only occurs inside a `<script>` element must not be reported,
/// while the `href` of a regular anchor in the same document still is.
#[test]
fn test_exclude_script_tags() {
    // The fixture lines stay flush-left so the raw string contents are unchanged.
    let html = r#"
<script>
var foo = "https://example.com";
</script>
<a href="https://example.org">i'm fine</a>
"#;

    // Only the anchor's link is expected; the one in the script body is dropped.
    let anchor_link = RawUri {
        text: String::from("https://example.org"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    };

    let extracted = extract_html(html, false);
    assert_eq!(extracted, vec![anchor_link]);
}
}

View file

@ -11,10 +11,13 @@ use plaintext::extract_plaintext;
/// Check if the given element is in the list of preformatted ("verbatim") tags.
///
/// These will be excluded from link checking by default.
///
/// The name is compared by exact string match against the list below, so
/// the check is case-sensitive and an unknown or empty name yields `false`.
// Including the <script> tag is debatable, but the alternative is to
// have a separate list of tags which need a separate config setting and that
// seems worse.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    matches!(
        name,
        "code" | "listing" | "plaintext" | "samp" | "script" | "textarea" | "xmp" | "pre"
    )
}