mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Added email syntax check.
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3960 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
7214943f38
commit
2e918a7b7a
5 changed files with 214 additions and 86 deletions
|
|
@ -1,3 +1,8 @@
|
|||
5.1 "" (released xx.xx.2009)
|
||||
|
||||
* Added email syntax checking.
|
||||
Closes: SF bug #2595437
|
||||
|
||||
5.0.2 "All the boys love Mandy Lane" (released 13.2.2009)
|
||||
|
||||
* Properly detect location of the log configuration file in the Windows
|
||||
|
|
|
|||
|
|
@ -45,12 +45,14 @@ checking. All connection check types are described below.
|
|||
If one address fails, the whole list will fail.
|
||||
For each mail address we check the following things:
|
||||
|
||||
1) Look up the MX DNS records. If we found no MX record,
|
||||
1) Check the address syntax, both of the part before and after
|
||||
the @ sign.
|
||||
2) Look up the MX DNS records. If we found no MX record,
|
||||
print an error.
|
||||
2) Check if one of the mail hosts accept an SMTP connection.
|
||||
3) Check if one of the mail hosts accept an SMTP connection.
|
||||
Check hosts with higher priority first.
|
||||
If no host accepts SMTP, we print a warning.
|
||||
3) Try to verify the address with the VRFY command. If we got
|
||||
4) Try to verify the address with the VRFY command. If we got
|
||||
an answer, print the verified address as an info.
|
||||
|
||||
- FTP links (``ftp:``)
|
||||
|
|
|
|||
|
|
@ -91,7 +91,6 @@ WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
|
|||
WARN_HTTP_DECOMPRESS_ERROR = "http-decompress-error"
|
||||
WARN_HTTP_UNSUPPORTED_ENCODING = "http-unsupported-encoding"
|
||||
WARN_IGNORE_URL = "ignore-url"
|
||||
WARN_MAIL_NO_ADDRESSES = "mail-no-addresses"
|
||||
WARN_MAIL_NO_MX_HOST = "mail-no-mx-host"
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS = "mail-unverified-address"
|
||||
WARN_MAIL_NO_CONNECTION = "mail-no-connection"
|
||||
|
|
@ -128,7 +127,6 @@ Warnings = {
|
|||
WARN_HTTP_UNSUPPORTED_ENCODING:
|
||||
_("The URL content is encoded with an unknown encoding."),
|
||||
WARN_IGNORE_URL: _("The URL has been ignored."),
|
||||
WARN_MAIL_NO_ADDRESSES: _("The mailto: URL contained no addresses."),
|
||||
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS:
|
||||
_("The mailto: address could not be verified."),
|
||||
|
|
|
|||
|
|
@ -19,38 +19,43 @@ Handle for mailto: links.
|
|||
"""
|
||||
|
||||
import cgi
|
||||
import re
|
||||
import urllib
|
||||
import smtplib
|
||||
import email.Utils
|
||||
from email._parseaddr import AddressList
|
||||
import sys
|
||||
|
||||
from . import urlbase
|
||||
from .. import log, LOG_CHECK, strformat, LinkCheckerError, url as urlutil
|
||||
from .. import log, LOG_CHECK, strformat, url as urlutil
|
||||
from ..dns import resolver
|
||||
from .const import WARN_MAIL_NO_ADDRESSES, WARN_MAIL_NO_MX_HOST, \
|
||||
from ..network import iputil
|
||||
from .const import WARN_MAIL_NO_MX_HOST, \
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS, WARN_MAIL_NO_CONNECTION
|
||||
|
||||
|
||||
def _split_address (address):
|
||||
"""
|
||||
Split username and hostname of address. The hostname defaults
|
||||
to 'localhost' if it is not specified.
|
||||
def getaddresses (addr):
    """Extract the plain email addresses from a header-style field value.

    The value is parsed with AddressList; the display name of each parsed
    entry is dropped and entries whose address part is empty are skipped.

    @param addr: field value, e.g. u"Foo <foo@example.org>, bar@example.org"
    @return: the non-empty address parts, in parse order
    @rtype: list
    """
    found = []
    for _realname, mailaddr in AddressList(addr).addresslist:
        if not mailaddr:
            continue
        found.append(mailaddr)
    return found
|
||||
|
||||
@param address: an email address
|
||||
@type address: string
|
||||
@return: a tuple (username, hostname)
|
||||
@rtype: tuple
|
||||
@raise: LinkCheckerError if address could not be split
|
||||
"""
|
||||
split = address.split("@", 1)
|
||||
if len(split) == 2:
|
||||
if not split[1]:
|
||||
return (split[0], "localhost")
|
||||
return tuple(split)
|
||||
if len(split) == 1:
|
||||
return (split[0], "localhost")
|
||||
raise LinkCheckerError(_("Could not split the mail address"))
|
||||
|
||||
def is_quoted (addr):
    """Return True iff address string is quoted.

    A quoted string needs an opening and a closing double quote, so a
    lone quote character (which both starts and ends with a quote) does
    not count as quoted.

    @param addr: local part of a mail address
    @return: True if addr is enclosed in double quotes
    @rtype: bool
    """
    return len(addr) >= 2 and addr.startswith(u'"') and addr.endswith(u'"')
|
||||
|
||||
|
||||
def is_literal (domain):
    """Return True iff domain string is a literal.

    A literal is enclosed in square brackets, e.g. u"[127.0.0.1]".
    Strings shorter than two characters cannot hold both brackets and
    are never literals.

    @param domain: domain part of a mail address
    @return: True if domain is enclosed in square brackets
    @rtype: bool
    """
    return len(domain) >= 2 and domain.startswith(u'[') and domain.endswith(u']')
|
||||
|
||||
|
||||
# Removes every backslash-escaped pair (a backslash plus the character
# following it). Plain raw strings are used for the patterns; they work
# on unicode input as well.
_remove_quoted = re.compile(r'\\.').sub
# A double quote or a backslash. Any of these left over after the escaped
# pairs have been removed is not properly escaped.
_quotes = re.compile(r'["\\]')

def is_missing_quote (addr):
    """Return True iff the quoted string contains an unescaped double
    quote or backslash.

    The enclosing quotes (first and last character) are stripped, all
    backslash-escaped pairs are removed, and the remainder is searched
    for a leftover quote or backslash anywhere in the string, not only
    at its beginning.

    @param addr: a quoted local part, including the enclosing quotes
    @return: True if an unescaped double quote or backslash was found
    @rtype: bool
    """
    inner = _remove_quoted(u"", addr[1:-1])
    return _quotes.search(inner) is not None
|
||||
|
||||
|
||||
# list of CGI keys to search for email addresses
|
||||
EMAIL_CGI = ("to", "cc", "bcc")
|
||||
|
||||
class MailtoUrl (urlbase.UrlBase):
|
||||
"""
|
||||
|
|
@ -58,32 +63,25 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
"""
|
||||
|
||||
def build_url (self):
|
||||
"""
|
||||
Call super.build_url(), extract list of mail addresses from URL,
|
||||
"""Call super.build_url(), extract list of mail addresses from URL,
|
||||
and check their syntax.
|
||||
"""
|
||||
super(MailtoUrl, self).build_url()
|
||||
self.headers = {}
|
||||
self.addresses = email.Utils.getaddresses([self.cutout_addresses()])
|
||||
for key in ("to", "cc", "bcc"):
|
||||
if key in self.headers:
|
||||
for val in self.headers[key]:
|
||||
a = urllib.unquote(val)
|
||||
self.addresses.extend(email.Utils.getaddresses([a]))
|
||||
# check syntax of emails
|
||||
for _name, addr in self.addresses:
|
||||
domain = _split_address(addr)[1]
|
||||
if not urlutil.is_safe_domain(domain):
|
||||
raise LinkCheckerError(_("Invalid mail syntax"))
|
||||
log.debug(LOG_CHECK, "addresses: %s", self.addresses)
|
||||
self.addresses = set()
|
||||
self.parse_addresses()
|
||||
if self.addresses:
|
||||
for addr in sorted(self.addresses):
|
||||
self.check_email_syntax(addr)
|
||||
if not self.valid:
|
||||
break
|
||||
else:
|
||||
self.set_result(_("No mail addresses found in `%(url)s'.") % \
|
||||
{"url": self.url}, valid=False, overwrite=False)
|
||||
|
||||
def cutout_addresses (self):
|
||||
"""
|
||||
Parse all mail addresses out of the URL target. Additionally
|
||||
store headers.
|
||||
|
||||
@return: comma separated list of email addresses
|
||||
@rtype: string
|
||||
def parse_addresses (self):
|
||||
"""Parse all mail addresses out of the URL target. Also parses
|
||||
optional CGI headers like "?to=foo@example.org".
|
||||
Stores parsed addresses in the self.addresses set.
|
||||
"""
|
||||
# cut off leading mailto: and unquote
|
||||
url = urllib.unquote(self.base_url[7:])
|
||||
|
|
@ -108,17 +106,111 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
elif mode == 2:
|
||||
mode = 0
|
||||
if i < (len(url) - 1):
|
||||
self.addresses.update(getaddresses(url[:i]))
|
||||
try:
|
||||
headers = cgi.parse_qs(url[(i+1):], strict_parsing=True)
|
||||
for key, val in headers.items():
|
||||
self.headers.setdefault(key.lower(), []).extend(val)
|
||||
for key, vals in headers.items():
|
||||
if key.lower() in EMAIL_CGI:
|
||||
# Only the first header value is added
|
||||
self.addresses.update(getaddresses(urllib.unquote(vals[0])))
|
||||
except ValueError, err:
|
||||
self.add_warning(_("Error parsing CGI values: %s") % str(err))
|
||||
addrs = url[:i]
|
||||
else:
|
||||
addrs = url
|
||||
# addrs is comma-separated list of mails now
|
||||
return addrs
|
||||
self.addresses.update(getaddresses(url))
|
||||
log.debug(LOG_CHECK, "addresses: %s", self.addresses)
|
||||
|
||||
def check_email_syntax (self, mail):
    """Check email syntax. The relevant RFCs:
    - How to check names (memo):
      http://tools.ietf.org/html/rfc3696
    - Email address syntax
      http://tools.ietf.org/html/rfc2822
    - SMTP protocol
      http://tools.ietf.org/html/rfc5321#section-4.1.3
    - IPv6
      http://tools.ietf.org/html/rfc4291#section-2.2
    - Host syntax
      http://tools.ietf.org/html/rfc1123#section-2

    The first failed check is reported through self.set_result() with
    valid=False and overwrite=False, and checking stops there. If all
    checks pass nothing is reported.

    @param mail: a single mail address without display name
    @return: None
    """
    def invalid (msg):
        # All syntax errors are reported the same way.
        self.set_result(msg, valid=False, overwrite=False)

    # length checks

    # restrict email length to 256 characters
    # http://www.rfc-editor.org/errata_search.php?eid=1003
    if len(mail) > 256:
        invalid(_("Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars.") % \
                {"addr": mail, "length": len(mail)})
        return
    if "@" not in mail:
        invalid(_("Missing `@' in mail address `%(addr)s'.") % \
                {"addr": mail})
        return
    # note: be sure to use rsplit since "@" can occur in local part
    local, domain = mail.rsplit("@", 1)
    if not local:
        invalid(_("Missing local part of mail address `%(addr)s'.") % \
                {"addr": mail})
        return
    if not domain:
        invalid(_("Missing domain part of mail address `%(addr)s'.") % \
                {"addr": mail})
        return
    if len(local) > 64:
        invalid(_("Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars.") % \
                {"addr": mail, "length": len(local)})
        return
    # NOTE: with the 256 character limit on the whole address above and a
    # non-empty local part plus "@", the domain can be at most 254
    # characters here, so this check only triggers if those limits change.
    if len(domain) > 255:
        # report the length of the domain, not of the local part
        invalid(_("Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars.") % \
                {"addr": mail, "length": len(domain)})
        return

    # local part syntax check

    # Rules taken from http://tools.ietf.org/html/rfc3696#section-3
    if is_quoted(local):
        if is_missing_quote(local):
            invalid(_("Unquoted double quote or backslash in mail address `%(addr)s'.") % \
                    {"addr": mail})
            return
    else:
        if local.startswith(u"."):
            invalid(_("Local part of mail address `%(addr)s' may not start with a dot.") % \
                    {"addr": mail})
            return
        if local.endswith(u"."):
            invalid(_("Local part of mail address `%(addr)s' may not end with a dot.") % \
                    {"addr": mail})
            return
        if u".." in local:
            invalid(_("Local part of mail address `%(addr)s' may not contain two dots.") % \
                    {"addr": mail})
            return
        for char in u'@ \\",[]':
            # occurrences escaped with a backslash are allowed, so they
            # are removed before looking for the character
            if char in local.replace(u"\\%s"%char, u""):
                invalid(_("Local part of mail address `%(addr)s' contains unquoted character `%(char)s.") % \
                        {"addr": mail, "char": char})
                return

    # domain part syntax check

    if is_literal(domain):
        # it's an IP address
        ip = domain[1:-1]
        if ip.startswith(u"IPv6:"):
            ip = ip[5:]
        if not iputil.is_valid_ip(ip):
            invalid(_("Domain part of mail address `%(addr)s' has invalid IP.") % \
                    {"addr": mail})
            return
    else:
        # it's a domain name
        if not urlutil.is_safe_domain(domain):
            invalid(_("Invalid domain part of mail address `%(addr)s'.") % \
                    {"addr": mail})
            return
        # the last label must be non-numeric and must not be empty
        if domain.endswith(".") or domain.split(".")[-1].isdigit():
            invalid(_("Invalid top level domain part of mail address `%(addr)s'.") % \
                    {"addr": mail})
            return
|
||||
|
||||
def check_connection (self):
|
||||
"""
|
||||
|
|
@ -135,23 +227,20 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
an answer, print the verified address as an info.
|
||||
If not, print a warning.
|
||||
"""
|
||||
if not self.addresses:
|
||||
self.add_warning(_("No addresses found."),
|
||||
tag=WARN_MAIL_NO_ADDRESSES)
|
||||
return
|
||||
for name, mail in self.addresses:
|
||||
self.check_smtp_domain(name, mail)
|
||||
assert self.addresses
|
||||
for mail in sorted(self.addresses):
|
||||
self.check_smtp_domain(mail)
|
||||
if not self.valid:
|
||||
break
|
||||
|
||||
def check_smtp_domain (self, name, mail):
|
||||
def check_smtp_domain (self, mail):
|
||||
"""
|
||||
Check a single mail address.
|
||||
"""
|
||||
from ..dns.exception import DNSException
|
||||
log.debug(LOG_CHECK, "checking mail address %r", mail)
|
||||
mail = strformat.ascii_safe(mail)
|
||||
username, domain = _split_address(mail)
|
||||
username, domain = mail.rsplit('@', 1)
|
||||
log.debug(LOG_CHECK, "looking up MX mailhost %r", domain)
|
||||
try:
|
||||
answers = resolver.query(domain, 'MX')
|
||||
|
|
@ -271,7 +360,7 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
"""
|
||||
The cache key is a comma separated list of emails.
|
||||
"""
|
||||
emails = u",".join(sorted(addr[1] for addr in self.addresses))
|
||||
emails = u",".join(sorted(self.addresses))
|
||||
self.cache_url_key = u"%s:%s" % (self.scheme, emails)
|
||||
assert isinstance(self.cache_url_key, unicode), self.cache_url_key
|
||||
# cache_content_key remains None, recursion is not allowed
|
||||
|
|
|
|||
|
|
@ -17,11 +17,13 @@
|
|||
"""
|
||||
Test mail checking.
|
||||
"""
|
||||
import urllib
|
||||
from tests import has_network
|
||||
from nose import SkipTest
|
||||
from . import LinkCheckTest
|
||||
|
||||
|
||||
|
||||
class TestMail (LinkCheckTest):
|
||||
"""
|
||||
Test mailto: link checking.
|
||||
|
|
@ -37,8 +39,7 @@ class TestMail (LinkCheckTest):
|
|||
"Killer <calvin@users.sourceforge.net>?subject=bla")
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key mailto:calvin@users.sourceforge.net,"
|
||||
u"calvin@users.sourceforge.net",
|
||||
u"cache key mailto:calvin@users.sourceforge.net",
|
||||
u"real url %s" % url,
|
||||
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
|
||||
u"valid",
|
||||
|
|
@ -48,8 +49,7 @@ class TestMail (LinkCheckTest):
|
|||
"bcc=calvin%40users.sourceforge.net")
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key mailto:calvin@users.sourceforge.net,"
|
||||
u"calvin@users.sourceforge.net",
|
||||
u"cache key mailto:calvin@users.sourceforge.net",
|
||||
u"real url %s" % url,
|
||||
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
|
||||
u"valid",
|
||||
|
|
@ -81,8 +81,8 @@ class TestMail (LinkCheckTest):
|
|||
u"calvin_CC@users.sourceforge.net,calvin_cc@users.sourceforge.net",
|
||||
u"real url %s" % url,
|
||||
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
|
||||
u"warning Unverified address: 550 <calvin_cc@users.sourceforge.net> Unrouteable address.",
|
||||
u"warning Unverified address: 550 <calvin_CC@users.sourceforge.net> Unrouteable address.",
|
||||
u"warning Unverified address: 550 <calvin_cc@users.sourceforge.net> Unrouteable address.",
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
@ -129,32 +129,66 @@ class TestMail (LinkCheckTest):
|
|||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
url = self.norm(u"mailto:")
|
||||
|
||||
def mail_valid (self, addr, **kwargs):
|
||||
return self.mail_test(addr, u"valid", **kwargs)
|
||||
|
||||
def mail_error (self, addr, **kwargs):
|
||||
return self.mail_test(addr, u"error", **kwargs)
|
||||
|
||||
def mail_test (self, addr, result, cache_key=None, warning=None):
|
||||
"""Test error mails."""
|
||||
url = self.norm(addr)
|
||||
if cache_key is None:
|
||||
cache_key = url
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % url,
|
||||
u"cache key %s" % cache_key,
|
||||
u"real url %s" % url,
|
||||
u"warning No addresses found.",
|
||||
u"valid",
|
||||
]
|
||||
if warning:
|
||||
resultlines.append(u"warning %s" % warning)
|
||||
resultlines.append(result)
|
||||
self.direct(url, resultlines)
|
||||
|
||||
def test_bad_mail (self):
|
||||
"""
|
||||
Test some mailto addrs with bad syntax.
|
||||
"""
|
||||
def test_error_mail (self):
|
||||
"""Test some mailto addrs with bad syntax."""
|
||||
# too long or too short
|
||||
self.mail_error(u"mailto:")
|
||||
self.mail_error(u"mailto:@")
|
||||
self.mail_error(u"mailto:@example.org")
|
||||
self.mail_error(u"mailto:a@")
|
||||
self.mail_error(u"mailto:%s@%s" % (u"a"*60, u"b"*200))
|
||||
self.mail_error(u"mailto:%s@example.org" % (u"a"*65))
|
||||
self.mail_error(u"mailto:a@%s" % (u"a"*256))
|
||||
self.mail_error(u'mailto:a@%s.com' % (u"a"*64))
|
||||
# local part quoted
|
||||
self.mail_error(u'mailto:"a""@example.com', cache_key=u"mailto:a")
|
||||
self.mail_error(u'mailto:""a"@example.com', cache_key=u"mailto:")
|
||||
self.mail_error(u'mailto:"a\\"@example.com', cache_key=u'mailto:a"@example.com')
|
||||
# local part unquoted
|
||||
self.mail_error(u'mailto:.a@example.com')
|
||||
self.mail_error(u'mailto:a.@example.com')
|
||||
self.mail_error(u'mailto:a..b@example.com')
|
||||
# domain part
|
||||
self.mail_error(u'mailto:a@a_b.com')
|
||||
self.mail_error(u'mailto:a@example.com.')
|
||||
self.mail_error(u'mailto:a@example.com.111')
|
||||
self.mail_error(u'mailto:a@example..com')
|
||||
# other
|
||||
# ? extension forbidden in <> construct
|
||||
self.mail_error(u"mailto:Bastian Kleineidam <calvin@users.sourceforge.net?foo=bar>",
|
||||
cache_key=u"mailto:calvin@users.sourceforge.net?foo=bar")
|
||||
|
||||
def test_valid_mail (self):
|
||||
"""Test valid mail addresses."""
|
||||
if not has_network():
|
||||
raise SkipTest()
|
||||
# ? extension forbidden in <> construct
|
||||
url = self.norm(u"mailto:Bastian Kleineidam "\
|
||||
"<calvin@users.sourceforge.net?foo=bar>")
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key None",
|
||||
u"real url %s" % url,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
for char in u"!#$&'*+-/=^_`.{|}~":
|
||||
addr = u'abc%sdef@sourceforge.net' % char
|
||||
self.mail_valid(u"mailto:%s" % addr,
|
||||
warning=u"Unverified address: 550 <%s> Unrouteable address." % addr,
|
||||
cache_key=u"mailto:%s" % addr)
|
||||
|
||||
def test_unicode_mail (self):
|
||||
if not has_network():
|
||||
|
|
|
|||
Loading…
Reference in a new issue