Added email syntax check.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3960 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2009-02-18 15:35:23 +00:00
parent 7214943f38
commit 2e918a7b7a
5 changed files with 214 additions and 86 deletions

View file

@ -1,3 +1,8 @@
5.1 "" (released xx.xx.2009)
* Added email syntax checking.
Closes: SF bug #2595437
5.0.2 "All the boys love Mandy Lane" (released 13.2.2009)
* Properly detect location of the log configuration file in the Windows

View file

@ -45,12 +45,14 @@ checking. All connection check types are described below.
If one address fails, the whole list will fail.
For each mail address we check the following things:
1) Look up the MX DNS records. If we found no MX record,
1) Check the address syntax, both of the part before and after
the @ sign.
2) Look up the MX DNS records. If we found no MX record,
print an error.
2) Check if one of the mail hosts accept an SMTP connection.
3) Check if one of the mail hosts accepts an SMTP connection.
Check hosts with higher priority first.
If no host accepts SMTP, we print a warning.
3) Try to verify the address with the VRFY command. If we got
4) Try to verify the address with the VRFY command. If we got
an answer, print the verified address as an info.
- FTP links (``ftp:``)

View file

@ -91,7 +91,6 @@ WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
WARN_HTTP_DECOMPRESS_ERROR = "http-decompress-error"
WARN_HTTP_UNSUPPORTED_ENCODING = "http-unsupported-encoding"
WARN_IGNORE_URL = "ignore-url"
WARN_MAIL_NO_ADDRESSES = "mail-no-addresses"
WARN_MAIL_NO_MX_HOST = "mail-no-mx-host"
WARN_MAIL_UNVERIFIED_ADDRESS = "mail-unverified-address"
WARN_MAIL_NO_CONNECTION = "mail-no-connection"
@ -128,7 +127,6 @@ Warnings = {
WARN_HTTP_UNSUPPORTED_ENCODING:
_("The URL content is encoded with an unknown encoding."),
WARN_IGNORE_URL: _("The URL has been ignored."),
WARN_MAIL_NO_ADDRESSES: _("The mailto: URL contained no addresses."),
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
WARN_MAIL_UNVERIFIED_ADDRESS:
_("The mailto: address could not be verified."),

View file

@ -19,38 +19,43 @@ Handle for mailto: links.
"""
import cgi
import re
import urllib
import smtplib
import email.Utils
from email._parseaddr import AddressList
import sys
from . import urlbase
from .. import log, LOG_CHECK, strformat, LinkCheckerError, url as urlutil
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..dns import resolver
from .const import WARN_MAIL_NO_ADDRESSES, WARN_MAIL_NO_MX_HOST, \
from ..network import iputil
from .const import WARN_MAIL_NO_MX_HOST, \
WARN_MAIL_UNVERIFIED_ADDRESS, WARN_MAIL_NO_CONNECTION
def _split_address (address):
"""
Split username and hostname of address. The hostname defaults
to 'localhost' if it is not specified.
def getaddresses (addr):
    """Extract the bare email addresses from a header field value.

    The value is parsed with email._parseaddr.AddressList. Display
    names are discarded and entries with an empty address are skipped.

    @param addr: field value, possibly holding several addresses
    @return: list of address strings in parse order
    """
    mails = []
    for _name, mail in AddressList(addr).addresslist:
        if mail:
            mails.append(mail)
    return mails
@param address: an email address
@type address: string
@return: a tuple (username, hostname)
@rtype: tuple
@raise: LinkCheckerError if address could not be split
"""
split = address.split("@", 1)
if len(split) == 2:
if not split[1]:
return (split[0], "localhost")
return tuple(split)
if len(split) == 1:
return (split[0], "localhost")
raise LinkCheckerError(_("Could not split the mail address"))
def is_quoted (addr):
    """Return True iff address string is enclosed in double quotes.

    At least two characters are required: a lone double quote would
    otherwise be counted as both the opening and the closing quote.

    @param addr: the string to test (e.g. the local part of an address)
    @return: True if addr starts and ends with a distinct double quote
    """
    return len(addr) >= 2 and addr.startswith(u'"') and addr.endswith(u'"')
def is_literal (domain):
    """Return True iff domain string is a literal enclosed in brackets.

    At least two characters are required, so a single bracket
    character is never treated as a literal.

    @param domain: the domain part of a mail address
    @return: True if domain starts with u'[' and ends with u']'
    """
    return (len(domain) >= 2 and
            domain.startswith(u'[') and domain.endswith(u']'))
# Substitution function for backslash-escaped characters (a backslash
# followed by any single character).
_remove_quoted = re.compile(u'\\\\.').sub
# Matches a single double quote or backslash.
_quotes = re.compile(u'["\\\\]')

def is_missing_quote (addr):
    """Return True iff the quoted string contains an unescaped quote
    or backslash.

    The first and last character of addr (the enclosing quotes) are
    stripped, all backslash-escaped characters are removed, and the
    remainder is searched for a leftover double quote or backslash.
    The whole remainder is searched, not only its first character.

    @param addr: a string that is enclosed in double quotes
    @return: True if an unescaped double quote or backslash is left
    """
    inner = _remove_quoted(u"", addr[1:-1])
    return _quotes.search(inner) is not None
# list of CGI keys to search for email addresses
EMAIL_CGI = ("to", "cc", "bcc")
class MailtoUrl (urlbase.UrlBase):
"""
@ -58,32 +63,25 @@ class MailtoUrl (urlbase.UrlBase):
"""
def build_url (self):
"""
Call super.build_url(), extract list of mail addresses from URL,
"""Call super.build_url(), extract list of mail addresses from URL,
and check their syntax.
"""
super(MailtoUrl, self).build_url()
self.headers = {}
self.addresses = email.Utils.getaddresses([self.cutout_addresses()])
for key in ("to", "cc", "bcc"):
if key in self.headers:
for val in self.headers[key]:
a = urllib.unquote(val)
self.addresses.extend(email.Utils.getaddresses([a]))
# check syntax of emails
for _name, addr in self.addresses:
domain = _split_address(addr)[1]
if not urlutil.is_safe_domain(domain):
raise LinkCheckerError(_("Invalid mail syntax"))
log.debug(LOG_CHECK, "addresses: %s", self.addresses)
self.addresses = set()
self.parse_addresses()
if self.addresses:
for addr in sorted(self.addresses):
self.check_email_syntax(addr)
if not self.valid:
break
else:
self.set_result(_("No mail addresses found in `%(url)s'.") % \
{"url": self.url}, valid=False, overwrite=False)
def cutout_addresses (self):
"""
Parse all mail addresses out of the URL target. Additionally
store headers.
@return: comma separated list of email addresses
@rtype: string
def parse_addresses (self):
"""Parse all mail addresses out of the URL target. Also parses
optional CGI headers like "?to=foo@example.org".
Stores parsed addresses in the self.addresses set.
"""
# cut off leading mailto: and unquote
url = urllib.unquote(self.base_url[7:])
@ -108,17 +106,111 @@ class MailtoUrl (urlbase.UrlBase):
elif mode == 2:
mode = 0
if i < (len(url) - 1):
self.addresses.update(getaddresses(url[:i]))
try:
headers = cgi.parse_qs(url[(i+1):], strict_parsing=True)
for key, val in headers.items():
self.headers.setdefault(key.lower(), []).extend(val)
for key, vals in headers.items():
if key.lower() in EMAIL_CGI:
# Only the first header value is added
self.addresses.update(getaddresses(urllib.unquote(vals[0])))
except ValueError, err:
self.add_warning(_("Error parsing CGI values: %s") % str(err))
addrs = url[:i]
else:
addrs = url
# addrs is comma-separated list of mails now
return addrs
self.addresses.update(getaddresses(url))
log.debug(LOG_CHECK, "addresses: %s", self.addresses)
def check_email_syntax (self, mail):
    """Check email syntax and report the first violation found.

    On the first failed check self.set_result() is called with
    valid=False and overwrite=False, and the method returns. If all
    checks pass, nothing is reported.

    The relevant RFCs:
    - How to check names (memo):
      http://tools.ietf.org/html/rfc3696
    - Email address syntax
      http://tools.ietf.org/html/rfc2822
    - SMTP protocol
      http://tools.ietf.org/html/rfc5321#section-4.1.3
    - IPv6
      http://tools.ietf.org/html/rfc4291#section-2.2
    - Host syntax
      http://tools.ietf.org/html/rfc1123#section-2

    @param mail: a single mail address (without display name)
    @return: None
    """
    def invalid (msg):
        # All syntax errors are reported the same way.
        self.set_result(msg, valid=False, overwrite=False)

    # length checks
    # restrict email length to 256 characters
    # http://www.rfc-editor.org/errata_search.php?eid=1003
    if len(mail) > 256:
        invalid(_("Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars.") % \
            {"addr": mail, "length": len(mail)})
        return
    if "@" not in mail:
        invalid(_("Missing `@' in mail address `%(addr)s'.") % \
            {"addr": mail})
        return
    # note: be sure to use rsplit since "@" can occur in local part
    local, domain = mail.rsplit("@", 1)
    if not local:
        invalid(_("Missing local part of mail address `%(addr)s'.") % \
            {"addr": mail})
        return
    if not domain:
        invalid(_("Missing domain part of mail address `%(addr)s'.") % \
            {"addr": mail})
        return
    if len(local) > 64:
        invalid(_("Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars.") % \
            {"addr": mail, "length": len(local)})
        return
    if len(domain) > 255:
        # report the length of the domain part, not of the local part
        invalid(_("Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars.") % \
            {"addr": mail, "length": len(domain)})
        return

    # local part syntax check
    # Rules taken from http://tools.ietf.org/html/rfc3696#section-3
    if is_quoted(local):
        if is_missing_quote(local):
            invalid(_("Unquoted double quote or backslash in mail address `%(addr)s'.") % \
                {"addr": mail})
            return
    else:
        if local.startswith(u"."):
            invalid(_("Local part of mail address `%(addr)s' may not start with a dot.") % \
                {"addr": mail})
            return
        if local.endswith(u"."):
            invalid(_("Local part of mail address `%(addr)s' may not end with a dot.") % \
                {"addr": mail})
            return
        if u".." in local:
            invalid(_("Local part of mail address `%(addr)s' may not contain two dots.") % \
                {"addr": mail})
            return
        for char in u'@ \\",[]':
            # occurrences escaped with a backslash are allowed, so
            # drop them before looking for the bare character
            if char in local.replace(u"\\%s"%char, u""):
                invalid(_("Local part of mail address `%(addr)s' contains unquoted character `%(char)s.") % \
                    {"addr": mail, "char": char})
                return

    # domain part syntax check
    if is_literal(domain):
        # it's an IP address
        ip = domain[1:-1]
        if ip.startswith(u"IPv6:"):
            ip = ip[5:]
        if not iputil.is_valid_ip(ip):
            invalid(_("Domain part of mail address `%(addr)s' has invalid IP.") % \
                {"addr": mail})
            return
    else:
        # it's a domain name
        if not urlutil.is_safe_domain(domain):
            invalid(_("Invalid domain part of mail address `%(addr)s'.") % \
                {"addr": mail})
            return
        # a trailing dot or an all-numeric last label is rejected
        if domain.endswith(".") or domain.split(".")[-1].isdigit():
            invalid(_("Invalid top level domain part of mail address `%(addr)s'.") % \
                {"addr": mail})
            return
def check_connection (self):
"""
@ -135,23 +227,20 @@ class MailtoUrl (urlbase.UrlBase):
an answer, print the verified address as an info.
If not, print a warning.
"""
if not self.addresses:
self.add_warning(_("No addresses found."),
tag=WARN_MAIL_NO_ADDRESSES)
return
for name, mail in self.addresses:
self.check_smtp_domain(name, mail)
assert self.addresses
for mail in sorted(self.addresses):
self.check_smtp_domain(mail)
if not self.valid:
break
def check_smtp_domain (self, name, mail):
def check_smtp_domain (self, mail):
"""
Check a single mail address.
"""
from ..dns.exception import DNSException
log.debug(LOG_CHECK, "checking mail address %r", mail)
mail = strformat.ascii_safe(mail)
username, domain = _split_address(mail)
username, domain = mail.rsplit('@', 1)
log.debug(LOG_CHECK, "looking up MX mailhost %r", domain)
try:
answers = resolver.query(domain, 'MX')
@ -271,7 +360,7 @@ class MailtoUrl (urlbase.UrlBase):
"""
The cache key is a comma separated list of emails.
"""
emails = u",".join(sorted(addr[1] for addr in self.addresses))
emails = u",".join(sorted(self.addresses))
self.cache_url_key = u"%s:%s" % (self.scheme, emails)
assert isinstance(self.cache_url_key, unicode), self.cache_url_key
# cache_content_key remains None, recursion is not allowed

View file

@ -17,11 +17,13 @@
"""
Test mail checking.
"""
import urllib
from tests import has_network
from nose import SkipTest
from . import LinkCheckTest
class TestMail (LinkCheckTest):
"""
Test mailto: link checking.
@ -37,8 +39,7 @@ class TestMail (LinkCheckTest):
"Killer <calvin@users.sourceforge.net>?subject=bla")
resultlines = [
u"url %s" % url,
u"cache key mailto:calvin@users.sourceforge.net,"
u"calvin@users.sourceforge.net",
u"cache key mailto:calvin@users.sourceforge.net",
u"real url %s" % url,
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
u"valid",
@ -48,8 +49,7 @@ class TestMail (LinkCheckTest):
"bcc=calvin%40users.sourceforge.net")
resultlines = [
u"url %s" % url,
u"cache key mailto:calvin@users.sourceforge.net,"
u"calvin@users.sourceforge.net",
u"cache key mailto:calvin@users.sourceforge.net",
u"real url %s" % url,
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
u"valid",
@ -81,8 +81,8 @@ class TestMail (LinkCheckTest):
u"calvin_CC@users.sourceforge.net,calvin_cc@users.sourceforge.net",
u"real url %s" % url,
u"info Verified address calvin@users.sourceforge.net: 250 <calvin@users.sourceforge.net> is deliverable.",
u"warning Unverified address: 550 <calvin_cc@users.sourceforge.net> Unrouteable address.",
u"warning Unverified address: 550 <calvin_CC@users.sourceforge.net> Unrouteable address.",
u"warning Unverified address: 550 <calvin_cc@users.sourceforge.net> Unrouteable address.",
u"valid",
]
self.direct(url, resultlines)
@ -129,32 +129,66 @@ class TestMail (LinkCheckTest):
u"valid",
]
self.direct(url, resultlines)
url = self.norm(u"mailto:")
def mail_valid (self, addr, **kwargs):
    """Run mail_test() on addr with u"valid" as the expected result.

    Extra keyword arguments are passed through to mail_test().
    """
    expected = u"valid"
    return self.mail_test(addr, expected, **kwargs)
def mail_error (self, addr, **kwargs):
    """Run mail_test() on addr with u"error" as the expected result.

    Extra keyword arguments are passed through to mail_test().
    """
    expected = u"error"
    return self.mail_test(addr, expected, **kwargs)
def mail_test (self, addr, result, cache_key=None, warning=None):
"""Test error mails."""
url = self.norm(addr)
if cache_key is None:
cache_key = url
resultlines = [
u"url %s" % url,
u"cache key %s" % url,
u"cache key %s" % cache_key,
u"real url %s" % url,
u"warning No addresses found.",
u"valid",
]
if warning:
resultlines.append(u"warning %s" % warning)
resultlines.append(result)
self.direct(url, resultlines)
def test_bad_mail (self):
"""
Test some mailto addrs with bad syntax.
"""
def test_error_mail (self):
    """Test that mailto: URLs with bad syntax give an error result."""
    # too long or too short
    bad_length = (
        u"mailto:",
        u"mailto:@",
        u"mailto:@example.org",
        u"mailto:a@",
        u"mailto:%s@%s" % (u"a"*60, u"b"*200),
        u"mailto:%s@example.org" % (u"a"*65),
        u"mailto:a@%s" % (u"a"*256),
        u'mailto:a@%s.com' % (u"a"*64),
    )
    for url in bad_length:
        self.mail_error(url)
    # local part quoted; each entry is (url, expected cache key)
    bad_quoted = (
        (u'mailto:"a""@example.com', u"mailto:a"),
        (u'mailto:""a"@example.com', u"mailto:"),
        (u'mailto:"a\\"@example.com', u'mailto:a"@example.com'),
    )
    for url, key in bad_quoted:
        self.mail_error(url, cache_key=key)
    # local part unquoted
    bad_unquoted = (
        u'mailto:.a@example.com',
        u'mailto:a.@example.com',
        u'mailto:a..b@example.com',
    )
    for url in bad_unquoted:
        self.mail_error(url)
    # domain part
    bad_domain = (
        u'mailto:a@a_b.com',
        u'mailto:a@example.com.',
        u'mailto:a@example.com.111',
        u'mailto:a@example..com',
    )
    for url in bad_domain:
        self.mail_error(url)
    # other
    # ? extension forbidden in <> construct
    bracket_url = u"mailto:Bastian Kleineidam <calvin@users.sourceforge.net?foo=bar>"
    bracket_key = u"mailto:calvin@users.sourceforge.net?foo=bar"
    self.mail_error(bracket_url, cache_key=bracket_key)
def test_valid_mail (self):
"""Test valid mail addresses."""
if not has_network():
raise SkipTest()
# ? extension forbidden in <> construct
url = self.norm(u"mailto:Bastian Kleineidam "\
"<calvin@users.sourceforge.net?foo=bar>")
resultlines = [
u"url %s" % url,
u"cache key None",
u"real url %s" % url,
u"error",
]
self.direct(url, resultlines)
for char in u"!#$&'*+-/=^_`.{|}~":
addr = u'abc%sdef@sourceforge.net' % char
self.mail_valid(u"mailto:%s" % addr,
warning=u"Unverified address: 550 <%s> Unrouteable address." % addr,
cache_key=u"mailto:%s" % addr)
def test_unicode_mail (self):
if not has_network():