linkchecker/linkcheck/checker/internpaturl.py
calvin df857aab8d Intern patterns now accept URLs with and without "www." prefixes
by default. This allows checking sites that use both variants.


git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3714 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2008-04-21 09:18:58 +00:00

56 lines
1.9 KiB
Python

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Intern URL pattern support.
"""
import re
import urlbase
import linkcheck.checker
class InternPatternUrl(urlbase.UrlBase):
    """
    URL class that can derive an intern URL pattern from its own URL.
    """

    def get_intern_pattern(self):
        """
        Build a regular expression for URLs considered intern relative
        to this URL. The pattern consists of the scheme, the domain and
        the path with its last segment removed. If the scheme is http
        or https, both are accepted, and a leading "www." on the domain
        is made optional.

        @return non-empty regex pattern or None
        @rtype String or None
        """
        url = linkcheck.checker.absolute_url(
            self.base_url, self.base_ref, self.parent_url)
        if not url:
            return None
        split_url = linkcheck.strformat.url_unicode_split(url)
        scheme = split_url[0]
        # Only the encoded domain is used; the second result is ignored.
        host, _is_idn = linkcheck.url.idna_encode(split_url[1])
        if not host or not scheme:
            return None
        # Only the path is used; the split-off parameters are ignored.
        path, _params = linkcheck.url.splitparams(split_url[2])
        # Drop the last path segment to keep just the directory part.
        directory = "/".join(path.split('/')[:-1])
        if url.endswith('/'):
            directory += '/'
        # Escape every component so it is matched literally in the regex.
        scheme_rx = re.escape(scheme)
        host_rx = re.escape(host)
        directory_rx = re.escape(directory)
        if scheme_rx in ('http', 'https'):
            # Treat http and https as interchangeable.
            scheme_rx = 'https?'
        escaped_www = 'www\\.'
        if host_rx.startswith(escaped_www):
            # Accept the domain both with and without the "www." prefix.
            host_rx = r"(www\.|)%s" % host_rx[len(escaped_www):]
        return "%s://%s%s" % (scheme_rx, host_rx, directory_rx)