Improved duplicate URL check.

This commit is contained in:
Bastian Kleineidam 2012-10-10 21:04:48 +02:00
parent b758fc6f52
commit c4e15c7b88
2 changed files with 18 additions and 2 deletions

View file

@ -601,7 +601,13 @@ def is_duplicate_content_url(url1, url2):
if url1 == url2:
return True
if url2 in url1:
return shorten_duplicate_content_url(url1) == url2
url1 = shorten_duplicate_content_url(url1)
if not url2.endswith('/') and url1.endswith('/'):
url2 += '/'
return url1 == url2
if url1 in url2:
return shorten_duplicate_content_url(url2) == url1
url2 = shorten_duplicate_content_url(url2)
if not url1.endswith('/') and url2.endswith('/'):
url1 += '/'
return url1 == url2
return False

View file

@ -554,3 +554,13 @@ class TestUrl (unittest.TestCase):
# Smoke test: calls linkcheck.url.get_content() on a live URL and makes no
# assertion on the result, so it only fails if the call itself raises.
# NOTE(review): presumably @need_network skips this test when no network is
# available -- confirm against the decorator's definition.
@need_network
def test_get_content (self):
linkcheck.url.get_content('http://www.debian.org/')
def test_duplicate_urls(self):
    """Check that is_duplicate_content_url() reports duplicates.

    Covers identical URLs, URLs differing only by a trailing slash, and
    URLs differing only by an index.html / index.htm suffix, each in
    both argument orders.
    """
    check = linkcheck.url.is_duplicate_content_url
    # (first, second) argument pairs that must all be seen as duplicates.
    url_pairs = (
        ("http://example.org", "http://example.org"),
        ("http://example.org/", "http://example.org"),
        ("http://example.org", "http://example.org/"),
        ("http://example.org/index.html", "http://example.org"),
        ("http://example.org", "http://example.org/index.html"),
        ("http://example.org/index.htm", "http://example.org"),
        ("http://example.org", "http://example.org/index.htm"),
    )
    for first, second in url_pairs:
        self.assertTrue(check(first, second))