2003-07-04 14:24:44 +00:00
# -*- coding: iso-8859-1 -*-
2014-01-08 21:33:04 +00:00
# Copyright (C) 2001-2014 Bastian Kleineidam
2002-11-24 22:13:45 +00:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
2009-07-24 21:58:20 +00:00
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
2005-01-19 15:08:02 +00:00
"""
Find link tags in HTML text.
"""
2002-11-24 22:13:45 +00:00
2004-07-07 18:04:40 +00:00
import re
2008-06-07 13:07:48 +00:00
from . . import strformat , log , LOG_CHECK , url as urlutil
from . import linkname
2004-07-20 14:50:00 +00:00
2005-01-20 12:07:04 +00:00
# Amount of parser data LinkFinder.get_link_name() peeks at when looking
# for the name of an <a href> link.
MAX_NAMELEN = 256
# Amount of parser data TitleFinder peeks at when looking for the page
# title.
MAX_TITLELEN = 256
# Module-level shortcut for strformat.unquote.
unquote = strformat.unquote
2002-11-24 22:13:45 +00:00
2011-12-30 07:55:38 +00:00
# HTML4/5 link tags
# ripped mainly from HTML::Tagset.pm with HTML5 added
2002-11-24 22:13:45 +00:00
LinkTags = {
2004-11-03 21:29:25 +00:00
' a ' : [ u ' href ' ] ,
' applet ' : [ u ' archive ' , u ' src ' ] ,
' area ' : [ u ' href ' ] ,
2011-12-30 07:55:38 +00:00
' audio ' : [ u ' src ' ] , # HTML5
2004-11-03 21:29:25 +00:00
' bgsound ' : [ u ' src ' ] ,
' blockquote ' : [ u ' cite ' ] ,
' body ' : [ u ' background ' ] ,
2011-12-30 07:55:38 +00:00
' button ' : [ u ' formaction ' ] , # HTML5
2004-11-03 21:29:25 +00:00
' del ' : [ u ' cite ' ] ,
' embed ' : [ u ' pluginspage ' , u ' src ' ] ,
' form ' : [ u ' action ' ] ,
' frame ' : [ u ' src ' , u ' longdesc ' ] ,
' head ' : [ u ' profile ' ] ,
2011-12-30 07:55:38 +00:00
' html ' : [ u ' manifest ' ] , # HTML5
2004-11-03 21:29:25 +00:00
' iframe ' : [ u ' src ' , u ' longdesc ' ] ,
' ilayer ' : [ u ' background ' ] ,
' img ' : [ u ' src ' , u ' lowsrc ' , u ' longdesc ' , u ' usemap ' ] ,
2011-12-30 07:55:38 +00:00
' input ' : [ u ' src ' , u ' usemap ' , u ' formaction ' ] ,
2004-11-03 21:29:25 +00:00
' ins ' : [ u ' cite ' ] ,
' isindex ' : [ u ' action ' ] ,
' layer ' : [ u ' background ' , u ' src ' ] ,
' link ' : [ u ' href ' ] ,
2005-06-28 13:36:39 +00:00
' meta ' : [ u ' content ' , u ' href ' ] ,
2012-06-23 12:28:32 +00:00
' object ' : [ u ' classid ' , u ' data ' , u ' archive ' , u ' usemap ' , u ' codebase ' ] ,
2004-11-03 21:29:25 +00:00
' q ' : [ u ' cite ' ] ,
2008-01-11 16:45:30 +00:00
' script ' : [ u ' src ' ] ,
2011-12-30 07:55:38 +00:00
' source ' : [ u ' src ' ] , # HTML5
2004-11-03 21:29:25 +00:00
' table ' : [ u ' background ' ] ,
' td ' : [ u ' background ' ] ,
' th ' : [ u ' background ' ] ,
' tr ' : [ u ' background ' ] ,
2011-12-30 07:55:38 +00:00
' track ' : [ u ' src ' ] , # HTML5
' video ' : [ u ' src ' ] , # HTML5
2004-11-03 21:29:25 +00:00
' xmp ' : [ u ' href ' ] ,
None : [ u ' style ' ] ,
2002-11-24 22:13:45 +00:00
}
2012-08-22 20:43:14 +00:00
# HTML anchor tags
# Same layout as LinkTags: tag name -> attribute names, with the None key
# listing attributes that apply to every tag.
AnchorTags = {
    'a': [u'name'],
    None: [u'id'],
}
# WML tags
# Same layout as LinkTags: tag name -> attribute names searched for URLs.
WmlTags = {
    'a': [u'href'],
    'go': [u'href'],
    'img': [u'src'],
}
2002-11-24 22:13:45 +00:00
# matcher for <meta http-equiv=refresh> tags
2004-11-03 21:29:25 +00:00
refresh_re = re . compile ( ur " (?i)^ \ d+; \ s*url=(?P<url>.+)$ " )
2005-03-08 20:49:00 +00:00
_quoted_pat = ur " ( ' [^ ' ]+ ' | \" [^ \" ]+ \" |[^ \ ) \ s]+) "
css_url_re = re . compile ( ur " url \ ( \ s*(?P<url> %s ) \ s* \ ) " % _quoted_pat )
2008-05-09 06:16:03 +00:00
swf_url_re = re . compile ( " (?i) %s " % urlutil . safe_url_pattern )
2007-11-14 18:46:14 +00:00
# Matches one C/CSS-style /* ... */ comment: non-greedy, and DOTALL so a
# comment may span several lines. Spelled as an escaped u"" literal (same
# value as the raw ur"" form) so the literal is valid on Python 2 and 3.
c_comment_re = re.compile(u"/\\*.*?\\*/", re.DOTALL)


def strip_c_comments(text):
    """Remove C/CSS-style comments from text and return the result.

    Note that this function also deliberately removes comments inside of
    strings; no attempt is made to parse string literals.
    """
    return c_comment_re.sub('', text)
2002-11-24 22:13:45 +00:00
2010-03-08 08:04:33 +00:00
try:
    # Python 2: keep the historical base class so that existing
    # ``except StandardError`` handlers still catch StopParse.
    _StopParseBase = StandardError
except NameError:
    # StandardError was removed in Python 3; Exception is the closest
    # remaining base class.
    _StopParseBase = Exception


class StopParse(_StopParseBase):
    """Raised when parsing should stop."""
class TitleFinder(object):
    """Find title tags in HTML text.

    The result is stored in the ``title`` attribute, which stays None
    when no <title> tag was seen.
    NOTE(review): start_element() reads ``self.parser``, which this class
    never sets; presumably the HTML parser attaches itself to its handler
    (as the comment in TagFinder suggests) -- confirm.
    """

    def __init__(self):
        """Initialize title."""
        super(TitleFinder, self).__init__()
        log.debug(LOG_CHECK, "HTML title parser")
        self.title = None

    def start_element(self, tag, attrs):
        """Search for <title> tag.

        Raises StopParse as soon as a <title> or a <body> tag is seen,
        since nothing after that point is inspected by this handler.
        """
        if tag == 'title':
            # Look at no more than MAX_TITLELEN of the data following the
            # tag; undecodable bytes are dropped.
            data = self.parser.peek(MAX_TITLELEN)
            data = data.decode(self.parser.encoding, "ignore")
            self.title = linkname.title_name(data)
            raise StopParse("found <title> tag")
        elif tag == 'body':
            raise StopParse("found <body> tag")
2010-03-08 08:04:33 +00:00
2004-01-28 22:49:20 +00:00
class TagFinder(object):
    """Common base for handlers that react to HTML start elements.

    Instances are meant to be plugged into an HtmlParser as its handler.
    """

    def __init__(self):
        """Set up the handler with no parser attached yet."""
        super(TagFinder, self).__init__()
        # The parser object is filled in once this instance is used as a
        # handler object.
        self.parser = None

    def start_element(self, tag, attrs):
        """Hook for start elements; a no-op here, override in subclasses."""

    def start_end_element(self, tag, attrs):
        """Handle a combined start/end element (eg. <br/>) like a plain
        start element; the end part is ignored."""
        self.start_element(tag, attrs)
2004-01-07 20:50:07 +00:00
2004-01-28 22:49:20 +00:00
class MetaRobotsFinder(TagFinder):
    """Class for finding robots.txt meta values in HTML.

    After parsing, ``follow`` is False when a robots meta tag contained
    "nofollow" and ``index`` is False when it contained "noindex"; both
    default to True.
    """

    def __init__(self):
        """Initialize follow and index flags."""
        super(MetaRobotsFinder, self).__init__()
        log.debug(LOG_CHECK, "meta robots finder")
        self.follow = self.index = True

    def start_element(self, tag, attrs):
        """Search for meta robots.txt "nofollow" and "noindex" flags.

        Raises StopParse once the robots meta tag or the <body> tag has
        been seen.
        """
        if tag == 'meta' and self._is_robots_meta(attrs):
            content = attrs.get_true('content', u'').lower()
            # Directives are comma separated and usually written with a
            # space after the comma ("noindex, nofollow"), so each item
            # must be stripped before it is compared.
            directives = [item.strip() for item in content.split(u',')]
            self.follow = u'nofollow' not in directives
            self.index = u'noindex' not in directives
            raise StopParse("found <meta name=robots> tag")
        elif tag == 'body':
            raise StopParse("found <body> tag")

    @staticmethod
    def _is_robots_meta(attrs):
        """Return True if the meta tag attributes name the robots tag.

        The comparison ignores case and surrounding whitespace, so that
        <META NAME="ROBOTS" ...> is recognized as well.
        """
        return attrs.get_true('name', u'').strip().lower() == u'robots'
2004-01-07 20:50:07 +00:00
2005-06-28 13:36:39 +00:00
def is_meta_url(attr, attrs):
    """Check if the meta attributes contain a URL.

    attr is the name of the attribute being looked at and attrs the
    attribute mapping of the tag (it must provide get_true()).
    Returns True for a "content" attribute when http-equiv is "refresh"
    or scheme is "dcterms.uri", and for a "href" attribute when rel is
    "icon" or "shortcut icon"; all comparisons ignore case. Returns False
    otherwise.
    """
    if attr == "content":
        equiv = attrs.get_true('http-equiv', u'').lower()
        scheme = attrs.get_true('scheme', u'').lower()
        return equiv == u'refresh' or scheme == u'dcterms.uri'
    if attr == "href":
        rel = attrs.get_true('rel', u'').lower()
        return rel in (u'shortcut icon', u'icon')
    return False
2013-12-10 22:42:43 +00:00
def is_form_get(attr, attrs):
    """Check if this is a GET form action URL.

    Returns True only when attr is "action" and the form's method
    attribute (compared case-insensitively) is anything but "post"; a
    missing method therefore counts as GET.
    """
    if attr != "action":
        return False
    method = attrs.get_true('method', u'').lower()
    return method != 'post'
2004-01-28 22:49:20 +00:00
class LinkFinder(TagFinder):
    """Find HTML links, and apply them to the callback function with the
    format (url, lineno, column, name, codebase)."""

    def __init__(self, callback, tags=None):
        """Store the callback and the tag table to search.

        callback is called once per found URL; tags maps tag names to
        lists of attribute names and defaults to LinkTags.
        """
        super(LinkFinder, self).__init__()
        self.callback = callback
        if tags is None:
            self.tags = LinkTags
        else:
            self.tags = tags
        # href of the first <base> tag seen, used as fallback codebase
        self.base_ref = u''
        log.debug(LOG_CHECK, "link finder")

    def start_element(self, tag, attrs):
        """Search the tag's attributes for links and report every found
        URL through parse_tag()."""
        log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
        log.debug(LOG_CHECK, "line %d col %d old line %d old col %d",
                  self.parser.lineno(), self.parser.column(),
                  self.parser.last_lineno(), self.parser.last_column())
        if tag == "base" and not self.base_ref:
            self.base_ref = unquote(attrs.get_true("href", u''))
        # Collect the attributes to inspect in a fresh set. The lists in
        # self.tags are shared (by default the module-level LinkTags) and
        # must not be extended in place, or the universal attributes
        # would be appended to the table again on every element.
        tagattrs = set(self.tags.get(tag, ()))
        # add universal tag attributes using tagname None
        tagattrs.update(self.tags.get(None, ()))
        # parse URLs in tag (possibly multiple URLs in CSS styles)
        for attr in tagattrs:
            if attr not in attrs:
                continue
            if tag == "meta" and not is_meta_url(attr, attrs):
                continue
            if tag == "form" and not is_form_get(attr, attrs):
                continue
            # name of this link
            name = self.get_link_name(tag, attrs, attr)
            # possible codebase
            base = u''
            if tag == 'applet':
                base = unquote(attrs.get_true('codebase', u''))
            if not base:
                base = self.base_ref
            # note: value can be None
            value = unquote(attrs.get(attr))
            if (value is not None and tag == 'link'
                    and attrs.get('rel') == 'dns-prefetch'):
                # Turn the prefetch hint into a dns: URL: drop everything
                # up to the first colon and any trailing slashes. A
                # valueless attribute (None) is passed on unchanged.
                if ':' in value:
                    value = value.split(':', 1)[1]
                value = 'dns:' + value.rstrip('/')
            # parse tag for URLs
            self.parse_tag(tag, attr, value, name, base)
        log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)

    def get_link_name(self, tag, attrs, attr):
        """Parse attrs for link name. Return name of link.

        For <a href> the name comes from the data following the tag, with
        the title attribute as fallback; for <img> from alt, then title.
        All other tags get an empty name.
        """
        if tag == 'a' and attr == 'href':
            # Look for name only up to MAX_NAMELEN characters
            data = self.parser.peek(MAX_NAMELEN)
            data = data.decode(self.parser.encoding, "ignore")
            name = linkname.href_name(data)
            if not name:
                name = unquote(attrs.get_true('title', u''))
        elif tag == 'img':
            name = unquote(attrs.get_true('alt', u''))
            if not name:
                name = unquote(attrs.get_true('title', u''))
        else:
            name = u""
        return name

    def parse_tag(self, tag, attr, url, name, base):
        """Extract the URL(s) from one attribute value and pass each of
        them to the callback.

        url is the attribute value and may be None. Meta refresh values,
        CSS style attributes and comma separated archive lists can yield
        zero or several URLs; every other value is reported as is.
        """
        assert isinstance(tag, unicode), repr(tag)
        assert isinstance(attr, unicode), repr(attr)
        assert isinstance(name, unicode), repr(name)
        assert isinstance(base, unicode), repr(base)
        assert isinstance(url, unicode) or url is None, repr(url)
        urls = []
        # look for meta refresh
        if tag == u'meta' and url:
            mo = refresh_re.match(url)
            if mo:
                urls.append(mo.group("url"))
            elif attr != 'content':
                urls.append(url)
        elif attr == u'style' and url:
            for mo in css_url_re.finditer(url):
                u = mo.group("url")
                urls.append(unquote(u, matching=True))
        elif attr == u'archive' and url:
            # comma separated list of archive URLs
            urls.extend(url.split(u','))
        else:
            # Also reached for an empty or None archive value, which is
            # reported unchanged like any other valueless attribute.
            urls.append(url)
        if not urls:
            # no url found
            return
        for u in urls:
            assert isinstance(u, unicode) or u is None, repr(u)
            log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r",
                      tag, attr, u, name, base)
            self.callback(u, self.parser.last_lineno(),
                          self.parser.last_column(), name, base)