mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-20 07:50:24 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3898 e7d03fd6-7b0d-0410-9947-9c21f3af8025
427 lines
15 KiB
Python
427 lines
15 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
# Copyright (C) 2003, 2004 Nominum, Inc.
|
|
#
|
|
# Permission to use, copy, modify, and distribute this software and its
|
|
# documentation for any purpose with or without fee is hereby granted,
|
|
# provided that the above copyright notice and this permission notice
|
|
# appear in all copies.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
|
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
|
|
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
|
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
"""Tokenize DNS master file format"""
|
|
|
|
from cStringIO import StringIO
|
|
import sys
|
|
|
|
import linkcheck.dns.exception
|
|
import linkcheck.dns.name
|
|
|
|
# Characters that terminate an unquoted token.  Only membership is ever
# tested, so every value is simply True.
_DELIMITERS = dict.fromkeys((' ', '\t', '\n', ';', '(', ')', '"'), True)

# While reading a quoted string only the double quote ends the token.
_QUOTING_DELIMITERS = dict.fromkeys(('"',), True)

# Token type codes; these are the first item of every token tuple.
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)
|
|
|
|
class UngetBufferFull(linkcheck.dns.exception.DNSException):
    """Signals that a character or token was pushed back while the
    corresponding one-slot unget buffer was still occupied."""
|
|
|
|
class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token is a (type, value) tuple, where I{type} is an int, and
    I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
    IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: (int, string) token tuple
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  When omitted it defaults to '<string>', '<stdin>' or
        '<file>' depending on what I{f} is.
        @type filename: string
        """
        if isinstance(f, str):
            # Wrap literal text so it can be read like a file.
            f = StringIO(f)
            if filename is None:
                filename = '<string>'
        elif filename is None:
            if f is sys.stdin:
                filename = '<stdin>'
            else:
                filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  Once the
        underlying file has returned '' the tokenizer is marked as being at
        EOF and the file is not read again.  The line counter is advanced
        only when a newline is read from the file, so re-reading an
        ungotten newline does not count it twice.

        @rtype: string ('' at end of input)
        """
        if self.ungotten_char is not None:
            c = self.ungotten_char
            self.ungotten_char = None
            return c
        if self.eof:
            return ''
        c = self.file.read(1)
        if c == '':
            self.eof = True
        elif c == '\n':
            self.line_number += 1
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """
        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """
        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """
        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                # A newline only counts as whitespace inside parentheses.
                if c != '\n' or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def _get_escaped_char(self):
        """Read the character following a backslash inside a quoted string.

        A digit starts a three-digit decimal escape (\\DDD) which is
        converted to the character with that code; any other character is
        returned unchanged.

        @rtype: string
        @raises linkcheck.dns.exception.UnexpectedEnd: input ended in the
        middle of the escape
        @raises linkcheck.dns.exception.DNSSyntaxError: the decimal escape
        is malformed or greater than 255
        """
        c = self._get_char()
        if c == '':
            raise linkcheck.dns.exception.UnexpectedEnd
        if not c.isdigit():
            return c
        c2 = self._get_char()
        if c2 == '':
            raise linkcheck.dns.exception.UnexpectedEnd
        c3 = self._get_char()
        if c3 == '':
            raise linkcheck.dns.exception.UnexpectedEnd
        if not (c2.isdigit() and c3.isdigit()):
            raise linkcheck.dns.exception.DNSSyntaxError
        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
        if codepoint > 255:
            # A \DDD escape denotes a single octet.
            raise linkcheck.dns.exception.DNSSyntaxError(
                'escaped decimal value %d is too large' % codepoint)
        return chr(codepoint)

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        Parentheses are never returned as tokens; they only raise or lower
        the multiline level.  Comments are dropped unless I{want_comment}
        is true.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: (int, string) tuple
        @raises linkcheck.dns.exception.UnexpectedEnd: input ended prematurely
        @raises linkcheck.dns.exception.DNSSyntaxError: input was badly formed
        """
        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten WHITESPACE or COMMENT token is silently discarded
            # when the caller did not ask for that kind of token.
            if token[0] == WHITESPACE:
                if want_leading:
                    return token
            elif token[0] == COMMENT:
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return (WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise linkcheck.dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise linkcheck.dns.exception.DNSSyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: switch to the quoting
                            # delimiter set and start a quoted string.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote left over from the previous
                            # call (it was ungotten when the string ended).
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return (EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but excluding,
                        # the end of the line or the end of the input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            # Leave the terminator for the next call.
                            self._unget_char(c)
                            return (COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise linkcheck.dns.exception.DNSSyntaxError(
                                    'unbalanced parentheses')
                            return (EOF, '')
                        elif self.multiline:
                            # Inside parentheses the line end is not
                            # significant; drop the comment and go on.
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return (EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the current token; keep it for
                    # the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_escaped_char()
                elif c == '\n':
                    raise linkcheck.dns.exception.DNSSyntaxError(
                        'newline in quoted string')
            elif c == '\\':
                # Treat \ followed by a delimiter as the
                # delimiter, otherwise leave it alone.
                c = self._get_char()
                if c == '' or c not in self.delimiters:
                    self._unget_char(c)
                    c = '\\'
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise linkcheck.dns.exception.DNSSyntaxError(
                    'unbalanced parentheses')
            ttype = EOF
        return (ttype, token)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: (int, string) token tuple
        @raises UngetBufferFull: there is already an ungotten token
        """
        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: (int, string)
        """
        token = self.get()
        if token[0] == EOF:
            raise StopIteration
        return token

    # Same method under the name the newer iterator protocol looks up.
    __next__ = next

    def __iter__(self):
        """Return the tokenizer itself; it is its own iterator."""
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        identifier made up of digits only
        @rtype: int
        """
        (ttype, value) = self.get()
        if ttype != IDENTIFIER:
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expecting an identifier')
        if not value.isdigit():
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expecting an integer')
        return int(value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        integer in the range 0..255
        @rtype: int
        """
        value = self.get_int()
        if value < 0 or value > 255:
            raise linkcheck.dns.exception.DNSSyntaxError(
                '%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        integer in the range 0..65535
        @rtype: int
        """
        value = self.get_int()
        if value < 0 or value > 65535:
            raise linkcheck.dns.exception.DNSSyntaxError(
                '%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        integer in the range 0..4294967295
        @rtype: int
        """
        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise linkcheck.dns.exception.DNSSyntaxError(
                '%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: unused by this method
        @raises linkcheck.dns.exception.DNSSyntaxError: the token is neither
        an identifier nor a quoted string
        @rtype: string
        """
        (ttype, t) = self.get()
        if ttype != IDENTIFIER and ttype != QUOTED_STRING:
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expecting a string')
        return t

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        @param origin: passed through to linkcheck.dns.name.from_text
        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        identifier
        @rtype: linkcheck.dns.name.Name object"""
        (ttype, t) = self.get()
        if ttype != IDENTIFIER:
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expecting an identifier')
        return linkcheck.dns.name.from_text(t, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is
        neither EOL nor EOF
        @rtype: string
        """
        (ttype, t) = self.get()
        if ttype != EOL and ttype != EOF:
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expected EOL or EOF, got %d "%s"' % (ttype, t))
        return t

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        The token text is handed to linkcheck.dns.ttl.from_text and its
        result is returned unchanged.

        @raises linkcheck.dns.exception.DNSSyntaxError: the token is not an
        identifier
        """
        # The ttl submodule is not imported at the top of this file, so
        # load it here before it is used.
        import linkcheck.dns.ttl
        (ttype, t) = self.get()
        if ttype != IDENTIFIER:
            raise linkcheck.dns.exception.DNSSyntaxError(
                'expecting an identifier')
        return linkcheck.dns.ttl.from_text(t)
|