linkchecker/linkcheck/StringUtil.py
2002-11-24 19:53:37 +00:00

172 lines
4.1 KiB
Python

"""various string utils"""
# Copyright (C) 2000,2001 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import re, sys, htmlentitydefs
# Matches a single markup tag: "<", the shortest possible run of characters,
# then ">".  DOTALL lets "." match newlines, so a tag may span several lines.
markup_re = re.compile("<.*?>", re.DOTALL)
entities = htmlentitydefs.entitydefs.items()
# (text, entity) pairs, applied in list order when escaping.
# Order matters: the ampersand pair has to run first.  If it ran after
# another pair, the "&" of an entity that pair just wrote (e.g. "&quot;")
# would be escaped a second time and come out as "&amp;quot;".
HtmlTable = [(char, "&" + name + ";") for name, char in entities
             if name != "amp"]
HtmlTable.sort()
HtmlTable.insert(0, ("&", "&amp;"))
# (entity, text) pairs, applied in list order when unescaping.
# Order matters: "&amp;" has to run last.  If it ran earlier, the "&" it
# produces could complete another entity ("&amp;lt;" -> "&lt;" -> "<") and
# the text would be unescaped twice.
UnHtmlTable = [("&" + name + ";", char) for name, char in entities
               if name != "amp"]
UnHtmlTable.sort()
UnHtmlTable.reverse()
UnHtmlTable.append(("&amp;", "&"))
# standard xml entities
entities = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': "'",
}
# (text, entity) pairs, applied in list order when escaping.
# Order matters: "&" has to be escaped before anything else, otherwise the
# "&" of an entity written by an earlier pair would be escaped again
# ('"' -> "&quot;" -> "&amp;quot;").
XmlTable = [(char, "&" + name + ";") for name, char in entities.items()
            if name != "amp"]
XmlTable.sort()
XmlTable.insert(0, ("&", "&amp;"))
# (entity, text) pairs, applied in list order when unescaping.
# Order matters: "&amp;" has to be handled last, otherwise the "&" it
# produces could complete another entity and be unescaped a second time.
UnXmlTable = [("&" + name + ";", char) for name, char in entities.items()
              if name != "amp"]
UnXmlTable.sort()
UnXmlTable.reverse()
UnXmlTable.append(("&amp;", "&"))
# Replacement pairs for SQL string literals: a single quote is doubled.
SQLTable = [("'", "''")]
def stripQuotes(s):
    """Remove one leading and one trailing quote character from s.

    Both the single and the double quote are recognised.  The two ends are
    examined independently of each other, so the quotes neither have to
    match nor do both have to be present.  A string shorter than two
    characters is returned unchanged.
    """
    if len(s) < 2:
        return s
    quote_chars = ('"', "'")
    begin = 0
    end = len(s)
    if s[0] in quote_chars:
        begin = 1
    if s[-1] in quote_chars:
        end = end - 1
    return s[begin:end]
def indent(s, level):
    """Run indentWith on s with a string of <level> blanks as the indent."""
    padding = " " * level
    return indentWith(s, padding)
def indentWith(s, indent):
    """Return s with indent inserted after every newline inside s.

    The first line of s is left as it is; only the lines following a
    newline receive the indent.  A newline that is the very last character
    of s gets nothing appended after it.
    """
    newline_and_indent = "\n" + indent
    # str.replace works in one pass and never rescans the text it inserted,
    # so an indent that itself contains a newline is handled as well.
    if s.endswith("\n"):
        # keep the terminating newline out of the replacement
        return s[:-1].replace("\n", newline_and_indent) + "\n"
    return s.replace("\n", newline_and_indent)
def blocktext(s, width):
    """Re-wrap s so that no emitted chunk is longer than width.

    The lines of s are taken one after another and appended, separated by
    a newline, to the text that is still pending.  Whenever the pending
    text is longer than width it is cut at the position reported by
    getLastWordBoundary: the stripped front part is written to the result
    followed by a newline, and the stripped remainder stays pending.
    An empty pending text is simply replaced by the next line.
    """
    wrapped = ""
    pending = None
    for nextline in s.split("\n"):
        if pending:
            pending = pending + "\n" + nextline
        else:
            pending = nextline
        while len(pending) > width:
            cut = getLastWordBoundary(pending, width)
            head = pending[:cut].strip()
            pending = pending[cut:].strip()
            wrapped = wrapped + head + "\n"
    return wrapped + pending
def getLastWordBoundary(s, width):
    """Return the index at which s should be cut to fit into width.

    Only s[0:width] is examined.  The result is the index just behind the
    last whitespace character that can be reached from the start of s
    without crossing a newline (a newline itself counts as whitespace), so
    the value may be as large as width.
    If no such whitespace exists, width-1 is returned, but never less than
    1: a cut position of 0 would not shorten the text and a caller that
    loops until the text fits would never finish.
    """
    # "." does not match a newline, so the greedy ".*" stops at the first
    # line break inside the window.
    match = re.match(r".*\s", s[0:width])
    if match:
        return match.end()
    return max(width - 1, 1)
def applyTable(table, s):
    """Run every replacement pair of table over s and return the result.

    Each entry of table is indexed as (search, replacement).  The pairs are
    applied one after another in table order, each one working on the
    output of the previous one.
    """
    result = s
    for pair in table:
        search = pair[0]
        replacement = pair[1]
        result = result.replace(search, replacement)
    return result
def sqlify(s):
    """Turn s into an SQL string literal.

    A false value (None, empty string) gives the bare word NULL.  Anything
    else is passed through the SQLTable replacements and wrapped in single
    quotes.
    """
    if s:
        escaped = applyTable(SQLTable, s)
        return "'%s'" % escaped
    return "NULL"
def htmlify(s):
    """Return s after applying the HtmlTable replacement pairs to it."""
    escaped = applyTable(HtmlTable, s)
    return escaped
def unhtmlify(s):
    """Return s after applying the UnHtmlTable replacement pairs to it."""
    unescaped = applyTable(UnHtmlTable, s)
    return unescaped
def xmlify(s):
    """Return s after applying the XmlTable replacement pairs to it."""
    quoted = applyTable(XmlTable, s)
    return quoted
def unxmlify(s):
    """Return s after applying the UnXmlTable replacement pairs to it."""
    unquoted = applyTable(UnXmlTable, s)
    return unquoted
def getLineNumber(s, index):
    """Return the 1-based number of the line that s[index] is on.

    The result is one more than the number of newlines in s[0:index].
    A negative index is treated as 0.  An index beyond the end of s gives
    the number of the last line.
    """
    if index < 0:
        # a negative end would be counted from the end of s by str.count
        index = 0
    return s.count("\n", 0, index) + 1
def paginate(text, lines=22):
    """Print text line by line, pausing after each page on a terminal.

    text is split at newlines and every piece is printed.  The line counter
    starts at 1, so a pause happens after lines-1 lines of text; the prompt
    itself is printed as one more line.  The pause only happens when
    standard input is a terminal; otherwise the text is printed without
    interruption.
    """
    curline = 1
    for line in text.split("\n"):
        # A parenthesised single argument prints the same with the print
        # statement and with the print function.
        print(line)
        curline += 1
        if curline >= lines and sys.stdin.isatty():
            curline = 1
            print("press return to continue...")
            # Consume the whole input line.  Reading a single character
            # would leave anything else the user typed (including the
            # newline) in the buffer and silently skip the next pauses.
            sys.stdin.readline()
def remove_markup(s):
    """Return s with every match of markup_re removed.

    Matches are found from left to right and do not overlap.  Text that
    only looks like a tag after the removal of an earlier match cannot
    occur: each match runs from the first remaining "<" to the first ">"
    after it, which is exactly what a single left-to-right pass finds.
    """
    # One substitution pass instead of searching again from the start of
    # the string after every removed tag.
    return markup_re.sub("", s)
def unquote(s):
    """Return s with stripQuotes and then unhtmlify applied.

    A false value (None, empty string) gives the empty string.
    """
    if s:
        bare = stripQuotes(s)
        return unhtmlify(bare)
    return ''