25 November 2004 9 comments Python
Here's a little program I wrote recently to convert characters that were typed literally into their proper HTML entities. For example, this is incorrect:
<b>Bärs & Öl</b>
But this is correct:
<b>B&auml;rs &amp; &Ouml;l</b>
To demonstrate, I have set up a little test page here so that you can try converting your own impure HTML content.
Run test program
Here's the source code for the program:
import re
from htmlentitydefs import entitydefs
# Reverse lookup table: entity value (the character) -> entity name,
# e.g. '&' -> 'amp'.  Items are visited in sorted order so the result is
# deterministic should two names ever share the same value.
entitydefs_inverted = {}
for name, value in sorted(entitydefs.items()):
    entitydefs_inverted[value] = name

# Matches every value that has a named entity.  Each alternative is escaped
# so a value containing a regex metacharacter can never corrupt the pattern,
# and longer values are tried first so that a short value cannot shadow a
# longer one that begins with it (should any value be longer than one
# character).
_badchars_regex = re.compile(
    '|'.join(
        re.escape(value)
        for value in sorted(entitydefs_inverted, key=lambda v: (-len(v), v))
    )
)

# Matches text that already contains an encoded entity, either named
# ('&auml;') or decimal numeric ('&#228;').
_been_fixed_regex = re.compile(r'&\w+;|&#[0-9]+;')
def html_entity_fixer(text, skipchars=(), extra_careful=1):
    """Replace characters that have a named HTML entity with that entity.

    text          -- the string to convert.
    skipchars     -- a single string or a sequence of strings that must be
                     left untouched (for example '<' and '>' when the text
                     contains real markup).
    extra_careful -- when true, return ``text`` unchanged if it already
                     contains something that looks like an entity, so that
                     already-converted text is not converted twice.

    Returns the converted string.
    """
    # If extra_careful we don't attempt to do anything to the string if it
    # might have been converted already.  search() stops at the first hit
    # instead of collecting every match.
    if extra_careful and _been_fixed_regex.search(text):
        return text

    # Accept a bare string as shorthand for a one-element sequence.
    if isinstance(skipchars, str):
        skipchars = [skipchars]

    # Distinct convertible values that occur in the text and are not skipped.
    found = set(
        match for match in _badchars_regex.findall(text)
        if match not in skipchars
    )

    # Ampersands must be escaped first, before any '&name;' is inserted,
    # otherwise the entities written below would themselves be escaped.
    # NOTE(review): this escaping is unconditional, so '&' is converted even
    # when it is listed in skipchars -- kept as in the original.
    text = text.replace('&', '&amp;')
    # '\x80' is rewritten as the numeric entity 8364.
    text = text.replace('\x80', '&#8364;')

    for each in found:
        if each == '&':
            # Already handled above.
            continue
        # entitydefs_inverted holds bare entity names, so every replacement
        # is wrapped as '&name;'.
        text = text.replace(each, '&%s;' % entitydefs_inverted[each])
    return text
I learned that using umlauts directly can be perfectly correct as long as you declare the encoding as Latin-1 (aka ISO-8859-1) or UTF-8 ... especially with XHTML.
So the only BAD characters will be <, > and & ...
Harald
http://www.xml.com/pub/a/2004/07/21/dive.html explains it much better than I can. :-)
Because I'm such a forward thinker with a knack for coming up with unique names... I think...
from htmlescape import *
:D
#!/usr/bin/python
import sys
import os
#http://www.asciitable.com/
#http://www.w3schools.com/tags/ref_entities.asp
# DICT { char : HTML entity }
# Each key is one ISO 8859-1 character and each value is the named HTML
# entity that replaces it.  Keys are written as hex escapes so the table does
# not depend on the encoding this source file happens to be saved in.
# The multiplication sign (0xD7) and division sign (0xF7) are not included.
dicionario = {
    # ISO 8859-1 Character Entities -- upper case
    '\xc0': "&Agrave;", '\xc1': "&Aacute;", '\xc2': "&Acirc;",
    '\xc3': "&Atilde;", '\xc4': "&Auml;", '\xc5': "&Aring;",
    '\xc6': "&AElig;", '\xc7': "&Ccedil;",
    '\xc8': "&Egrave;", '\xc9': "&Eacute;", '\xca': "&Ecirc;",
    '\xcb': "&Euml;",
    '\xcc': "&Igrave;", '\xcd': "&Iacute;", '\xce': "&Icirc;",
    '\xcf': "&Iuml;",
    '\xd0': "&ETH;", '\xd1': "&Ntilde;",
    '\xd2': "&Ograve;", '\xd3': "&Oacute;", '\xd4': "&Ocirc;",
    '\xd5': "&Otilde;", '\xd6': "&Ouml;", '\xd8': "&Oslash;",
    '\xd9': "&Ugrave;", '\xda': "&Uacute;", '\xdb': "&Ucirc;",
    '\xdc': "&Uuml;",
    '\xdd': "&Yacute;",
    '\xde': "&THORN;", '\xdf': "&szlig;",
    # ISO 8859-1 Character Entities -- lower case
    '\xe0': "&agrave;", '\xe1': "&aacute;", '\xe2': "&acirc;",
    '\xe3': "&atilde;", '\xe4': "&auml;", '\xe5': "&aring;",
    '\xe6': "&aelig;", '\xe7': "&ccedil;",
    '\xe8': "&egrave;", '\xe9': "&eacute;", '\xea': "&ecirc;",
    '\xeb': "&euml;",
    '\xec': "&igrave;", '\xed': "&iacute;", '\xee': "&icirc;",
    '\xef': "&iuml;",
    '\xf0': "&eth;", '\xf1': "&ntilde;",
    '\xf2': "&ograve;", '\xf3': "&oacute;", '\xf4': "&ocirc;",
    '\xf5': "&otilde;", '\xf6': "&ouml;", '\xf8': "&oslash;",
    '\xf9': "&ugrave;", '\xfa': "&uacute;", '\xfb': "&ucirc;",
    '\xfc': "&uuml;",
    '\xfd': "&yacute;", '\xfe': "&thorn;", '\xff': "&yuml;",
}
def main():
    """Convert the file named on the command line to use HTML entities.

    Reads the file given as the first command-line argument line by line,
    replaces every non-ASCII character found in ``dicionario`` with its
    mapped value, and writes the result to ``<input name>.RC``.  Each
    converted line is also echoed to standard output.

    Returns 0 on success, 1 when no file name was given, and 2 when a file
    could not be opened, read or written.
    """
    # A missing or empty argument is a usage error rather than a silent no-op.
    if len(sys.argv) < 2 or not sys.argv[1]:
        sys.stdout.write(
            "\n\nModo de uso: toEntities.py <Nome_Do_Arquivo>\n\n" + "\n")
        return 1

    path = sys.argv[1]
    try:
        # The with-blocks close both files even when an error interrupts
        # the conversion half-way.
        with open(path, "r") as original_file:
            with open(path + ".RC", "w") as new_file:
                for line in original_file:
                    # Only characters above the ASCII range are looked up;
                    # those missing from the table pass through unchanged.
                    converted = "".join(
                        dicionario.get(char, char) if ord(char) > 127 else char
                        for char in line
                    )
                    sys.stdout.write(converted + "\n")  # scaffolding
                    new_file.write(converted)
    except IOError:
        sys.stdout.write("\n\nArquivo nao pode ser aberto...\n\n" + "\n")
        return 2
    return 0
# end main
# Run the conversion only when executed as a script, and hand main()'s
# status code to the shell instead of discarding it.
if __name__ == "__main__":
    sys.exit(main())
#EOF
import re