#!/usr/bin/python
# Read old MS FrontPage HTML document and tidy it up.
# Contains site specific functions, so the script will need to be changed somewhat
# for every site.
# Version: 20100414/ploog+ipce.info
from optparse import OptionParser
import os
import re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
# We have no options as yet, but still this is a convenient way of printing usage
#usage = "usage: %prog [options] arg1 arg2"
# Parse the command line. There are no options yet; OptionParser is used only
# because it is a convenient way of printing usage/help text.
a = OptionParser(usage="usage: %prog htmlfile",
                 description="filename should be a HTML file")
(options, args) = a.parse_args()
if len(args) != 1:
    # Exactly one positional argument (the HTML file to tidy) is required.
    print("Number of command line arguments must be 1!")
    a.print_help()
    # Leave with a non-zero status so that a calling shell/script can see that
    # nothing was processed. SystemExit is raised directly because the bare
    # exit() helper is only installed by the 'site' module.
    raise SystemExit(1)
fname = args[0]
### 'constants' for constructs we need to use in site specific code:
#
# - the font name (used in all illegal font tags which should be stripped out
# before even feeding the HTML to BeautifulSoup)
# - name of the image which should be converted to 'li' tag when found
#
## if you use this script for different sites, insert favourite way of
## distinguishing between them, here:
# The site is recognised by looking for 'ipce' anywhere in the absolute path of
# the input file; every other path gets the second set of values.
if 'ipce' in os.path.abspath(fname):
    c_common_font, c_img_bullet_re = (
        'Book Antiqua, Times New Roman, Times',
        r'expbul.?.?\.gif$',
    )
else:
    c_common_font, c_img_bullet_re = (
        'Arial, Helvetica',
        r'posbul.?.?\.gif$',
    )
### THIS REGEX IS REFERENCED AS A 'GLOBAL' INSIDE FUNCTIONS
#
rx = re.compile('^(?:\s|\ \;|\
)+$')
# Finds a non-whitespace character directly followed by a newline at the end
# of the text.
rxnl = re.compile('\\S\n$')
rxe = re.compile('(?:\s|\ \;|\
)+$')
# rs:  the complete string consists of whitespace
# rss: whitespace at the start of the string
rs, rss = re.compile(r'^\s+$'), re.compile(r'^\s+')
#NB: the \n and ' ' are probably not necessary here because they do not affect the
# rendering of the document (and taking the '\n' away as we are doing now may
# be worse for readability of the source?) ...
# ...but I'll leave it in anyway, until I'm sure. Things work now, anyway.
#NB2: above is not really correct. They should be in the regexp
# because strings can be compound, like '\r\n '
#NB3: this regex can be used on all elements - but it will match _either_ a 'br'
# _or_ a combination of anything else - because 'br's are Tags, not in a NavigableString
###
### Functions 1/3: helper functions which are pretty much general
###
# return index of element inside parent contents
def indexInParent(slf):
    """Return the position of slf inside slf.parent.contents.

    Elements are compared by identity ('is'), not by equality.
    Returns None when slf is not among its parent's contents, which would
    mean that the data structure is broken.
    """
    for position, sibling in enumerate(slf.parent.contents):
        if sibling is slf:
            return position
    return None
# move all contents out of one tag, to just before the other tag
def movecontentsbefore(fromwithin, tobefore):
    """Move all contents of fromwithin to just before tobefore.

    The contents end up as children of tobefore's parent, starting at the
    index that tobefore currently has in that parent.
    """
    destination = tobefore.parent
    position = indexInParent(tobefore)
    movecontentsinside(fromwithin, destination, position)
def movecontentsinside(fromwithin, toinside, insertindex=0, fromindex=0):
    """Move contents of fromwithin into toinside.

    fromwithin  -- element whose contents are moved
    toinside    -- element that receives them
    insertindex -- index in toinside where the first moved item is inserted;
                   following items are inserted right after it
    fromindex   -- items of fromwithin before this index stay where they are
    """
    children = fromwithin.contents
    target = insertindex
    # The item at 'fromindex' is inserted again and again until there is none
    # left. This only terminates because 'children' gets shorter on each pass;
    # presumably insert() takes the item out of its old parent -- confirm
    # against the BeautifulSoup version in use.
    while len(children) > fromindex:
        toinside.insert(target, children[fromindex])
        target += 1
def matchstring(e, rx):
    """Search str(e) with the compiled regex rx, guarding against str() errors.

    Returns False, without searching, when the __repr__() of e contains a
    '\\u' or '\\x' escape. Otherwise returns the result of rx.search(), i.e.
    a match object or None.
    """
    # e.__repr__() is called directly on purpose: according to the original
    # author it is the only conversion that never raises on unusual
    # characters (str(e) and repr(e) may). It is only used as a safety net,
    # never for the matching itself.
    printable = e.__repr__()
    if '\\u' in printable or '\\x' in printable:
        return False
    return rx.search(str(e))
# Remove all tags that only contain whitespace
# (do not remove the contents. Move the contents outside; remove the tags.)
def removetagcontainingwhitespace(tagname):
    """Remove every tagname tag in the global soup that holds only whitespace.

    A tag qualifies when each item of its contents matches the module-level
    pattern rx (a tag without contents qualifies too). The contents are not
    thrown away: they are moved to just before the tag, then the tag itself
    is extracted.
    """
    for tag in soup.findAll(tagname):
        if all(matchstring(child, rx) for child in tag.contents):
            movecontentsbefore(tag, tag)
            tag.extract()
def extractwhitespacefromend(t):
r = t.contents
while len(r):
e = r[-1]
if e.__class__.__name__ == 'Tag':
if e.__unicode__() == '
':
e.extract()
else:
extractwhitespacefromend(e)
break
elif matchstring(e, rx):
# delete whole NavigableString consisting of whitespace
e.extract()
elif matchstring(e, rxe) and not rxnl.search(str(e)):
# extract whitespace from end of NavigableString (except if it's just a newline for markup; we don't want to get everything on one line...)
s = rxe.sub('', str(e))
e.replaceWith(s)
else:
break
# Get style attribute from tag, return it as dictionary
def getstyle(t):
    """Return the 'style' attribute of tag t as a dictionary.

    t -- anything with a get('style') method (a tag)

    The style value is split on ';' into declarations and each declaration on
    its first ':' into name and value. Names are stripped and lower-cased,
    values are stripped. Returns an empty dictionary when there is no (or an
    empty) style attribute.
    """
    style = t.get('style')
    result = {}
    if style:
        for styledef in style.split(';'):
            # Skip fragments without a colon. This covers the empty fragment
            # after a trailing ';' and avoids a ValueError on the unpacking.
            if ':' not in styledef:
                continue
            # Split the individual declaration, not the whole style string.
            (sn, sv) = styledef.split(':', 1)
            result[sn.strip().lower()] = sv.strip()
    return result
###
### Functions 2/3: helper functions which have logic (like tag/attribute names)
### encoded in them
###
# Check alignments of all elements inside a certain parent element.
# If alignment of an element is explicitly specified AND equal to the specified parent
# alignment, then delete that explicit attribute
# If alignment of ALL elements is the same AND NOT equal to the specified parent
# alignment, then change the parent's alignment property IF that is allowed.
#
# NOTES:
# This function currently checks only for the 'align' attribute, which is deprecated.
# There's the 'style="text-align: ..."' which should be used instead
# We now have mangleattributes() to change one into the other, so:
# - this function currently MUST be called before mangleattributes()
# - this function should ideally be changed to check for 'whatever
# mangleattributes() changed it to' (LOOK THERE) and be called afterwards
def checkalign(pe, parentalign, pallowchange = ''):
## first: special handling for 'implicitly aligning tags', i.e.
if parentalign == 'center':
# get rid of all 'center' tags, because they do nothing.
# (you're generally better off placing its child contents at the same level now,
# so you can inspect them in one go)
r = pe.findAll('center', recursive=False)
for t in r:
movecontentsbefore(t, t)
t.extract()
al = {}
# non-whitespace NavigableStrings always have alignment equal to the parent element
# (whitespace strings don't matter; alignment can be changed without visible difference)
r = pe.findAll(text=lambda x, r=rx: r.match(x)==None, recursive=False)
if len(r):
al['inherit'] = True
# setting 'inherit' effectively means: prevent parent's alignment from being changed
## find/index alignment of all tags within pe, and process
r = pe.findAll(recursive=False)
for t in r:
s = t.__repr__()
talign = t.get('align')
if talign:
#NOTE: 'align' can also be "middle"... ignore that for now until I see it being used on non-navigation-images
thisalign = talign
allowchange = 'any'
elif s.startswith(''):
thisalign = 'center'
allowchange = parentalign
else:
thisalign = parentalign
if s.startswith('') or s.startswith('
:
if s.startswith('
'):
if 'CHANGE' in tal:
# align needs change -- which can (only) be done by deleting the tag.
movecontentsbefore(t, t)
t.extract()
else:
# 'normal' element
if 'CHANGE' in tal:
# align needs change
# (we may end up deleting it just afterwards, but this way keeps code clean)
#setattr(t, 'align', tal['CHANGE'])
t['align'] = tal['CHANGE'] ## Does this now always work? Otherwise use __setitem__()?
talign = tal['CHANGE']
if talign:
## explicit/changed alignment
if talign == parentalign:
# delete (now-)superfluous explicit 'align' attribute in tag
#delattr(t, 'align')
del t['align']
al['inherit'] = True
else:
# We're just collecting alignments 'not equal to inherited' here;
# Check after the loop what we want to do about it.
lastalign = talign
al[lastalign] = True
else:
## inherited, unchanged alignment
al['inherit'] = True
## After finding/indexing(/changing?) all 'align' from (recursive?) child tags:
#
# We can change this collection of elements' (and thus the parent's) alignment
# IF the parent's "align" property has no influence on any of its kids - i.e.
# no "inherit" was recorded.
if len(al) == 1 and ('inherit' not in al) and (pallowchange == 'any' or pallowchange == lastalign):
# All alignments are the same == lastalign.
# Indicate to caller that it should change parent's align attribute
al['CHANGE'] = lastalign
# Delete any explicit attribute because we will change the parent's.
for t in pe.findAll(align=lastalign, recursive=False):
del t['align']
return al
# Ideas for this routine:
# - if all your stuff is 'center', and more than one (and not inherit), then insert a 'center', place everything inside, and then delete all the explicit align=center from these tags
# - replace 'middle' by 'center'? (align=middle is used for pictures, I've seen sometimes.)
# Filter out attributes from a tag; change some others
#
# tagname is really a duplicate argument that could be derived from t
# but stupidly, there's no nice argument for that?
def mangleattributes(t, tagname):
    """Filter and normalise the attributes of tag t, in place.

    For every attribute of t:
    - an upper/mixed case attribute name is replaced by its lower case form;
    - 'align' is replaced by an 'align-<value>' class (this assumes that such
      classes are defined in CSS somewhere);
    - 'margin-top' is removed when tagname is 'p';
    - the class 'msonormal' (any case) is removed from 'class';
    - 'lang' is always removed;
    - in 'style', declarations that equal known defaults and all 'mso-*'
      declarations are removed; what remains is written as 'name: value; ...'.
    An attribute whose value ends up empty is deleted.

    t       -- the tag. t.attrs is iterated as (name, value) tuples; t itself
               is read, assigned and deleted from by attribute name.
    tagname -- name of the tag; only compared with 'p' here.

    Returns nothing.

    NOTE: values are compared in lower case, and a 'style' value that gets
    rewritten is written back in lower case.
    """
    # Deleting by key while iterating over t.attrs breaks off the iteration,
    # so take a snapshot of the attribute names first.
    attrs = [attr[0] for attr in t.attrs]
    for can in attrs:
        cav = t.get(can)
        an = can.lower()
        av = cav.lower()
        if an == 'align':
            # Replace this outdated attribute by a 'class="align-..."'
            # attribute, to keep explicit style attributes out of the HTML.
            sv = t.get('class')
            if sv:
                # assume this class is not yet present
                t['class'] = sv + ' align-' + av
            else:
                t['class'] = 'align-' + av
            av = ''
        elif an == 'margin-top':
            # On ploog this is present in almost every paragraph; it should
            # become a standard CSS definition instead.
            if tagname == 'p':
                av = ''
        elif an == 'class':
            # Build a new list instead of removing from the list being
            # iterated; the latter skips items, so a repeated 'MsoNormal'
            # used to survive.
            classes = [c for c in cav.split() if c.lower() != 'msonormal']
            av = ' '.join(classes)
        elif an == 'lang':
            # always remove 'lang' attributes
            av = ''
        elif an == 'style':
            kept = []
            for s in av.split(';'):
                s = s.strip()
                if s == '':
                    continue
                if ':' not in s:
                    # Malformed declaration: keep it as it is, rather than
                    # failing with a ValueError on the unpacking below.
                    kept.append(s)
                    continue
                (sn, sv) = s.split(':', 1)
                sn = sn.strip()
                sv = sv.strip()
                if sn == 'line-height':
                    if sv in ('15.1pt', '15.1 pt', '100%', 'normal'):
                        sv = ''
                elif sn == 'color':
                    if sv in ('black', '#000', '#000000'):
                        sv = ''
                elif sn == 'text-autospace':
                    if sv == 'none':
                        sv = ''
                elif sn == 'font-family':
                    if sv == 'arial' and c_common_font.find('Arial') == 0:
                        sv = ''
                elif sn == 'font-size':
                    # On ploog, '12pt' and '3' are both seen, without visible
                    # difference. The c_common_font test is a trick to strip
                    # this for the 'Arial' site only.
                    if (sv == '12pt' or sv == '3') and c_common_font.find('Arial') == 0:
                        sv = ''
                elif sn.startswith('margin'):
                    if sv.isnumeric() and float(sv) < 0.02:
                        sv = ''
                elif sn.startswith('mso-'):
                    # Office specific styles: never checked, just deleted.
                    sv = ''
                if sv:
                    # gather possibly-changed styles
                    kept.append(sn + ': ' + sv)
            av = '; '.join(kept)
        # Write back when the value has changed, or when the attribute name
        # needs to be changed to lower case.
        if an != can or av != cav.lower():
            if an != can or not av:
                del t[can]
            if av:
                t[an] = av
##### Start the action
# Read the complete input file. The 'with' block closes the file handle again,
# instead of leaving that to the garbage collector.
with open(fname) as f:
    html = f.read()
# Normalise Windows line endings, so the rest of the script only sees '\n'
html = html.replace('\r\n', '\n')
###
### Functions -I mean functionality- 3/3:
### Helper functionality that operates on the HTML (mangles it) BEFORE it
### gets parsed by BeautifulSoup.
### NOTES:
### - For now I didn't convert this to a function because it would only imply
### passing a huge 'html' string as the argument & return value
### - but now, there are a few lines of 'global' code which are executed already
### (the ones filling the html string)
###
#####
#
# Clean up screwy HTML before parsing - #1:
#
# Strip superfluous font tag, because FrontPage does stuff
# like , which makes HTMLTidy wrongly
# 'correct' stuff that would be fine if those font tags weren't there.
# Also, accommodate for recursive font tags... because _in between_
# these idiotic tags there may be legit ones.
tagToStrip = ''
tagLenEnd = len('')
pos = 0
found = []
while True:
# Find a font start/end tag pair, without any other font tags in between
# Do this by searching for an end tag, and then storing all the start tags
# leading up to it.
pe = html.find('', pos)
if pe == -1:
break
#print 'end: ' + str(pe)
# Find zero or more start tags and store them all
ps = html.find(' .... ...
by putting inside
# (if not, BeatifulSoup will put a
before the which will mess up formatting)
rx1 = re.compile('\(\s*\)(.*?)\<\/b>', re.S)
for r in rx1.finditer(html):
if r.group(2).find('/p>') == -1:
html = html[:r.start()] + r.group(1) + '' + html[r.start(2):]
# since html stays just as long, the finditer will be OK?
##############
###
### Now do the tidying work, using BeautifulSoup
soup = BeautifulSoup(html)
# Take all script tags out of the document
r = soup.findAll('script')
for e in r:
    e.extract()
# Take all comments out of the document
r = soup.findAll(text=lambda text: isinstance(text, Comment))
for e in r:
    e.extract()
#Replace b->strong and i->em, for XHTML compliance
# and so that we're sure we are not skipping tags in the code below
# All 'b' tags are handled first, then all 'i' tags. Each old tag is replaced
# by a newly created tag at the same position; only the contents are moved
# over (the new tag is created without attributes).
for oldname, newname in (('b', 'strong'), ('i', 'em')):
    r = soup.findAll(oldname)
    for t in r:
        e = Tag(soup, newname)
        t.parent.insert(indexInParent(t), e)
        movecontentsinside(t, e)
        t.extract()
# Remove stupid MSFT 'o:p' tags. Apparently it is best for the document flow,
# if we get rid of some markup whitespace (not  ) inside these tags too...
rx1 = re.compile(r'^\s+')
rx2 = re.compile(r'\s+$')
r = soup.findAll('o:p')
for t in r:
    r2 = t.contents
    # Trim whitespace from the first item (index 0, pattern rx1) and then from
    # the last item (index -1, pattern rx2). An item that becomes empty is
    # extracted, so r2 has to be checked for emptiness again each time.
    for edge, edge_rx in ((0, rx1), (-1, rx2)):
        if len(r2) and matchstring(r2[edge], edge_rx):
            s = edge_rx.sub('', r2[edge])
            if s == '':
                r2[edge].extract()
            else:
                r2[edge].replaceWith(s)
    # Whatever is left moves to just before the tag; then the tag goes.
    if len(r2):
        movecontentsbefore(t, t)
    t.extract()
# Remove tags that only contain whitespace
# (do not remove the contents. Move the contents outside; remove the tags.)
for wsname in ('strong', 'em', 'font'):
    removetagcontainingwhitespace(wsname)
#NO. Don't do this. Keep the 'b's outside the 'a's... Keep this code for reference, maybe later...
#
# links are rendered in bold, by default.
# Some links have a 'b' around it, which makes no visual difference but
# is an inconsistency in the document structure. Remove it.
#r = soup.findAll('a')
#for e in r:
# s = e.parent.__repr__()
# if s[0:3] == '' and s[-4:] == '':
# the 'b' may have more content than just the link. As long as that's all
# whitespace, there is still no difference in taking it away.
# ok = 1
# for ee in e.parent.contents:
# if ee != e and not(matchstring(ee, rx)):
# ok = 0
# break
# if ok:
# ee = e.parent
# movecontentsbefore(ee, ee)
# ee.extract()
# Some 'a' tags have 'b' tags surrounding them, and some have 'b' tags inside them.
# Normalize this;
r = soup.findAll('a')
for t in r:
    r1 = t.findAll('strong', recursive=False)
    if not r1:
        # no 'strong' directly inside this link: nothing to normalise
        continue
    r2 = t.findAll(recursive=False)
    if len(r1) == len(r2) and len(t.findAll(text=lambda x, r=rx: r.match(x)==None, recursive=False)) == 0:
        # Every direct child tag is a 'strong' and every direct string is
        # whitespace: unwrap the 'strong' tag(s) inside the link...
        for e in r1:
            movecontentsbefore(e, e)
            e.extract()
        # ...and put one new 'strong' around the link instead, at the
        # position the link had in its parent.
        e = Tag(soup, 'strong')
        t.parent.insert(indexInParent(t), e)
        e.insert(0, t)
# remove whitespace at end of paragraphs
r = soup.findAll('p')
for t in r:
    # strips trailing whitespace from this paragraph (see the function)
    extractwhitespacefromend(t)
# remove whitespace just before
# Strictly we only need to move 'HTML whitespace' ( ), but
# that may be followed by a separate NavigableString holding only '\n'
rxb = re.compile('(?:\ \;|\s)+$')
r= soup.findAll('br')
for t in r:
e = t.previousSibling
while e != None and matchstring(e, rxb):
# already store 'previous previous', so we can safely extract it
# (also works around us not knowing whether extract() will actually get rid of a '\n')
ee = e.previousSibling
s = rxb.sub('', e)
if s == '':
e.extract()
else:
e.replaceWith(s)
e = ee
# when inside a paragraph, replace (exactly) two consecutive br's by a paragraph ending/start
r= soup.findAll('br')
for t in r:
# Thanks to previous, newlines before brs have gone so we can just do nextSibling
t2 = t.nextSibling
if t2.__repr__() == '
':
e = t.previousSibling
if e.__repr__() != '
':
e = t2.nextSibling
if e.__repr__() != '
':
pe = t.parent
s = pe.__repr__()
if s.startswith('') or s.startswith('
structures.
rxb = re.compile(c_img_bullet_re)
r = soup.findAll('table')
for t in r:
r_tr = t.findAll('tr', recursive=False)
all_bullets = 1
for tr in r_tr:
if all_bullets:
all_bullets = 0
r_td = tr.findAll('td', recursive=False)
if len(r_td) == 2:
# Inspect the first 'td':
# needs to contain only one 'img' tag.
# (I don't know how to determine the tag of an element, so do duplicate findAll())
#r_cont = r_td[0].findAll()
#if len(r_cont) == 1 and len(r_td[0].findAll('img')) == 1:
r_cont = filter(lambda x: x != '\n', r_td[0].contents)
if len(r_cont) == 1:
s = r_cont[0].__repr__()
if s[0:5] == '
':
# When is this a bullet point? Look at 'src' tag. That'll do.
# Is a relative path, so look only at the end.
s = r_cont[0]['src']
if rxb.search(s):
all_bullets = 1
# After looping through everything, we know if this table contains 'only bullet points'
if all_bullets:
# insert ul just before the table
# (If some of the siblings are NavigableStrings, not inside an element...
# this actually misplaces stuff and the ul may be inserted _before_ a string
# when it should be inserted after. I don't know a solution for this atm.)
e = Tag(soup, 'ul')
l = indexInParent(t)
t.parent.insert(l, e)
# insert li's and move all the contents from the second td's into there
# (Is it always legal to just 'dump everything' inside a li? Let's hope so.)
i = 0
for tr in r_tr:
ee = Tag(soup,'li')
e.insert(i, ee)
r_td = tr.findAll('td', recursive=False)
#r_cont = r_td[1].findAll()
# In the preceding code we used findAll() because we assumed that there are no
# loose NavigableStrings in between tr's or td's.
# However with the contents of the second td, we can't take that chance.
r_cont = filter(lambda x: x != '\n', r_td[1].contents)
if len(r_cont) == 1:
s = r_cont[0].__repr__()
# Remark: yes, we should allow for other whitespace (e.g. newline) behind the p...
# But not right now. We're only doing this nasty frontpage html and it'll do.
if (s[0:3] == '') and s[-4:] == '
':
# inside the 'td' there's exactly one paragraph.
# insert the contents of the paragraph, instead of the paragraph itself.
movecontentsinside(r_cont[0], ee)
else:
# any other case: just insert all contents of the 'td'
movecontentsinside(r_td[1], ee)
else:
movecontentsinside(r_td[1], ee)
extractwhitespacefromend(ee)
ee = NavigableString('\n')
e.insert(i + 1, ee)
i = i + 2
t.extract()
# Delete/change superfluous 'align' attributes (and tags sometimes)
checkalign(soup.body, 'left')
# replace 'font color=' with 'span color=' -- it's more XHTML compliant and no hassle
# replace 'font' tags with style attributes. First look if there is a single
# encompassing div/span/p, then look whether font encompasses a single one, otherwise create a 'span' tag in place.
r = soup.findAll('font')
for t in r:
e = None
innerdest = False
ee = t.parent
s = t.__repr__()
if s.startswith('') or s.startswith('
') or s.startswith('') or s.startswith('') or s.startswith('
') or s.startswith('') or s.startswith(')')
# button links. Usually there's one to the index page but not always
# the 'nieuw' is for the p-loog index page which doesn't
rx2 = re.compile('^\
]+\/\>$')
# Strip 'whitespace' from the start of the body.
# The loop keeps inspecting r[i]; an extract() shortens r, so i never changes.
# v records what may still be removed (see the tests on v below):
#    3 = neither the buttons paragraph nor the title image paragraph was removed
#    2 = buttons paragraph removed (v -= 1); a title image paragraph may follow
#    1 = title image paragraph removed (v -= 2); a buttons paragraph may follow
#    0 = both removed; only whitespace / empty paragraphs are still removed
#   -1 = stop
# NOTE(review): the definitions of rx1, rx2 and rx3 are not fully readable at
# this point; judging by the comments, rx1 matches a paragraph, rx2 the button
# links and rx3 the title image -- confirm.
# NOTE(review): r[i] is not bounds-checked; presumably this raises IndexError
# when the body runs out of content -- confirm.
r = soup.body.contents
if str(r[0]) == '\n':
# we want the first newline to remain there,
# so the body tag will be on a line by itself
i = 1
else:
i = 0
v = 3
while v >= 0:
# find whitespace _before_ the real content
# this will likely be 'markup whitespace' (newlines) that are unnecessary now
# Removing 'HTML whitespace' (like breaks/nbsp) has its effect on the actual page -but delete it anyway. I think we want to unify 'space at the start' anyway.
if matchstring(r[i], rx):
r[i].extract() ### This actually changes r
elif matchstring(r[i], rx1):
if len(r[i].contents) == 0:
# extract empty paragraph at start -- that's just as "whitespace" as the above
r[i].extract()
elif v:
if v == 3 or v == 1:
# look for the buttons (only once) - sometimes these are above, sometimes below the title image
e = r[i].findNext()
if matchstring(e, rx2):
r[i].extract()
v -= 1
# if, right after the paragraph with buttons, there's again a paragraph containing links to categories... delete that too.
# OR NOT? ... leave that idea for now...
continue
if v == 3 or v == 2:
# look for a header title image (which is superfluous because the title's also in the page)
rr = r[i].findAll()
if len(rr) == 1 and matchstring(rr[0], rx3):
r[i].extract()
v -= 2
continue
v = -1 # other nonempty paragraph
else:
v = -1 # other nonempty paragraph while v==0
else:
v = -1 # other tag/NavigableString
# Last
# NB: this is partly effectively extractwhitespacefromend() - but intermixed with empty tags too
# Strip 'whitespace' from the end of the body.
# r is indexed from the end: i is -1, or -2 when the final newline has to stay.
# An extract() shortens r, so i never changes.
# v records what may still be removed:
#   2 = a buttons paragraph (matched through rx2) may still be removed
#   1 = that paragraph has been removed; only whitespace / empty paragraphs
#       are still removed
#   0 = stop
# NOTE(review): r[i] is not bounds-checked; presumably this raises IndexError
# when the body runs out of content -- confirm.
v = 2
if str(r[-1]) == '\n':
# we want the last newline to remain there,
# so the body tag will be on a line by itself
i = -2
else:
i = -1
while v:
if matchstring(r[i], rx):
r[i].extract()
#i = i - 1
elif matchstring(r[i], rx1):
if len(r[i].contents) == 0:
r[i].extract()
#i = i - 1
elif v == 2:
e = r[i].findNext()
if matchstring(e, rx2):
r[i].extract()
#soup.body.findAll(recursive=False)[-1].extract()
#i = i - 1
v = 1
else:
v = 0 # other nonempty paragraph; quit
else:
v = 0 # other nonempty paragraph while v==1
else:
v = 0 # other tag/NavigableString
###TODO: remove unnecessary html entities like &ldquo, or stuff?
# no, FG might object to "different" quotes?
print soup