########################################################################
# PDF -> EPUB conversion
# Copyright (C) 2013 Daniel Beer
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
# PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
########################################################################
# Tuned for "A Mathematician's Apology", by G.H. Hardy, available in
# PDF format from:
#
# http://www.math.ualberta.ca/mss/
########################################################################
import xml.sax, sys, cgi, math, re
########################################################################
# Take XML input from pdf2txt.py and feed it to a SAX parser which will
# extract pages of unordered glyphs.
########################################################################
def runFilters(filep):
parser = xml.sax.make_parser()
parser.setContentHandler(GlyphExtractor())
parser.parse(filep)
########################################################################
# Parse the output of pdf2txt.py and produce a stream of tuples
# containing glyphs with their positions, fonts and sizes
########################################################################
class GlyphExtractor(xml.sax.ContentHandler):
def __init__(self):
self.handler = LineGrouper()
def startElement(self, name, attrs):
if name == 'text':
self.textattrs = attrs
elif name == 'page':
self.handler.page()
def characters(self, content):
if content == '\n':
return
try:
fontBits = self.textattrs['font'].split(',')
if len(fontBits) < 2:
fontBits.append(None)
bbox = map(float, self.textattrs['bbox'].split(','))
fontBits.append(int(round(float(self.textattrs['size']))))
except:
return
self.handler.glyph(tuple(bbox), tuple(fontBits), content)
def endDocument(self):
self.handler.end()
class GlyphExtractorPrinter():
def page(self):
print 'page'
def glyph(self, bbox, font, char):
print (' glyph: bbox=%s, font=%s, char=%r' % \
('(%.02f, %.02f, %.02f, %.02f)' % bbox,
'(%s, %s, %d)' % font, char)).encode('utf-8')
def end(self):
print '<<>>'
########################################################################
# Take the output of the glyph extractor and then identify common
# baselines within each page. Emit glyphs grouped by line, then by page.
# Lines are ordered top-to-bottom, but glyphs are unordered within each
# line.
########################################################################
class LineGrouper():
def __init__(self):
self.handler = ChunkJoiner()
self.glyphs = None
def emit_page(self):
if self.glyphs is None:
return
lines = {}
for bi, glyph in self.glyphs:
klass = self.get_class(bi)
if not lines.has_key(klass):
lines[klass] = []
lines[klass].append(glyph)
klasses = lines.keys()
klasses.sort(lambda a, b: cmp(self.baselines[a], self.baselines[b]))
klasses.reverse()
self.handler.page()
for klass in klasses:
bl = self.baselines[klass]
self.handler.line(bl)
for bottom, span, font, char in lines[klass]:
lift = int(round(bottom - bl))
self.handler.glyph(span, font + tuple([lift]), char)
self.glyphs = None
def page(self):
self.emit_page()
self.glyphs = []
self.eqClass = {}
self.baselines = {}
# Obtain the class to which the given index currently belongs.
# Initially, all indices belong solely to distinct classes.
def get_class(self, bi):
if not self.eqClass.has_key(bi):
return bi
cl = self.get_class(self.eqClass[bi])
self.eqClass[bi] = cl
return cl
# Record the given baseline value as being associated with the class
# idx. We keep track of the smallest baseline seen in each class.
def notice_baseline(self, idx, value):
if not self.baselines.has_key(idx) or \
self.baselines[idx] > value:
self.baselines[idx] = value
# Join two classes
def join_classes(self, a, b):
if a < b:
self.eqClass[b] = a
self.notice_baseline(a, self.baselines[b])
elif b > a:
self.eqClass[a] = b
self.notice_baseline(b, self.baselines[a])
def glyph(self, bbox, font, char):
left, bottom, right, top = bbox
bi = int(math.ceil(bottom))
ti = int(math.floor(top))
klass = self.get_class(bi)
self.notice_baseline(klass, bottom)
if char != ' ':
for i in xrange(bi + 1, (bi + ti + ti) / 3):
ic = self.get_class(i)
self.notice_baseline(ic, bottom)
self.join_classes(klass, ic)
self.glyphs.append((klass, (bottom, (left, right), font, char)))
def end(self):
self.emit_page()
self.handler.end()
class LineGrouperPrinter():
def page(self):
print 'page'
def line(self, base):
print ' line: base=%.02f' % base
def glyph(self, span, font, char):
print (' glyph: span=%s, font=%s, char=%r' % \
('(%.02f, %.02f)' % span,
'(%s, %s, %d, %d)' % font, char)).encode('utf-8')
def end(self):
print '<<>>'
########################################################################
# Take the output of line grouping, and within each line, sort and join
# glyphs into substring chunks with common styling. Chunks are ordered
# by left edge.
########################################################################
GLUE_MARGIN = 20.0
class ChunkJoiner:
def __init__(self):
self.handler = Depaginator()
self.glyphs = None
def emit_line(self):
if self.glyphs is None:
return
self.glyphs.sort(lambda a, b: cmp(a[0][0], b[0][0]))
chunkText = ''
chunkFont = None
chunkLeft = None
chunkRight = None
self.handler.line(self.baseline)
for (l, r), font, char in self.glyphs:
if chunkFont != font or abs(chunkRight - l) > GLUE_MARGIN:
if chunkText != '':
self.handler.chunk((chunkLeft, chunkRight),
chunkFont, chunkText)
chunkText = ''
chunkFont = font
chunkLeft = l
chunkRight = r
chunkText += char
if chunkText != '':
self.handler.chunk((chunkLeft, chunkRight),
chunkFont, chunkText)
self.glyphs = None
def page(self):
self.emit_line()
self.handler.page()
def line(self, baseline):
self.emit_line()
self.baseline = baseline
self.glyphs = []
def glyph(self, span, font, char):
self.glyphs.append((span, font, char))
def end(self):
self.emit_line()
self.handler.end()
class ChunkJoinerPrinter:
def page(self):
print 'page'
def line(self, base):
print ' line: base=%.02f' % base
def chunk(self, span, font, text):
print (' chunk: span=%s, font=%s, text=%r' % \
('(%.02f, %.02f)' % span,
'(%s, %s, %d, %d)' % font, text)).encode('utf-8')
def end(self):
print '<<>>'
########################################################################
# Take the output of the chunk joiner, and depaginate the document. We
# classify lines based on content and position to identify:
# - title-page material
# - page numbers
# - footnotes
# Page numbers are thrown away, and footnotes are reassembled at the end
# of the document. Pages and baselines disappear, and we emit sections,
# consisting of lines of chunks.
########################################################################
SPECIAL_PAGES = ['title', 'copyright', 'dedication', 'preface']
class Depaginator:
def __init__(self):
self.chunks = None
self.footnotes = []
self.page_number = -1
self.handler = Unwrapper()
def emit_chunks(self):
if self.chunks is None:
return
chunks = self.chunks
self.chunks = None
# Throw away the footer
if self.baseline < 170.0:
return
# Throw away lines with no content
haveContent = False
for x in chunks:
if x[2].rstrip() != '':
haveContent = True
if not haveContent:
return
if self.isFootnote:
self.footnotes.append(chunks)
else:
self.handler.line()
for span, font, text in chunks:
self.handler.chunk(span, font, text)
def page(self):
self.emit_chunks()
self.page_number += 1
self.isFootnote = False
if self.page_number < len(SPECIAL_PAGES):
self.handler.section(SPECIAL_PAGES[self.page_number])
elif self.page_number == len(SPECIAL_PAGES):
self.handler.section('main')
def line(self, baseline):
self.emit_chunks()
self.chunks = []
self.baseline = baseline
def chunk(self, span, font, text):
if span[0] < 130.0 and font[2] == 9:
self.isFootnote = True
self.chunks.append((span, font, text))
def emit_footnotes(self):
self.handler.section('footnotes')
for line in self.footnotes:
self.handler.line()
for span, font, text in line:
self.handler.chunk(span, font, text)
def end(self):
self.emit_chunks()
self.emit_footnotes()
self.handler.end()
class DepaginatorPrinter:
def section(self, ident):
print 'section: ident=%r' % ident
def line(self):
print ' line'
def chunk(self, span, font, text):
print (' chunk: span=%s, font=%s, text=%r' % \
('(%.02f, %.02f)' % span,
'(%s, %s, %d, %d)' % font, text)).encode('utf-8')
def end(self):
print '<<>>'
########################################################################
# Take the output of the depaginator, and rejoin lines into paragraphs.
# Dehyphenate and add spacing as necessary
########################################################################
PARA_INDENT = 16.0
PARA_WOBBLE = 2.0
BLOCKQUOTE_EDGE = 160.0
RIGHT_JUSTIFY = 450.0
class Unwrapper:
def __init__(self):
self.handler = HTMLGenerator()
self.paragraph = []
self.chunks = []
def dehyphenate(self):
if len(self.paragraph) <= 0:
return
font, text = self.paragraph.pop()
if len(text) > 0 and text[len(text) - 1] == '-':
self.paragraph.append((font, text[:len(text) - 1]))
elif text == '\n':
self.paragraph.append((font, text))
else:
self.paragraph.append((font, text.rstrip() + ' '))
def rstrip_paragraph(self):
font, text = self.paragraph.pop()
if len(text) > 0:
self.paragraph.append((font, text.rstrip()))
def emit_paragraph(self):
if len(self.paragraph) <= 0:
return
self.rstrip_paragraph()
self.handler.paragraph(self.leftEdge)
for font, text in self.paragraph:
self.handler.chunk(font, text)
self.paragraph = []
def emit_line(self):
if len(self.chunks) <= 0:
return
# Look for paragraph breaks
lineEdge = self.chunks[0][0][0]
if len(self.paragraph) > 0:
if lineEdge > self.lastLineEdge + PARA_WOBBLE or \
lineEdge < (self.lastLineEdge - PARA_INDENT):
self.emit_paragraph()
elif abs(lineEdge - self.leftEdge) < 0.1 and \
lineEdge > BLOCKQUOTE_EDGE:
self.rstrip_paragraph()
self.paragraph.append((self.chunks[0][1], '\n'))
elif self.lastRightEdge < RIGHT_JUSTIFY:
self.emit_paragraph()
self.lastLineEdge = lineEdge
if len(self.paragraph) == 0:
self.leftEdge = lineEdge
# Feed in line chunks
self.dehyphenate()
for (l, r), font, text in self.chunks:
self.paragraph.append((font, text))
self.lastRightEdge = r
self.chunks = []
def section(self, ident):
self.emit_line()
self.emit_paragraph()
self.handler.section(ident)
def line(self):
self.emit_line()
def chunk(self, span, font, text):
self.chunks.append((span, font, text))
def end(self):
self.emit_line()
self.emit_paragraph()
self.handler.end()
class UnwrapperPrinter:
def section(self, ident):
print 'section: ident=%r' % ident
def paragraph(self, edge):
print ' paragraph: edge=%.02f' % edge
def chunk(self, font, text):
print (' chunk: font=%s, text=%r' % \
('(%s, %s, %d, %d)' % font, text)).encode('utf-8')
def end(self):
print '<<>>'
########################################################################
# Take unflowed paragraphs and produce an HTML document.
########################################################################
# Chunk attributes
ATTR_SUPERSCRIPT = 0x01
ATTR_ITALIC = 0x02
# Paragraph attributes
ATTR_BLOCKQUOTE = 0x01
ATTR_HEADING = 0x02
ATTR_FOOTNOTE = 0x04
class HTMLGenerator:
def __init__(self):
self.handler = OutputWriter()
self.chunk_text = []
self.para_chunks = []
def section(self, ident):
self.emit_chunk()
self.emit_paragraph()
self.current_section = ident
self.handler.block(ident.replace(' ', '_'))
def paragraph(self, edge):
self.emit_chunk()
self.emit_paragraph()
self.para_attr = 0
if edge > 150.0 and self.current_section == 'main':
self.para_attr |= ATTR_BLOCKQUOTE
def emit_chunk(self):
if len(self.chunk_text) == 0:
return
self.para_chunks.append((self.chunk_attr, ''.join(self.chunk_text)))
self.chunk_text = []
def emit_paragraph(self):
if len(self.para_chunks) == 0:
return
raw_text = ''.join([text for _, text in self.para_chunks])
if self.para_attr & ATTR_HEADING:
self.para_chunks = []
name = (self.current_section + ' ' + raw_text).replace(' ', '_')
self.handler.block(name)
self.handler.data('' % name)
self.handler.data('')
self.handler.data(cgi.escape(raw_text))
self.handler.data('
')
return
# Hack to look for math-like paragraphs
if re.match('^[A-Za-z]?([^A-Za-z]+[A-Za-z])*[^A-Za-z]*$', raw_text):
self.para_attr |= ATTR_BLOCKQUOTE
if self.para_attr & ATTR_BLOCKQUOTE:
self.handler.data('')
self.handler.data('')
for attr, text in self.para_chunks:
if text == '\n':
self.handler.data('
')
elif text == '':
pass
else:
if attr & ATTR_ITALIC != 0:
self.handler.data('')
if attr & ATTR_SUPERSCRIPT != 0:
self.handler.data('')
if attr & ATTR_FOOTNOTE:
if self.current_section == 'footnotes':
dst = 'fnsrc_'
src = 'fndef_'
else:
src = 'fnsrc_'
dst = 'fndef_'
self.handler.data('' % (src, text))
self.handler.data('' % (dst, text))
self.handler.data(cgi.escape(text))
if attr & ATTR_FOOTNOTE:
self.handler.data('')
if attr & ATTR_SUPERSCRIPT != 0:
self.handler.data('')
if attr & ATTR_ITALIC != 0:
self.handler.data('')
self.handler.data('
')
if self.para_attr & ATTR_BLOCKQUOTE:
self.handler.data('
')
self.para_chunks = []
def chunk(self, font, text):
face, style, size, lift = font
attr = 0
if text == '\n':
self.emit_chunk()
self.para_chunks.append((0, '\n'))
return
if style == 'Italic':
attr |= ATTR_ITALIC
if lift >= size / 2:
attr |= ATTR_SUPERSCRIPT
if face == 'TimesNewRoman' and \
((self.current_section == 'footnotes' and size == 9 and lift >= 5) or
(self.current_section == 'main' and size == 12 and lift == 8)):
attr |= ATTR_FOOTNOTE
if self.chunk_text != [] and self.chunk_attr != attr:
self.emit_chunk()
if face == 'EDONCD+cmr10' and size == 18:
self.para_attr |= ATTR_HEADING
self.chunk_attr = attr
self.chunk_text.append(text)
def end(self):
self.emit_chunk()
self.emit_paragraph()
self.handler.end()
class HTMLGeneratorPrinter:
def __init__(self):
self.have_line = False
def end_line(self):
if self.have_line:
sys.stdout.write('\n')
self.have_line = False
def block(self, name):
self.end_line()
sys.stdout.write('' % name)
def data(self, text):
sys.stdout.write(text.encode('utf-8'))
self.have_line = True
def end(self):
self.end_line()
sys.stdout.write('\n')
########################################################################
# Regex rewriter. This takes text from the HTML generator, organized
# into blocks, and applies various regexes to it to fix minor issues.
########################################################################
OUTPUT_PREAMBLE = """
A Mathematician's Apology
"""
OUTPUT_POSTAMBLE = """
"""
OUTPUT_GLOBAL = [
(u'\u00a7([0-9]*)', u'\u00a7\\1'),
('\(cid:34\)', u'\u22ef'),
('101010', '101010')
]
OUTPUT_LOCAL = {
'title': [
('(.*Apology)
', '\\1
'),
('(G. H. Hardy)
', '\\1')
],
'copyright': [
('^', '
'),
('
', ' '),
('by the
', 'by the '),
('Web at
', 'Web at '),
('$',
'
HTML generated by ama_generate.py
, '
'2 Jan 2013, http://www.dlbeer.co.nz/
'),
('(http://[a-z\./]+)', '\\1')
],
'dedication': [('^', '
')],
'preface_Preface': [('18July1940', '18 July 1940')],
'footnotes': [('^', 'Footnotes
')],
'main_7': [
('(Against the fall of night\?)
', '\\1')
],
'main_13': [
('(ality.*is fraction )', '\\1'),
('in the form.*.*2;(.* that the equation )',
'in the form (a/b)2;\\1'),
('is fraction .*, where.*are integers',
'is a fraction (a/b), where ' +
'a and b are integers'),
('the equation \(B\)', 'the equation
(B)'),
('b([a-z])', 'b \\1'),
('(
\(B\) a2=2b2
)',
'\\1
'),
# Missing square-root glyphs
('(irrationality. of )2', u'\\1\u221a2'),
('(2 is irrational)', u'\u221a\\1'),
('(2 cannot be expressed)', u'\u221a\\1')
],
'main_14': [
('3,5,11,13,17', u'\u221a3,\u221a5,\u221a11,\u221a13,\u221a17'),
('3', u'3\u221a'),
('(314159265)', '\\1')
],
'main_18': [
('profoundly15',
'profoundly'
'15')
]
}
def do_subs(text, subs):
for pat, repl in subs:
text = re.sub(pat, repl, text)
return text
class OutputWriter:
def __init__(self):
self.bits = []
sys.stdout.write(OUTPUT_PREAMBLE)
def emit_block(self):
if len(self.bits) == 0:
return
text = ''.join(self.bits)
self.bits = []
text = do_subs(text, OUTPUT_LOCAL.get(self.block_name, []))
text = do_subs(text, OUTPUT_GLOBAL)
print text.encode('utf-8')
def block(self, name):
self.emit_block()
self.block_name = name
def data(self, text):
self.bits.append(text)
def end(self):
self.emit_block()
sys.stdout.write(OUTPUT_POSTAMBLE)
########################################################################
# Entry point
########################################################################
if __name__ == '__main__':
runFilters(sys.stdin)