#!/usr/bin/env python
"""
'[this-program] x.html' makes x-clean.xhtml with the hideous HTML of (in my case) OpenOffice cleansed, plus it makes x.epub
Now also makes x.zip for publishing on the Kindle.

Customize the set of options below for your book.

Use OpenOffice Writer:File:Save As - HTML, not Export HTML. Export produces xhtml, but it's not even valid, at least according to the epub validator. I also export from the version without cover images embedded because this script lets you specify them.

To do:
- doesn't handle the table in the Reason chapter, or the 2 code examples
- maybe automate pulling html from OO
- at least automatically end part pattern matches where the next pattern starts
- apparently all images should be in an images subfolder and the css should be in a css subfolder

References:
http://en.wikipedia.org/wiki/Epub
http://www.jedisaber.com/eBooks/tutorial.asp
Use http://www.threepress.org/document/epub-validate/ to validate the epub
http://kindleformatting.com/formatting.php

Possible alternatives:
https://www.bookglutton.com/api/convert.html -- seems not free; don't know how well it works
http://code.google.com/p/epub-tools/ -- seems to be Java (ugh!) tools with little documentation; too annoying to try
http://www.feedbooks.com/ -- you have to paste each chapter in, and i don't see how that works for images; seems very limited
http://www.smashwords.com/ -- Only Word .doc to various formats; ugly and looks annoying
eScape -- Seems to require you to create a new doc, paste yours in, then fiddle with using their styles
"""

__author__ = 'Patrick Roberts'
__copyright__ = 'Copyright 2010 Patrick Roberts'
__license__ = 'Python'
__version__ = '1.2'


from itertools import *
from StringIO import StringIO
import operator, os, re, subprocess, sys, time, zipfile


# >>>>>>>>>>>>>>> Customize these for your book <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Wrapper that centers a matched part on the page.
_CENTERED = '<div class="center">%s</div>'

# Book metadata plus the rules that split the cleaned document into parts.
# Each part_descr entry may carry:
#   pattern  -- regex run over the cleaned x-clean.xhtml text (required)
#   title    -- table-of-contents title; when absent the pattern's first
#               capture group is used instead
#   template -- %-format string wrapped around the matched text
#   once     -- when True only the first match becomes a part
info = {
    'title': 'Mind Making',
    'author': 'Patrick Roberts',
    'rights': 'Copyright Patrick Roberts',
    'publisher': 'Patrick Roberts',
    'ISBN': '978-1449921880',
    'subject': 'Philosophy',
    'description': "The shared laws of natural and artificial intelligence.",
    'date': time.strftime('%Y-%m-%d'),
    'front_cover': 'mindmaking-high.png',
    'back_cover': 'mindmaking-back-high.png',
    'part_descr': [
        {
            'pattern': r'(?is)<p>For.*?(?=<h\d)',
            'title': 'Dedication',
            'template': _CENTERED,
            'once': True,
        },
        {
            'pattern': r'(?is)<h1>([^<]+)</h1>.*?(?=<h[12]>|</body>|$)',
        },
        {
            'pattern': r'(?is)<h2><strong>Thanks.*?</div>',
            'title': 'Thanks',
            'template': _CENTERED,
            'once': True,
        },
        {
            'pattern': r'(?is)<div class="center"><p><img src="mindmaking-3d.*?</ol>',
            'title': 'Next',
            'once': True,
        },
        {
            'pattern': r'(?is)<div class="center"><p><img src="headshot.*?(?=</body)',
            'title': 'Author',
            'template': _CENTERED,
            'once': True,
        },
    ],
}


class MyZipFile(zipfile.ZipFile):
    """ZipFile whose writestr() chooses the compression method per entry.

    This lets one archive mix stored and deflated entries (the epub
    'mimetype' entry is written with zipfile.ZIP_STORED, everything else
    deflated).
    """

    def writestr(self, name, s, compress=zipfile.ZIP_DEFLATED):
        """Add the string ``s`` to the archive.

        name     -- archive path of the new entry, or a ready-made
                    zipfile.ZipInfo, which is written as given (its own
                    compress_type wins and ``compress`` is ignored), matching
                    the base-class contract
        s        -- contents of the entry
        compress -- zipfile.ZIP_DEFLATED (default) or zipfile.ZIP_STORED
        """
        if isinstance(name, zipfile.ZipInfo):
            zipinfo = name
        else:
            # Stamp the entry with the current local time (year..second).
            zipinfo = zipfile.ZipInfo(name, time.localtime(time.time())[:6])
            zipinfo.compress_type = compress
            # A freshly built ZipInfo has external_attr == 0, i.e. no permission
            # bits at all; mark the entry rw-r--r-- (octal 644 == 0x1A4) so
            # extracted files are readable.
            zipinfo.external_attr = 0x1A4 << 16
        # Explicit base-class call: works for old- and new-style classes alike.
        zipfile.ZipFile.writestr(self, zipinfo, s)


if __name__ == '__main__':
    # Command-line entry point.  Given x.html this:
    #   1. normalizes the markup (HTML Tidy plus a series of regex clean-ups)
    #      and saves the result as x-clean.xhtml,
    #   2. cuts the cleaned text into parts using info['part_descr'],
    #   3. writes x.epub (one XHTML file per part, OPF manifest, NCX table of
    #      contents, stylesheet, images) and x.zip (a single book.html plus
    #      stylesheet and images, for the Kindle).
    if len(sys.argv) < 2:
        sys.exit('Usage: %s x.html' % sys.argv[0])
    path = sys.argv[1]
    base = os.path.splitext(path)[0]

    src = open(path)
    try:
        s = src.read()
    finally:
        src.close()

    s = re.sub(r'(?i)<(/?)i>', r'<\1em>', s) # tidy -c removes italics tags
    s = re.sub(r'(?i)<(/?)b>', r'<\1strong>', s) # tidy -c removes bold tags

    if 1: # set to 0 to skip HTML Tidy
        try:
            # An argument list (no shell) makes a missing tidy raise OSError instead
            # of silently producing empty output.  communicate() feeds stdin and
            # drains stdout together, so a large document cannot block on a full pipe.
            p = subprocess.Popen(['tidy', '-c', '-w', '-asxhtml'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            tidied = p.communicate(s)[0]
        except EnvironmentError as e:
            print(e)
            print("Warning: Input left untidy because I couldn't find HTML Tidy - http://tidy.sourceforge.net/")
        else:
            if tidied.strip():
                s = tidied
            else:
                # Keep the original text rather than building an empty book.
                print("Warning: Input left untidy because HTML Tidy produced no output (exit status %s)" % p.returncode)

    # --- regex clean-up of the (tidied) markup ---
    s = re.sub(r'(?si)<p[^>]*>[\s\r\n]*', '<p>', s) # drop <p> attributes and leading whitespace
    s = re.sub(r'(?i)<h(\d) [^>]*>', r'<h\1>', s) # drop heading attributes
    s = re.sub('(?i)<br[^>]*>', '<br />', s) # <br clear="left" />
    s = re.sub('(?i)(<br />)+', '<br />', s) # collapse runs of line breaks
    s = re.sub('(?i)<a[^>]*></a>', '', s) # remove empty anchors
    # Strip span/font/div/u tags (keeping their contents) and empty headings.
    junk_tags = ['</?span[^>]*>', '</?font[^>]*>', '</?div[^>]*>', '</?u>', r'<h\d></h\d>']
    s = re.sub('(?i)%s' % '|'.join(['(%s)' % tag for tag in junk_tags]), '', s)
    # Center paragraphs that hold only an image, keeping just src/width/height.
    # Raw input looks like: <IMG SRC="knots/d.png" NAME="graphics39" ALIGN=BOTTOM WIDTH=166 HEIGHT=166 BORDER=0>
    s = re.sub(r'(?i)<p><img src="([^"]+)" name="[^"]+" (?:align="[^"]+" )?(width="[^"]+") (height="[^"]+")[^>]*></p>', lambda m: ('<div class="center"><p><img src="%s" %s %s alt="Image" /></p></div>' % m.groups()), s) # no hyperlinks to full size img, at least for epub, since it dislikes and doesn't use them
    # Center paragraphs consisting of a single emphasized run that ends in one or more periods.
    s = re.sub(r'(?si)<p>\s*(<em>[^<^\.]+\.+</em>)</p>', r'<div class="center"><p>\1</p></div>', s)
    # Insert a space wherever 'em>' (either <em> or </em>) is directly followed by a word character.
    s = re.sub(r'(?<=em>)(?=\w)', ' ', s)
    s = re.sub(r'(?is)(?<=<li>)\s*<p>|</p>\s*(?=</li>)', '', s) # for some reason OO puts pars in list elements and that looks bad in kindle

    out = open('%s-clean.xhtml' % base, 'w')
    try:
        out.write(s)
    finally:
        out.close()

    kindle = MyZipFile('%s.zip' % base, 'w', zipfile.ZIP_DEFLATED) # Kindle zip

    z = MyZipFile('%s.epub' % base, 'w', zipfile.ZIP_DEFLATED)
    # The mimetype entry is written first and uncompressed.
    z.writestr('mimetype', 'application/epub+zip', zipfile.ZIP_STORED)
    content_dir = 'OEBPS'

    z.writestr('META-INF/container.xml', """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="%s/volume.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>""" % content_dir)

    # --- split the cleaned text into parts ---
    # Parts come out grouped by descriptor, in info['part_descr'] order.
    parts = []
    for descr in info['part_descr']:
        fixed_title = descr.get('title')
        wrapper = descr.get('template', '%s')
        # once=True keeps only the first match; otherwise every match becomes a part.
        limit = 1 if descr.get('once') else None
        for m in islice(re.finditer(descr['pattern'], s), limit):
            # Without a fixed title the pattern's first capture group names the part.
            parts.append(dict(title=(fixed_title or m.group(1)), body=wrapper % m.group()))

    # '%%s' survives the info substitution as '%s', the slot for each part's body.
    # The system identifier is the XHTML 1.1 DTD, matching the public identifier.
    part_template = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
<title>%(title)s</title>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
%%s
</body>
</html>""" % info
    # Generated front and back matter around the parts taken from the document.
    parts.insert(0, dict(title='Cover', body="""<div class="center"><img src="%(front_cover)s" width="704" height="1000" alt="%(title)s" /></div>""" % info, id='cover'))
    parts.insert(1, dict(title='Copyright', body="""<div class="center"><p>Version %(date)s</p><p>Copyright &copy; %(author)s</p><p>All rights reserved</p></div>""" % info))
    parts.append(dict(title='Back Cover', body="""<div class="center"><img src="%(back_cover)s" width="704" height="1000" alt="%(title)s" /></div>""" % info))

    def get_part_id(i):
        # A part's explicit 'id' if it has one, otherwise 'part<index>'.
        return parts[i].get('id') or ('part%d' % i)

    # epub: one XHTML file per part.  Kindle: all parts in one file, separated by page breaks.
    for i, part in enumerate(parts):
        z.writestr(os.path.join(content_dir, '%s.xhtml' % get_part_id(i)), part_template % part['body'])
    kindle.writestr('book.html', part_template % '\r\n<mbp:pagebreak />\r\n'.join(part['body'] for part in parts))
    # playOrder is numbered from 1 (it has to be a positive integer).
    info['toc'] = '\n'.join('<navPoint id="%s" playOrder="%d"><navLabel><text>%s</text></navLabel><content src="%s.xhtml"/></navPoint>' % (get_part_id(i), i + 1, part['title'], get_part_id(i)) for i, part in enumerate(parts))
    info['spine'] = '\n'.join('<itemref idref="%s" />' % get_part_id(i) for i in range(len(parts)))
    info['parts'] = '\n'.join('<item id="%s" href="%s.xhtml" media-type="application/xhtml+xml" />' % ((get_part_id(i),)*2) for i in range(len(parts)))

    # --- images ---
    images = [] # manifest <item> lines for the images
    def resize_and_zip_image_and_add_item(name, img_id, size):
        # Add image file `name` to both archives, resized to `size` (width, height)
        # when PIL is available, and declare it in the manifest under `img_id`.
        # The front cover already has a fixed manifest entry ("my-cover-image") in the
        # OPF template, so it is not declared a second time.
        if name != info['front_cover']:
            images.append('<item id="%s" href="%s" media-type="image/png" />' % (img_id, name))
        try:
            from PIL import Image
        except ImportError:
            print("Warning: Couldn't shrink images because I don't have the PIL Python imaging library")
            z.write(name, os.path.join(content_dir, name), zipfile.ZIP_DEFLATED)
            # The Kindle book needs the (unshrunk) image as well.
            kindle.write(name, name, zipfile.ZIP_DEFLATED)
        else:
            buf = StringIO()
            Image.open(name).convert('RGB').resize(size, Image.ANTIALIAS).save(buf, format='PNG')
            data = buf.getvalue()
            z.writestr(os.path.join(content_dir, name), data)
            kindle.writestr(name, data)
    seen_images = set() # an image used in several places is stored and declared once
    for part in parts:
        for m in re.finditer(r'(?i)<img src="([^"]+)" width="(\d+)" height="(\d+)"', part['body']):
            name = m.group(1).replace('%20', ' ')
            if name in seen_images:
                continue
            seen_images.add(name)
            resize_and_zip_image_and_add_item(name, 'img%d' % len(images), (int(m.group(2)), int(m.group(3))))
    info['images'] = '\n'.join(images)

    # --- package document, stylesheet and table of contents ---
    # Alternative identifier form: <dc:identifier id="BookId" opf:scheme="UUID" xmlns:opf="http://www.idpf.org/2007/opf">urn:uuid:148b5f03-2b4a-45f1-840e-158bf2ba24c0</dc:identifier>
    z.writestr(os.path.join(content_dir, 'volume.opf'), ("""<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" unique-identifier="BookId" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>%(title)s</dc:title>
<dc:creator>%(author)s</dc:creator>
<dc:language>en</dc:language>
<dc:rights>%(rights)s</dc:rights>
<dc:publisher>%(publisher)s</dc:publisher>
<dc:subject>%(subject)s</dc:subject>
<dc:description>%(description)s</dc:description>
<dc:date>%(date)s</dc:date>
<dc:identifier id="BookId">%(ISBN)s</dc:identifier>
<meta name="cover" content="my-cover-image" />
</metadata>
<manifest>
<item href="%(front_cover)s" id="my-cover-image" media-type="image/png" />
<item id="ncx" href="toc.ncx" media-type='application/x-dtbncx+xml' />
<item id="stylesheet" href="style.css" media-type="text/css" />
%(parts)s
%(images)s
</manifest>
<spine toc="ncx">
%(spine)s
</spine>
</package>""" % info).replace('\n', '\r\n')) # creator  <!--  opf:role="aut" -->; date <!-- event="modification" attribute not allowed by ePub yet -->; identifiier  <!--  opf:scheme="ISBN" -->

    css = """ul { list-style-type: square; }\n.center { text-align: center; }\nh1 { padding-bottom: 1em; }"""
    z.writestr(os.path.join(content_dir, 'style.css'), css)
    kindle.writestr('style.css', css)

    z.writestr(os.path.join(content_dir, 'toc.ncx'), """<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%(ISBN)s"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%(title)s</text>
</docTitle>
<docAuthor>
<text>%(author)s</text>
</docAuthor>
<navMap>
%(toc)s
</navMap>
</ncx>""" % info)

    # close() writes each archive's central directory; skipping it can leave
    # the .epub and .zip unreadable.
    z.close()
    kindle.close()

    if 0: # disabled scratch code exercising the Tk clipboard; never runs
        import Tkinter
        root = Tkinter.Tk()
        print(Tkinter.Label(root).selection_get(selection='CLIPBOARD'))
        Tkinter.Label(root).clipboard_clear()
        Tkinter.Label(root).clipboard_append('test')

