Commit a7d4aeca authored by gijs's avatar gijs

Tiny start on epub processing

parent edbd60ea
#! /usr/bin/env python2
# Copyright (C) 2015-2017 Alexandre Leray (Open Source Publishing)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Fetches the images of an HTML document and fixes links to them
#
# Usage:
#
# ./generate.py infile.html -o outfile.html
import urlparse
import os
import requests
import html5lib
import logging
from html5lib.filters import _base
logger = logging.getLogger(__name__)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
class ScriptSrcFilter(_base.Filter):
    """
    html5lib stream filter that reports the ``<script>`` tokens it sees.

    Every token is passed through unchanged; for each token named
    ``script`` the token and its ``src`` attribute (``None`` when the tag
    has no ``src``) are logged.

    source -- token stream to wrap (e.g. a tree walker)
    local_dir -- stored on the instance; not used by this filter yet
    """
    def __init__(self, source, local_dir="."):
        self.source = source
        self.local_dir = local_dir

    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            # Not every token has a "name" key, and not every script tag
            # has a src attribute, so look both up without raising.
            if token.get("name") == "script":
                attributes = token.get("data") or {}
                # Logged rather than printed: stdout may be the HTML output.
                logger.info("%s %s", token, attributes.get((None, "src")))
            # A filter must hand every token on, otherwise the stream that
            # reaches the serializer is empty.
            yield token
class ImgSrcFilter(_base.Filter):
    """
    html5lib stream filter that downloads the images of a document.

    For every ``img`` empty tag the file referenced by ``src`` is fetched
    with `requests` and written to `local_dir`; the ``src`` attribute is
    then rewritten to ``"/" + <local path>``. All other tokens pass through
    unchanged. An image without ``src``, or one that cannot be downloaded,
    is kept in the stream with its original attributes.

    source -- token stream to wrap (e.g. a tree walker)
    local_dir -- directory the images are saved in (created on demand)
    """
    def __init__(self, source, local_dir="."):
        self.source = source
        self.local_dir = local_dir

    def _download(self, src):
        """Save `src` into `self.local_dir`; return the local path or None."""
        # Progress goes through the logger: stdout may be the HTML output.
        logger.info(" downloading %s", src)

        try:
            response = requests.get(src, stream=True)
        except requests.RequestException:
            # Relative or malformed URLs and network failures end up here.
            logger.info("Could not retrieve %s. Skipping.", src)
            return None

        try:
            if not response.ok:
                logger.info("Could not retrieve %s. Skipping.", src)
                return None

            # Use the final URL so redirects give the real file name.
            parts = urlparse.urlparse(response.url)
            fn = os.path.basename(parts.path)
            if not fn:
                # A path ending in "/" has no file name to save under.
                logger.info("No file name in %s. Skipping.", response.url)
                return None
            logger.info(" -> %s", fn)

            if not os.path.exists(self.local_dir):
                os.makedirs(self.local_dir)

            local_path = os.path.join(self.local_dir, fn)
            with open(local_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    handle.write(block)
            return local_path
        finally:
            # Streamed responses hold their connection until closed.
            response.close()

    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            if token["type"] == "EmptyTag" and token['name'] == "img":
                src = token["data"].get((None, 'src'))
                if src:
                    local_path = self._download(src)
                    if local_path is not None:
                        token["data"][(None, 'src')] = "/" + local_path
            # Always yield: a failed download must not remove the tag.
            yield token
def pull_images(infile):
    """
    Download the images of an HTML fragment and point its links at them.

    The content of `infile` is parsed as an HTML fragment and streamed
    through ImgSrcFilter, which saves the images in an "img" directory
    next to `infile`. The filtered stream is serialised again with all
    attribute values quoted and no optional tags omitted.

    infile -- open file object; its `name` decides where "img" is created
    Returns the markup produced by HTMLSerializer.render().
    """
    # Logged rather than printed: stdout is the default output file, so a
    # print here would end up inside the generated HTML.
    logger.info("processing %s", infile.name)

    dom = html5lib.parseFragment(infile.read(), treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")

    local_dir = os.path.join(os.path.dirname(infile.name), "img")
    stream = ImgSrcFilter(walker(dom), local_dir=local_dir)

    s = html5lib.serializer.HTMLSerializer(quote_attr_values=True,
                                           omit_optional_tags=False)
    return s.render(stream)
if __name__ == '__main__':
    # Command line entry point: read HTML from `infile` (or stdin), fetch
    # its images and write the rewritten HTML to --outfile (or stdout).
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional; without it argparse always
    # demands a value and the stdin default can never be used.
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    # const covers a bare "-o": without it the value would be None and the
    # write below would fail.
    parser.add_argument('-o', '--outfile', nargs='?',
                        type=argparse.FileType('w'), default=sys.stdout,
                        const=sys.stdout)
    args = parser.parse_args()

    out = pull_images(args.infile)
    args.outfile.write(out.encode("utf-8"))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment