Commit 6a26e831 by murtaugh

catch (and report) HTTP errors in wget

parent 1f35e6e1
@@ -11,7 +11,7 @@ from time import mktime
 from mediawikiutils import ensure_unique_wiki_pagename, upload_file_to_wiki
 import json
 from html import unescape as html_unescape
+from urllib.error import HTTPError
 """
  _ __ _ ___ _____
@@ -77,24 +77,28 @@ def ensure_extension (filename, ext):
 def wget (url, cachedir, blocksize=4*1000):
     # if type(url) == unicode:
     # url = url.encode("utf-8")
-    fin = urlopen(url)
-    ct = fin.info().get("content-type")
-    if ct in EXT:
-        filename = ensure_extension(os.path.basename(urlparse(url).path), EXT[ct])
-        count = 0
-        # path = os.path.join(cachedir, "tmp."+EXT[ct])
-        path = os.path.join(cachedir, filename)
-        with open(path, "wb") as fout:
-            while True:
-                data = fin.read(blocksize)
-                if not data:
-                    break
-                fout.write(data)
-                count += len(data)
-        if count > 0:
-            return path
-    print ("[wget] skipping {0} content-type {1}".format(url, ct), file=sys.stderr)
-    return None
+    try:
+        fin = urlopen(url)
+        ct = fin.info().get("content-type")
+        if ct in EXT:
+            filename = ensure_extension(os.path.basename(urlparse(url).path), EXT[ct])
+            count = 0
+            # path = os.path.join(cachedir, "tmp."+EXT[ct])
+            path = os.path.join(cachedir, filename)
+            with open(path, "wb") as fout:
+                while True:
+                    data = fin.read(blocksize)
+                    if not data:
+                        break
+                    fout.write(data)
+                    count += len(data)
+            if count > 0:
+                return path
+        print ("[wget] skipping {0} content-type {1}".format(url, ct), file=sys.stderr)
+        return None
+    except HTTPError as e:
+        print ("[wget]: skipping {0} HTTP ERROR {1}".format(url, e.code), file=sys.stderr)
+        return None

 def download_url_to_file (href, cachedir):
     if "mixcloud.com/" in href:
......
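For readers skimming the diff, a minimal standalone sketch of the pattern this commit introduces (the fetch helper below is hypothetical, not from the repository): wrap urlopen in try/except HTTPError, report the status code to stderr, and return None so callers can skip the failed URL instead of crashing.

# Hypothetical, self-contained sketch of the same error-handling pattern;
# not part of the committed code.
import sys
from urllib.request import urlopen
from urllib.error import HTTPError

def fetch(url):
    try:
        # urlopen raises HTTPError for 4xx/5xx responses
        return urlopen(url).read()
    except HTTPError as e:
        # report the failure and signal it with None, mirroring wget() above
        print("[fetch] skipping {0} HTTP ERROR {1}".format(url, e.code), file=sys.stderr)
        return None

A None return is then handled the same way as wget's existing unknown-content-type path: log the skip and move on to the next URL.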