Commit eba57eba by murtaugh

Process item enclosures as well as links, and deal with PDF

parent fffcf9e5
......@@ -25,7 +25,8 @@ EXT["image/png"] = "png"
EXT["image/gif"] = "gif"
EXT["image/jpeg"] = "jpg"
EXT["image/jpg"] = "jpg"
IMAGE_EXT = set(("png", "jpg", "gif"))
EXT["application/pdf"] = "pdf"
IMAGE_EXT = set(("png", "jpg", "gif", "pdf"))
DL_VIMEO_FORMAT = "http-360p"
DATETIME_STRF = "%Y-%m-%d %H:%M:%S"
......@@ -182,7 +183,9 @@ class PageWeb (object):
if download:
links = set(
[x.attrib.get("href") for x in et.findall(".//a[@href]")] +
[x.attrib.get("src") for x in et.findall(".//img[@src]")] )
[x.attrib.get("src") for x in et.findall(".//img[@src]")] +
[x.get("href") for x in item.enclosures]
)
for link in links:
filename = download_url_to_file(link, cachedir)
if filename != None:
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment