Commit 10adafcd by murtaugh

scan for ALL tags with src or href in process_item

parent ec7fe5f3
......@@ -190,8 +190,8 @@ class PageWeb (object):
thumbnail = None
if download:
links = set(
-[x.attrib.get("href") for x in et.findall(".//a[@href]")] +
-[x.attrib.get("src") for x in et.findall(".//img[@src]")] +
+[x.attrib.get("href") for x in et.findall(".//*[@href]")] +
+[x.attrib.get("src") for x in et.findall(".//*[@src]")] +
[x.get("href") for x in item.enclosures]
for link in links:
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment