Commit 1f35e6e1 by murtaugh

urljoin linked URLs with item.link

parent cbabd9d6
......@@ -3,7 +3,7 @@ from xml.etree import ElementTree as ET
import feedparser, html5lib
import sys, subprocess, re, datetime
import os
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from urllib.request import urlopen
# import hashlib
from mwclient import Site
......@@ -192,8 +192,8 @@ class PageWeb (object):
thumbnail = None
if download:
links = set(
[x.attrib.get("href") for x in et.findall(".//*[@href]")] +
[x.attrib.get("src") for x in et.findall(".//*[@src]")] +
[urljoin(item.link, x.attrib.get("href")) for x in et.findall(".//*[@href]")] +
[urljoin(item.link, x.attrib.get("src")) for x in et.findall(".//*[@src]")] +
[x.get("href") for x in item.enclosures]
)
print ("[process_feed_item]: processing {0} links/enclosures".format(len(links)), file=sys.stderr)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment