Commit 4ed623a2 by murtaugh

catch and report HTTP/URL errors when processing feeds; warn and skip the feed

parent 2373f68c
Showing with 42 additions and 37 deletions
@@ -11,7 +11,7 @@ from time import mktime
 from mediawikiutils import ensure_unique_wiki_pagename, upload_file_to_wiki
 import json
 from html import unescape as html_unescape
-from urllib.error import HTTPError
+from urllib.error import HTTPError, URLError
 """
     _ __ _ ___ _____
@@ -323,44 +323,49 @@ class SiteWeb (object):
     def process_feed(self, cachedir, limit=None, phpuploader=None, user=None, force=False, pretend=False):
         print ("[process_feed]: feed_url {0}".format(self.feed_url), file=sys.stderr)
-        # adding urlopen as I otherwise encountered malformed XML issues (feedparser fails silently)
-        feed = feedparser.parse(urlopen(self.feed_url))
-        # ensure in reverse chronological order (probably redundant but important for the processing)
-        print ("[process_feed]: {0} items".format(len(feed.entries)), file=sys.stderr)
-        feed.entries.sort(key=lambda x: PageWeb.get_item_date(x), reverse=True)
-        # print (feed.entries[0].published_parsed, "to", feed.entries[-1].published_parsed)
-        count = 0
-        # Process in CHRONOLOGICAL order... skipping elements that are OLDER than / equal to the feed's last updated timestamp
-        all_entries = reversed(feed.entries)
-        skipped = 0
-        # Filter list when last updated is present (and not using force)
-        if not force and self.last_updated:
-            use_all_entries = []
-            # print ("processing feed, last_updated {0}".format(self.last_updated.strftime(DATETIME_STRF)), file=sys.stderr)
+        try:
+            # adding urlopen as I otherwise encountered malformed XML issues (feedparser fails silently)
+            feed = feedparser.parse(urlopen(self.feed_url))
+            # ensure in reverse chronological order (probably redundant but important for the processing)
+            print ("[process_feed]: {0} items".format(len(feed.entries)), file=sys.stderr)
+            feed.entries.sort(key=lambda x: PageWeb.get_item_date(x), reverse=True)
+            # print (feed.entries[0].published_parsed, "to", feed.entries[-1].published_parsed)
+            count = 0
+            # Process in CHRONOLOGICAL order... skipping elements that are OLDER than / equal to the feed's last updated timestamp
+            all_entries = reversed(feed.entries)
+            skipped = 0
+            # Filter list when last updated is present (and not using force)
+            if not force and self.last_updated:
+                use_all_entries = []
+                # print ("processing feed, last_updated {0}".format(self.last_updated.strftime(DATETIME_STRF)), file=sys.stderr)
-            for item in all_entries:
-                item_dt = PageWeb.get_item_date(item)
-                if item_dt <= self.last_updated:
-                    # print ("Skipping older item {0}".format(item.title), file=sys.stderr)
-                    skipped += 1
-                else:
-                    use_all_entries.append(item)
-            all_entries = use_all_entries
-        # else:
-        #     print ("processing feed, last_updated is not set", file=sys.stderr)
-        if skipped > 0 and len(all_entries) == 0:
-            print ("No new items since feed last updated", file=sys.stderr)
+                for item in all_entries:
+                    item_dt = PageWeb.get_item_date(item)
+                    if item_dt <= self.last_updated:
+                        # print ("Skipping older item {0}".format(item.title), file=sys.stderr)
+                        skipped += 1
+                    else:
+                        use_all_entries.append(item)
+                all_entries = use_all_entries
+            # else:
+            #     print ("processing feed, last_updated is not set", file=sys.stderr)
+            if skipped > 0 and len(all_entries) == 0:
+                print ("No new items since feed last updated", file=sys.stderr)
-        for item in all_entries:
-            item_dt = PageWeb.get_item_date(item)
-            pageweb = PageWeb(self.wiki, url=item.link, title=item.title)
-            pageweb.process_feed_item(item, site=self.pagename, cachedir=cachedir, phpuploader=phpuploader, user=user, pretend=pretend)
-            if self.last_updated == None or item_dt > self.last_updated:
-                self.last_updated = item_dt
-                if not pretend:
-                    self.save()
-            count += 1
-            if limit and count>=limit:
-                break
+            for item in all_entries:
+                item_dt = PageWeb.get_item_date(item)
+                pageweb = PageWeb(self.wiki, url=item.link, title=item.title)
+                pageweb.process_feed_item(item, site=self.pagename, cachedir=cachedir, phpuploader=phpuploader, user=user, pretend=pretend)
+                if self.last_updated == None or item_dt > self.last_updated:
+                    self.last_updated = item_dt
+                    if not pretend:
+                        self.save()
+                count += 1
+                if limit and count>=limit:
+                    break
+        except HTTPError as e:  # HTTPError is a subclass of URLError, so it must be caught first
+            print ("WARNING [process_feed] Skipping feed {0}, HTTP Error {1}".format(self.feed_url, e.code), file=sys.stderr)
+        except URLError as e:
+            print ("WARNING [process_feed] Skipping feed {0}, URL Error: {1}".format(self.feed_url, e.reason), file=sys.stderr)
 if __name__ == "__main__":
     from argparse import ArgumentParser
......
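A minimal standalone sketch of the pattern this commit introduces, pulled out of SiteWeb.process_feed for illustration; the fetch_feed helper and the example URL are hypothetical, not part of the repository:

import sys
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import feedparser

def fetch_feed(feed_url):
    # hypothetical helper: fetch and parse a feed, warning and returning None on network errors
    try:
        # urlopen is used because feedparser fails silently on malformed XML
        return feedparser.parse(urlopen(feed_url))
    except HTTPError as e:
        # HTTPError subclasses URLError, so the more specific handler comes first
        print("WARNING [fetch_feed] Skipping feed {0}, HTTP Error {1}".format(feed_url, e.code), file=sys.stderr)
    except URLError as e:
        print("WARNING [fetch_feed] Skipping feed {0}, URL Error: {1}".format(feed_url, e.reason), file=sys.stderr)
    return None

feed = fetch_feed("https://example.com/feed.xml")  # hypothetical URL
if feed is not None:
    print("{0} items".format(len(feed.entries)))

Because HTTPError is a subclass of URLError, an except URLError clause placed first would swallow HTTP errors and the HTTPError branch would never run; the handlers therefore go from most to least specific.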