...
 
Commits (2)
Showing with 31 additions and 22 deletions
@@ -97,8 +97,8 @@ def download_url_to_file (href, cachedir):
 # CLASSES
 ############################
-class SitePage (object):
-    """ Wrapper for a wiki page that represents a "Site" record -- aka a URL/RSS Item """
+class PageWeb (object):
+    """ Wrapper for a wiki page that represents a "PageWeb" record -- aka a URL/RSS Item """
     @staticmethod
     def pagename_for_url (url):
         """ Example: """
@@ -116,16 +116,16 @@ class SitePage (object):
         elif 'published' in i and 'published_parsed' in i:
             return datetime.datetime.fromtimestamp(mktime(i['published_parsed']))
-    def __init__ (self, wiki, url):
+    def __init__ (self, wiki, url, title):
         self.wiki = wiki
         self.url = url
-        self.title = None
+        self.title = title
         self.updated = None
         self.init()
     def init (self):
         result = self.wiki.get("cargoquery",
-            tables="Site",
+            tables="Page_web",
             fields="_pageID=pageID,_pageTitle=pageTitle,_pageName=pageName,_pageNamespace=pageNamespace,URL,Title,Updated",
             where="URL=\"{0}\"".format(self.url),
             limit=1)
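
The next hunk reads `result['Title']` and `result['Updated']`, i.e. a flat row, while the Cargo API wraps each returned row under a `title` key, so the reply has to be unwrapped somewhere in the lines collapsed between the two hunks. A minimal sketch of that step, assuming the usual `cargoquery` reply shape; the helper name is illustrative, not from the script:

# Sketch only: unwrap a cargoquery API reply into a single flat row (or None).
# Assumes the reply looks like {"cargoquery": [{"title": {"URL": ..., "Title": ..., "Updated": ...}}, ...]}.
def first_cargo_row(api_result):
    rows = api_result.get("cargoquery", [])
    return rows[0]["title"] if rows else None

Applied to the reply of `self.wiki.get("cargoquery", ...)` above, this would produce the `result` dict that the following hunk consults.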
@@ -136,12 +136,21 @@ class SitePage (object):
             self.title = result['Title']
             self.updated = result['Updated']
         else:
-            # create new wiki page object
-            pagename = ensure_unique_wiki_pagename(self.wiki, "Site:"+self.pagename_for_url(self.url))
+            # create a new wiki page object for the URL
+            # nb: ensure_unique_wiki_pagename is called here because reaching this branch
+            # means that no "Page web" already exists for this URL;
+            # we ensure the pagename is unique so that, for recurring titles such as
+            # "Untitled", the new page becomes "Untitled (2)" instead of colliding.
+            # Ideally this would be a transaction in which the wiki page is immediately
+            # created and the related URL committed ... but it isn't (the assumption is that
+            # the subsequent code takes care of that and that no overlapping updates occur)
+            #
+            pagename = ensure_unique_wiki_pagename(self.wiki, "Page web:"+self.title)
             self.page = self.wiki.pages.get(pagename)
         return self
-    def process_feed_item (self, item, source, cachedir, download=True, phpuploader=None, user=None, pretend=False):
+    def process_feed_item (self, item, site, cachedir, download=True, phpuploader=None, user=None, pretend=False):
         # print ("-"*10, file=sys.stderr)
         # sitepage = SitePage(wiki, item.link)
         # page, data = get_site_page_and_data(wiki, item.link)
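
`ensure_unique_wiki_pagename` itself is defined elsewhere in the file and does not appear in this diff; a minimal sketch of the behaviour the new comment describes, assuming it simply probes the wiki for a free title by appending a counter (the real helper may differ):

# Sketch only: probe the wiki until the pagename is unused, so a second
# "Page web:Untitled" becomes "Page web:Untitled (2)", then "(3)", and so on.
def ensure_unique_wiki_pagename(wiki, pagename):
    candidate, n = pagename, 1
    while wiki.pages[candidate].exists:   # mwclient Page.exists
        n += 1
        candidate = u"{0} ({1})".format(pagename, n)
    return candidate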
@@ -188,20 +197,20 @@ class SitePage (object):
             thumbnail = wikifile.page_title
         if not thumbnail:
             thumbnail = ""
-        page_contents = u"""{{{{Site
+        page_contents = u"""{{{{Page web
 |URL={url}
 |Title={title}
 |Updated={updated}
-|Source={source}
+|Site={site}
 |Thumbnail={thumbnail}
 }}}}
-[[Catégorie:Site]]
+[[Catégorie:Page web]]
 """.format(
             url=item.link,
             title=item.title,
             updated=ts.strftime("%Y/%m/%d %H:%M:%S"),
-            source=source,
+            site=site,
             thumbnail=thumbnail)
         if gallery:
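
One detail of the template literal above: `{{{{` and `}}}}` are `str.format` escapes for literal braces, so the saved wikitext opens a normal `{{Page web ...}}` template call. With purely hypothetical values, the result would look like the comment in this small check:

# Sketch only, with made-up values: what page_contents ends up containing.
#
#   {{Page web
#   |URL=https://example.org/some-post
#   |Title=Example post
#   |Updated=2024/01/01 12:00:00
#   |Site=Example feed page
#   |Thumbnail=Example.png
#   }}
#   [[Catégorie:Page web]]
demo = u"{{{{Page web\n|URL={url}\n}}}}".format(url="https://example.org/some-post")
assert demo.startswith(u"{{Page web")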
@@ -216,7 +225,7 @@ class SitePage (object):
         if not pretend:
             self.page.save(page_contents)
-class SiteWebPage (object):
+class SiteWeb (object):
     @staticmethod
     def parse_dt (val):
         try:
@@ -288,7 +297,7 @@ class SiteWebPage (object):
     def process_feed(self, cachedir, limit=None, phpuploader=None, user=None, force=False, pretend=False):
         feed = feedparser.parse(self.feed_url)
         # ensure in reverse chronological order (probably redundant, but important for the processing)
-        feed.entries.sort(key=lambda x: SitePage.get_item_date(x), reverse=True)
+        feed.entries.sort(key=lambda x: PageWeb.get_item_date(x), reverse=True)
         # print (feed.entries[0].published_parsed, "to", feed.entries[-1].published_parsed)
         count = 0
         # Process in CHRONOLOGICAL order... skipping elements that are older than or equal to the feed's last updated timestamp
@@ -300,7 +309,7 @@ class SiteWebPage (object):
         use_all_entries = []
         # print ("processing feed, last_updated {0}".format(self.last_updated.strftime(DATETIME_STRF)), file=sys.stderr)
         for item in all_entries:
-            item_dt = SitePage.get_item_date(item)
+            item_dt = PageWeb.get_item_date(item)
             if item_dt <= self.last_updated:
                 # print ("Skipping older item {0}".format(item.title), file=sys.stderr)
                 skipped += 1
@@ -313,9 +322,9 @@ class SiteWebPage (object):
             print ("No new items since feed last updated", file=sys.stderr)
         for item in all_entries:
-            item_dt = SitePage.get_item_date(item)
-            sitepage = SitePage(self.wiki, item.link)
-            sitepage.process_feed_item(item, source=self.pagename, cachedir=cachedir, phpuploader=phpuploader, user=user, pretend=pretend)
+            item_dt = PageWeb.get_item_date(item)
+            pageweb = PageWeb(self.wiki, url=item.link, title=item.title)
+            pageweb.process_feed_item(item, site=self.pagename, cachedir=cachedir, phpuploader=phpuploader, user=user, pretend=pretend)
             if self.last_updated == None or item_dt > self.last_updated:
                 self.last_updated = item_dt
             if not pretend:
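
The loop above is a high-water-mark pattern: items at or below `self.last_updated` were filtered out earlier, and the mark is advanced as newer items are processed, so a re-run only touches new feed entries. The same pattern in isolation; the function and the `handle` callback are illustrative, not from the script:

# Sketch only: process entries newer than the stored mark and return the new mark.
def process_new_entries(entries, last_updated, get_date, handle):
    fresh = [e for e in entries if last_updated is None or get_date(e) > last_updated]
    mark = last_updated
    for entry in fresh:
        handle(entry)                      # e.g. create/update the "Page web" page
        dt = get_date(entry)
        if mark is None or dt > mark:
            mark = dt
    return mark                            # persist this as the feed's last updated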
@@ -348,19 +357,19 @@ if __name__ == "__main__":
     args= ap.parse_args()
     # wiki
-    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
     if args.user:
-        site.login(args.user, args.password)
+        wiki.login(args.user, args.password)
     # page = upload_file_to_wiki(site, args.source)
     # if page:
     # print (page.name)
     if args.source:
-        flux = SiteWebPage(site, args.source)
+        flux = SiteWeb(wiki, args.source)
         flux.process_feed(args.cachedir, limit=args.limit, phpuploader=args.phpimportimages, user=args.user, force=args.force, pretend=args.pretend)
     else:
         print ("Processing all feeds", file=sys.stderr)
-        for i, f in enumerate(SiteWebPage.get_all(site)):
+        for i, f in enumerate(SiteWeb.get_all(wiki)):
            print ("[{0}] {1}".format(i, f.pagename), file=sys.stderr)
            print ("feed_url: {0}".format(f.feed_url), file=sys.stderr)
            if f.last_updated:
......