Commit 180c60b2 by murtaugh

feedr scrape ++ --site

parent 19e8df4a
@@ -214,10 +214,10 @@ class PageWeb (object):
         if not pretend:
             self.page.save(page_contents)

-    def scrape (self, phpimportimages=None):
+    def scrape (self, title_selector=None, remove_title=False, content_selector=None, remove_selector=None, phpimportimages=None):
         """ Attempt to scrape contents based on HTML itself (as opposed to a feed item) """
         # get site object to determine selectors to use (if any)
-        scrapetitle, newsrc = scrape_url_to_mediawiki(self.url)
+        scrapetitle, newsrc = scrape_url_to_mediawiki(self.url, title_selector=title_selector, remove_title=remove_title, content_selector=content_selector, remove_selector=remove_selector)
         # print ("TITLE", scrapetitle, file=sys.stderr)
         # print ("SRC", newsrc, file=sys.stderr)
         newsrc = newsrc.strip()
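For reference, a minimal sketch of how selector parameters like these are usually applied with BeautifulSoup. This is not the actual scrape_url_to_mediawiki implementation (that function lies outside this diff), and apply_selectors is a hypothetical helper name:

import requests
from bs4 import BeautifulSoup

def apply_selectors(url, title_selector=None, remove_title=False,
                    content_selector=None, remove_selector=None):
    """Hypothetical helper: fetch a page and narrow it down with CSS selectors."""
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    # Title: text of the title_selector match, falling back to the <title> tag.
    title_el = soup.select_one(title_selector) if title_selector else soup.title
    title = title_el.get_text(strip=True) if title_el else None
    if remove_title and title_selector and title_el:
        title_el.decompose()  # drop the in-page title so it is not duplicated in the content
    # Content: narrow to the content_selector match, falling back to the whole body.
    content = (soup.select_one(content_selector) if content_selector else None) or soup.body or soup
    if remove_selector:
        for el in content.select(remove_selector):
            el.decompose()  # strip unwanted elements (navigation, share widgets, ...)
    return title, str(content)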
@@ -416,13 +416,28 @@ def process_feeds(wikiprotocol, wikihost, wikipath, user, password, source, limi
         if feedlimit and i+1>=feedlimit:
             break

-def cli_scrape (wikiprotocol, wikihost, wikipath, user, password, page, title_selector, remove_title, content_selector, remove_selector, phpimportimages):
+def cli_scrape (wikiprotocol, wikihost, wikipath, user, password, page, site, title_selector, remove_title, content_selector, remove_selector, phpimportimages):
     wiki = Site((wikiprotocol, wikihost), path=wikipath)
     if user:
         wiki.login(user, password)
-    page = PageWeb.from_pagename(wiki, page)
-    page.scrape(phpimportimages=phpimportimages)
+    if page:
+        page = PageWeb.from_pagename(wiki, page)
+        page.scrape(title_selector=title_selector, remove_title=remove_title, content_selector=content_selector, remove_selector=remove_selector, phpimportimages=phpimportimages)
+    elif site:
+        # site = wiki.pages(site)
+        result = wiki.get("cargoquery",
+            tables="Page_web",
+            fields="_pageID=pageID,_pageTitle=pageTitle,_pageName=pageName,_pageNamespace=pageNamespace,URL,Title,Updated,Site",
+            where="site=\"{0}\"".format(site), limit=100)
+        result = result['cargoquery']
+        if len(result) > 0:
+            for row in result:
+                item = row['title']
+                print (item['pageTitle'], file=sys.stderr)
+                page = PageWeb(wiki, item['pageName'], item['URL'], item['Title'], item['Updated'], item['Site'])
+                page.scrape(title_selector=title_selector, remove_title=remove_title, content_selector=content_selector, remove_selector=remove_selector, phpimportimages=phpimportimages)

 if __name__ == "__main__":
     from argparse import ArgumentParser
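The new --site branch drives the Cargo extension's cargoquery API action through the wiki.get(...) call above (mwclient-style raw API access). As an illustration of the response shape the loop expects, the same query can be issued directly against api.php; the endpoint and field values below are placeholders:

import requests

# Placeholder endpoint; mirrors the wiki.get("cargoquery", ...) call above.
resp = requests.get("https://wiki.example.org/api.php", params={
    "action": "cargoquery",
    "format": "json",
    "tables": "Page_web",
    "fields": "_pageID=pageID,_pageTitle=pageTitle,_pageName=pageName,_pageNamespace=pageNamespace,URL,Title,Updated,Site",
    "where": 'site="example.org"',
    "limit": 100,
}).json()

# Cargo wraps each row in a "title" object, which is why the loop reads row['title']:
# {"cargoquery": [{"title": {"pageName": "...", "URL": "...", "Title": "...",
#                            "Updated": "...", "Site": "example.org", ...}}]}
for row in resp.get("cargoquery", []):
    item = row["title"]
    print(item["pageName"], item["URL"])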
@@ -455,7 +470,8 @@ if __name__ == "__main__":
     # scrape
     apscrape = sub.add_parser('scrape', help="scrape a single page")
-    apscrape.add_argument("page")
+    apscrape.add_argument("--page", default=None)
+    apscrape.add_argument("--site", default=None)
     # apscrape.add_argument("--cachedir", default="cache", help="directory where files are temporarily downloaded")
     apscrape.add_argument("--phpimportimages", default=None, help="optional: path to mw/maintenance/importImages.php. Default is to upload via the API")
     apscrape.add_argument('--title-selector', default=None, help="CSS selector to extract title content")
......
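To see the effect of turning the positional page argument into --page and adding --site, here is a short usage sketch. The stand-in parser and the prog name feedr are assumptions, reproducing only the options visible in this excerpt:

from argparse import ArgumentParser

# Stand-in parser with only the scrape options shown in this diff;
# the real script also defines the wiki connection arguments not shown here.
parser = ArgumentParser(prog="feedr")
sub = parser.add_subparsers(dest="command")
apscrape = sub.add_parser("scrape", help="scrape a single page")
apscrape.add_argument("--page", default=None)
apscrape.add_argument("--site", default=None)
apscrape.add_argument("--title-selector", default=None, help="CSS selector to extract title content")

# Scrape a single page by wiki page name ...
args = parser.parse_args(["scrape", "--page", "Some page"])
# ... or every Page_web entry whose Site field matches, via the cargo query above.
args = parser.parse_args(["scrape", "--site", "example.org"])
print(args.page, args.site)

Since neither option is required, cli_scrape now silently does nothing when both --page and --site are omitted; an argparse mutually exclusive, required group would be one way to make that explicit.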