Commit 1a61fa26 authored by Michael Murtaugh's avatar Michael Murtaugh

scraper from diversions 2016

parent 7d06d39e
Pipeline #408 canceled with stages
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
BUFSIZE = 1024 * 1000
def wget (url, tofile):
f = urlopen(url)
count = 0
with open(tofile, "wb") as fout:
while True:
data = f.read(BUFSIZE)
if data == "":
break
count += len(data)
fout.write(data)
return count
def image_path (x):
x = x.replace(" ", "_").lower()
x = x+".jpg"
return x
def log (*msg):
print (*msg, file=sys.stderr)
ap = ArgumentParser("MRAH scaper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="csv", help="output format: json (default), csv")
ap.add_argument("--imagepath", default="images")
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")
args = ap.parse_args()
if not args.skipimages:
try:
os.makedirs(args.imagepath)
except OSError:
pass
sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
b = webdriver.Opera()
elif args.browser == "chrome":
b = webdriver.Chrome()
else:
b = webdriver.Firefox()
if sleeptime == None:
sleeptime = 0.5
# wait = WebDriverWait(b, 20)
# b.implicitly_wait(10) # seconds
b.get(args.starturl)
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()
# # switch to detail view...
# b.find_element_by_css_selector(".contentViews .arrowDownButton").click()
# sleep(0.25)
# b.find_element_by_id("viewTypes-detailView").click()
# wait.until(EC.visibility_of_element_located((By.ID, 'collectionDetailItem')))
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()
if args.format == "csv":
from csv import DictWriter
fieldnames = props[:]
fieldnames.append("url")
if not args.skipimages:
fieldnames.extend(("imageurl", "image"))
csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
csvout.writeheader()
count = 0
while True:
b.implicitly_wait(0)
count += 1
item = {}
for p in props:
try:
li = b.find_element_by_css_selector("li."+p)
name = li.find_element_by_css_selector(".tspPrefix")
span = li.find_element_by_css_selector(".tspValue")
# item[name.text] = span.text
item[p] = span.text
except NoSuchElementException as e:
pass
# permalink / bookmark
tries = 0
while tries < 5:
try:
permalink = b.find_element_by_css_selector("li.bookmark")\
.find_element_by_css_selector("input")\
.get_attribute("value")
item['url'] = permalink
break
except NoSuchElementException:
tries += 1
sleep(0.1)
# print ("PERMALINK: {0}".format(permalink))
if not args.skipimages:
imglink = b.find_element_by_css_selector("dt.detailImg a")
imglink.click()
img_src = None
tries = 0
while tries < 5:
try:
b.switch_to_window('HighResImage')
img = b.find_element_by_css_selector("img")
img_src = img.get_attribute("src")
item['imageurl'] = img_src
b.close()
b.switch_to_window("")
break
except NoSuchWindowException as e:
log("NoSuchWindowException", tries)
sleep(1)
tries += 1
if img_src:
# log("IMAGE: {0}".format(img_src))
ifilename = image_path(item['inventoryNb'])
ipath = os.path.join(args.imagepath, ifilename)
if wget(img_src, ipath):
item['image'] = ifilename
if args.format == "json":
print (json.dumps(item))
elif args.format == "csv":
csvout.writerow({k:v.encode('utf8') for k,v in item.items()})
log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))
next = None
try:
next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
except NoSuchElementException as e:
pass
if next == None:
log("END OF LIST")
break
b.implicitly_wait(10)
next.click()
if sleeptime:
sleep(sleeptime)
log("output {0} items".format(count))
b.close()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment