...
 
......@@ -176,6 +176,14 @@ function mouseover_href (href, src) {
}
function show_external (href) {
console.log("show_external", href);
$.fancybox.open({src: href, type: 'iframe'});
// document.getElementById("external")
// .contentWindow
// .postMessage({msg: "showurl", href: href, type:'iframe'}, "*");
// document.getElementById("externalframe").classList.add("visible");
return;
document.getElementById("external").src = href;
var xu = document.getElementById("externalurl");
// defer this to onload
......
......@@ -4,6 +4,9 @@
<title>école de recherche graphique</title>
<link rel="stylesheet" type="text/css" href="map.css">
<meta charset="utf-8">
<script src="/lib/jquery-3.2.1.min.js"></script>
<link rel="stylesheet" href="/lib/fancybox/dist/jquery.fancybox.altered.css">
<script src="/lib/fancybox/dist/jquery.fancybox.min.js"></script>
</head>
<body>
......@@ -17,7 +20,7 @@
</div>
</div>
<div class="topright"><div class="close"><a id="externalclosebutton" class="button" href="#">&#10006;</a></div></div>
<iframe src="" name="external" id="external"></iframe>
<iframe src="external.html" name="external" id="external"></iframe>
</div>
</div>
......
......@@ -559,32 +559,6 @@ function list_remove_item (list, item) {
}
}
// function init_categories() {
// // show categories
// allcats.sort(function (a, b) { return a.name > b.name });
// // console.log("allcats", allcats.map(function (x) { return x.name}));
// d3.select("#cats").select(".body").selectAll("a.cat").data(allcats).enter()
// .append("a").attr("class", "cat").text(function (d, i) { return d.name+" "+(i+1<allcats.length ? "| ":"") })
// .attr("href", function (d) { return d.id })
// .attr("target", "over")
// .on("click", function (d) {
// var $this = d3.select(this);
// // d3.event.preventDefault();
// // console.log("** category click", d);
// // TOGGLE HIGHLIGHT
// if (d.highlight) {
// list_remove_item(highlighted_categories, d);
// d.highlight = false;
// } else {
// highlighted_categories.push(d);
// d.highlight = true;
// }
// $this
// .classed("highlight", function (d) { return d.highlight });
// rebuild_links();
// });
// }
function init_svg () {
link = svgg.append("g")
.attr("class", "links")
......@@ -744,30 +718,6 @@ function ticked() {
// console.log("ticked", node.size());
}
// function movementHook () {
// node.each(function (d) {
// if (d.linked || d.fx) return;
// // bounce
// if (d.y > BOTTOM + MARGIN) {
// d.direction = Math.PI*0.5;
// } else if (d.y > BOTTOM) {
// d.direction = around(Math.PI * 0.5);
// } else if (d.x < LEFT) {
// d.direction = around(0.0);
// } else if (d.x > RIGHT) {
// d.x = RIGHT;
// d.direction
// d.direction = around(Math.PI);
// } else if (d.y < TOP) {
// d.direction = around(Math.PI * 1.5);
// }
// d.x += Math.cos(d.direction) * SPEED;
// d.y -= Math.sin(d.direction) * SPEED;
// })
// }
function rebuild_activenodepages () {
activenodepages = [];
for (var i=0, l=allnodepages.length; i<l; i++) {
......@@ -1066,7 +1016,6 @@ d3.json("../../data/join.json", function(error, graph) {
if (error) throw error;
// console.log("loaded pages", graph);
index_graph(graph['@graph']);
// // // OLD!! init_categories();
init_movement();
init_svg();
if (show_all_nodes) {
......
......@@ -172,32 +172,6 @@ function list_remove_item (list, item) {
}
}
// function init_categories() {
// // show categories
// allcats.sort(function (a, b) { return a.name > b.name });
// // console.log("allcats", allcats.map(function (x) { return x.name}));
// d3.select("#cats").select(".body").selectAll("a.cat").data(allcats).enter()
// .append("a").attr("class", "cat").text(function (d, i) { return d.name+" "+(i+1<allcats.length ? "| ":"") })
// .attr("href", function (d) { return d.id })
// .attr("target", "over")
// .on("click", function (d) {
// var $this = d3.select(this);
// // d3.event.preventDefault();
// // console.log("** category click", d);
// // TOGGLE HIGHLIGHT
// if (d.highlight) {
// list_remove_item(highlighted_categories, d);
// d.highlight = false;
// } else {
// highlighted_categories.push(d);
// d.highlight = true;
// }
// $this
// .classed("highlight", function (d) { return d.highlight });
// rebuild_links();
// });
// }
function init_svg () {
link = svgg.append("g")
.attr("class", "links")
......@@ -357,30 +331,6 @@ function ticked() {
// console.log("ticked", node.size());
}
// function movementHook () {
// node.each(function (d) {
// if (d.linked || d.fx) return;
// // bounce
// if (d.y > BOTTOM + MARGIN) {
// d.direction = Math.PI*0.5;
// } else if (d.y > BOTTOM) {
// d.direction = around(Math.PI * 0.5);
// } else if (d.x < LEFT) {
// d.direction = around(0.0);
// } else if (d.x > RIGHT) {
// d.x = RIGHT;
// d.direction
// d.direction = around(Math.PI);
// } else if (d.y < TOP) {
// d.direction = around(Math.PI * 1.5);
// }
// d.x += Math.cos(d.direction) * SPEED;
// d.y -= Math.sin(d.direction) * SPEED;
// })
// }
function rebuild_activenodepages () {
activenodepages = [];
for (var i=0, l=allnodepages.length; i<l; i++) {
......@@ -679,7 +629,6 @@ d3.json("../../data/join.json", function(error, graph) {
if (error) throw error;
// console.log("loaded pages", graph);
index_graph(graph['@graph']);
// // // OLD!! init_categories();
init_movement();
init_svg();
if (show_all_nodes) {
......
#!/usr/bin/env python3
# amateurwebscraper.py
import sys
import re
import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin
# from xml.etree import ElementTree as ET
import subprocess
from cssselect import GenericTranslator, SelectorError
# nb using lxml for xpath
from etreeutils import replace_elt, textContent
import lxml.etree as ET
# # CRAZY IMPORT FROM http://lxml.de/tutorial.html
# try:
# from lxml import etree
# # print("running with lxml.etree")
# except ImportError:
# try:
# # Python 2.5
# import xml.etree.cElementTree as etree
# # print("running with cElementTree on Python 2.5+")
# except ImportError:
# try:
# # Python 2.5
# import xml.etree.ElementTree as etree
# # print("running with ElementTree on Python 2.5+")
# except ImportError:
# try:
# # normal cElementTree install
# import cElementTree as etree
# # print("running with cElementTree")
# except ImportError:
# try:
# # normal ElementTree install
# import elementtree.ElementTree as etree
# # print("running with ElementTree")
# except ImportError:
# print("Failed to import ElementTree from any known place")
"""
__ __
......@@ -32,6 +68,108 @@ def scrape (url):
[urljoin(url, x.attrib.get("src")) for x in t.findall(".//*[@src]")])
return links
# SCRAPE PHASE 2
def css_to_xpath (s):
return GenericTranslator().css_to_xpath(s)
def extract_content (t, title_selector, content_selector, remove_title):
    """
    Extract the title and main content of a parsed document (html5lib/lxml tree),
    using optional CSS selectors for title and content.
    Returns (title, content), where content is a complete HTML document (str).
    """
# print ("[extract_content]", url, file=sys.stderr)
title = ""
# Try title_selector
if title_selector:
for item in t.xpath(css_to_xpath(title_selector)):
title = textContent(item)
if remove_title:
item.getparent().remove(item)
# otherwise the contents of the title tag
if not title:
title = t.find(".//title")
if title != None:
title = title.text
content = ""
found_content = False
if content_selector:
content = """<!DOCTYPE html>
<html>
<head>
<title>{0}</title>
<meta charset="utf-8">
</head>
<body>
""".format(title)
for x in t.xpath(css_to_xpath(content_selector)):
found_content = True
content += ET.tostring(x, encoding="unicode", method="html")
content += """
</body>
</html>"""
# otherwise use the whole document
if not found_content:
content = ET.tostring(t, encoding="unicode", method="html")
return title, content
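# Usage sketch (the selectors here are hypothetical, site-specific values):
#   title, html = extract_content(t, "h1.entry-title", "div.entry-content", remove_title=True)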
def iframe_to_links (t):
""" pandoc doesn't render iframes, so let's transform them into links """
for iframe in t.findall(".//iframe[@src]"):
src = iframe.attrib.get("src")
# print ("FOUND IFRAME", src, file=sys.stderr)
a = replace_elt(t, iframe, "a")
a.attrib["href"] = src
a.attrib["class"] = "iframe"
a.text = src
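# The transformation above turns e.g. <iframe src="https://example.org/embed/123"></iframe>
# into <a class="iframe" href="https://example.org/embed/123">https://example.org/embed/123</a>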
def pandoc (src, fro="html", to="mediawiki"):
# print ("[pandoc]", file=sys.stderr)
p = subprocess.Popen(["pandoc", "--from", fro, "--to", to], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate(src.encode("utf-8"))
return stdout.decode("utf-8")
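# Rough usage sketch, assuming the pandoc binary is available on the PATH:
#   pandoc("<p>Hello <b>world</b></p>")  ->  roughly "Hello '''world'''"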
def absolutize_links (t, url):
for elt in t.findall(".//*[@href]"):
href = elt.attrib.get("href")
elt.attrib['href'] = urljoin(url, href)
for elt in t.findall(".//*[@src]"):
href = elt.attrib.get("src")
elt.attrib['src'] = urljoin(url, href)
def scrape_url_to_mediawiki (url, title_selector=None, content_selector=None, remove_title=False, remove_selector=None):
# url = normalize_url(url)
print ("scrape_url_to_mediawiki {0}".format(url), file=sys.stderr)
f = urlopen(url)
t = html5lib.parse(f, namespaceHTMLElements=False, treebuilder="lxml")
absolutize_links(t, url)
iframe_to_links(t)
# print (ET.tostring(t, encoding="unicode", method="html"))
if remove_selector:
for elt in t.xpath(css_to_xpath(remove_selector)):
print ("Removing", elt, file=sys.stderr)
elt.getparent().remove(elt)
title, src = extract_content(t, title_selector, content_selector, remove_title)
src = pandoc(src, to="mediawiki").strip()
# if title:
# src = "= {0} =\n\n".format(title) + src
# src += "\n\n[[Category:Scrape links]]"
return title, src
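# Usage sketch (selectors are hypothetical and depend on the site being scraped):
#   title, wikitext = scrape_url_to_mediawiki("https://example.org/article",
#       title_selector="h1", content_selector="article", remove_selector=".sidebar")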
def scrape2 (url, title_selector, content_selector, remove_title, remove_selector, wikiprotocol, wikihost, wikipath, wikipage, user, password):
    """ tester: scrape a URL to mediawiki markup and save it to a wiki page """
    import mwclient
    wiki = mwclient.Site((wikiprotocol, wikihost), path=wikipath)
    wiki.login(user, password)
    title, newsrc = scrape_url_to_mediawiki(url, title_selector, content_selector, remove_title, remove_selector)
    page = wiki.pages[wikipage]
    page.save(newsrc)
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser("")
......@@ -49,6 +187,24 @@ if __name__== "__main__":
# ap_foo.add_argument('y', type=float)
ap_norm.set_defaults(func=normalize_url)
aps2 = sub.add_parser('scrape2', help="scrape a URL (2)")
aps2.add_argument('url')
aps2.add_argument('--title-selector', default=None, help="CSS selector to extract title content")
aps2.add_argument('--remove-title', default=False, action="store_true", help="remove title (if specified with a selector)")
aps2.add_argument('--content-selector', default=None, help="CSS selector to extract main content")
aps2.add_argument('--remove-selector', default=None, help="CSS selector of content to remove")
aps2.add_argument("--wikiprotocol", default="http")
aps2.add_argument("--wikihost", default="localhost")
aps2.add_argument("--wikipath", default="/mw/")
aps2.add_argument("--wikipage", default="Test")
aps2.add_argument("--user", default="Michael Murtaugh@bot")
aps2.add_argument("--password", default="3ap1p11k33snual7hr2m9nldvg1jo0si")
# aps2.add_argument("--page", default=None)
aps2.set_defaults(func=scrape2)
args = ap.parse_args()
# print (args)
params = vars(args).copy()
......
from __future__ import print_function
try:
import lxml.etree as ET
except ImportError:
from xml.etree import ElementTree as ET
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
import re
import sys
def innerHTML (elt):
if elt.text != None:
ret = elt.text
else:
ret = u""
    return ret + u"".join([ET.tostring(x, method="html", encoding="unicode") for x in elt])
def textContent (elt):
if elt.text != None:
ret = elt.text
else:
ret = u""
return ret + u"".join([ET.tostring(x, method="text", encoding="utf8").decode("utf-8") for x in elt])
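# e.g. for an element parsed from "<p>Hello <b>world</b>!</p>", textContent
# returns "Hello world!" (the tail text of child elements is included).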
def hn (elt):
""" iterator for all header elements """
for n in range(5):
h = elt.find(".//h{0}".format(n+1))
if h != None:
return h
def absolutize_links(t, baseurl):
""" Use a baseurl to absolutize the links of an etree """
for elt in t.findall(".//*[@href]"):
elt.attrib['href'] = urljoin(baseurl, elt.attrib.get("href"))
for elt in t.findall(".//*[@src]"):
elt.attrib['src'] = urljoin(baseurl, elt.attrib.get("src"))
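# e.g. href="../img/logo.png" with baseurl "http://example.org/a/b/page.html"
# resolves to "http://example.org/a/img/logo.png"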
def parentchilditer (elt):
for parent in elt.iter():
for child in parent:
yield parent, child
def parentchilditerwithindex (elt):
for parent in elt.iter():
for i, child in enumerate(parent):
yield parent, child, i
def replace_elt (t, elt, tag):
for p, c, i in parentchilditerwithindex(t):
if c == elt:
# print ("replacing {0} with {1}".format(elt.tag, tag), file=sys.stderr)
newelt = ET.SubElement(p, tag)
p.remove(elt)
p.insert(i, newelt)
return newelt
def containing_tags (elt, fromelt):
ret = None
if elt == fromelt:
return [elt]
for child in fromelt:
nelts = containing_tags(elt, child)
if nelts:
ret = [fromelt] + nelts
return ret
def is_header (elt):
return re.search(r"^h\d$", elt.tag) != None
......@@ -331,6 +331,9 @@ if __name__ == "__main__":
ap.add_argument("--allpages", default=False, action="store_true")
ap.add_argument("--force", action="store_true", default=False)
ap.add_argument("--output", type=argparse.FileType('w'), default=sys.stdout)
ap.add_argument("--namespace", action="append")
args = ap.parse_args()
# Template
......@@ -352,6 +355,19 @@ if __name__ == "__main__":
output = []
# namespaces
if args.namespace:
for ns in args.namespace:
for nsid, name in site.namespaces.items():
if (ns == name):
pages = list(site.allpages(namespace=nsid))
pages.sort(key=lambda x: x.name)
for i, p in enumerate(pages):
print ("NS {0}/{1} {2}...".format(i+1, len(pages), p.name), file=sys.stderr)
d = dumppage(p, site, path=args.path, htmltemplate=template, force=args.force)
output.append(d)
if args.limit and i+1>=args.limit:
break
if args.page:
p = site.pages[args.page]
d = dumppage(p, site, path=args.path, htmltemplate=template, force=args.force)
......@@ -397,4 +413,6 @@ if __name__ == "__main__":
nextround.append(c)
do = nextround
json.dump({'@graph': output}, args.output, indent=2)
from mwclient import Site
import argparse, re, sys
from mediawikiutils import upload_url_to_wiki
from datetime import datetime
from PIL import Image
def filetest (p):
""" Filter evil 1x1 pixel images """
im = Image.open(p)
w, h = im.size
if (w <= 1 and h <= 1):
return False
return True
def fixurlfilelinks (src, wiki, timestamp=None):
if timestamp == None:
timestamp = datetime.now().strftime("%Y/%m/%d")
def linksub (m):
print ("linksub", m.group(1), file=sys.stderr)
url = m.group(1)
        page = upload_url_to_wiki(url, wiki, "{{{{Provenance web|URL={0}|Date={1}}}}}".format(url, timestamp))
if page == None:
print ("URL failed {0} removing link: {1}".format(m.group(1), m.group(0)), file=sys.stderr)
return ""
else:
return "[[{0}{1}]]".format(page.name, m.group(2) or "")
return re.sub(r"\[\[File\:(https?\://.+?)(\|.+)?\]\]", linksub, src)
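# e.g. "[[File:http://example.org/pic.jpg|thumb]]" becomes roughly "[[File:Pic.jpg|thumb]]"
# once the image has been uploaded (the exact File: page name is decided by the wiki).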
def fixlinks (page, wiki):
src = page.text()
newsrc = fixurlfilelinks(src, wiki)
if newsrc != src:
print ("[fixlinks] {0} saving updated page...".format(page.name), file=sys.stderr)
page.save(newsrc, description="fix broken file links bot")
if __name__ == "__main__":
ap = argparse.ArgumentParser("")
ap.add_argument("--wikiprotocol", default="http")
ap.add_argument("--wikihost", default="erg.activearchives.org")
ap.add_argument("--wikipath", default="/mw/")
# ap.add_argument("--wikihost", default="localhost")
# ap.add_argument("--wikipath", default="/mw/")
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
ap.add_argument("--user", default="Michael Murtaugh@bot")
ap.add_argument("--password", default="3ap1p11k33snual7hr2m9nldvg1jo0si")
ap.add_argument("--page", default=None, help="fix just the page with the given name (otherwise default is to use Category:Pages with broken file links)")
args = ap.parse_args()
wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
if args.user:
wiki.login(args.user, args.password)
    if args.page:
        fixlinks(wiki.pages[args.page], wiki)
    else:
        for page in wiki.categories["Pages with broken file links"]:
            fixlinks(page, wiki)
......@@ -16,6 +16,7 @@ import shutil
import subprocess
import tempfile
import sys
from utils import download_url_to_file
def category_members (site, cattitle, objects=True):
......@@ -204,6 +205,26 @@ def upload_file_to_wiki (wiki, path, filename=None, description="", phpuploader=
for im in wiki.allimages(sha1=sh.hexdigest()):
return im
def upload_url_to_wiki (url, wiki, description="", phpuploader=None, user=None, cachedir=None, filetest=None):
tmp = None
try:
if cachedir == None:
tmp = tempfile.TemporaryDirectory()
cachedir = tmp.name
path = download_url_to_file(url, cachedir)
if not path:
return None
if filetest:
if not filetest(path):
return None
filename = os.path.basename(path)
filepage = upload_file_to_wiki (wiki, path, filename=filename, description=description, phpuploader=phpuploader, user=user)
# TODO: delete the path?? (when not using tmpdirectory)
return filepage
finally:
if tmp:
tmp.cleanup()
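# Usage sketch (the URL and the filetest callback are illustrative):
#   filepage = upload_url_to_wiki("https://example.org/logo.png", wiki,
#                                  description="{{Provenance web|URL=...}}",
#                                  filetest=my_filetest)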
if __name__ == "__main__":
import argparse
from mwclient import Site
......
import re
url = "http://i.creativecommons.org/l/by-nc-nd/2.0/be/80x15.png"
# def page_exists (wiki, name):
# return wiki.pages[name].exists
from urllib.request import Request, urlopen
def strip_all_whitespace (text):
    return re.sub(r"\s+", " ", text).strip()
# f = urlopen(url)
f = urlopen(Request(url, headers={'User-Agent': 'Mozilla/5.0'}))
print (f)
print ("|{0}|".format(strip_all_whitespace("""
This is a test.
-- More things here.
* Hey what about that!
""")))
\ No newline at end of file
from urllib.parse import urlparse, urljoin, unquote as urlunquote
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
import os, re, subprocess, sys
# extensions for wget
EXT = {}
EXT["image/png"] = "png"
EXT["image/gif"] = "gif"
EXT["image/jpeg"] = "jpg"
EXT["image/jpg"] = "jpg"
EXT["image/svg"] = "svg"
EXT["image/svg+xml"] = "svg"
EXT["application/svg+xml"] = "svg"
EXT["application/svg"] = "svg"
EXT["application/pdf"] = "pdf"
def ensure_extension (filename, ext):
base, fext = os.path.splitext(filename)
return base+"."+ext
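# e.g. ensure_extension("chart", "png") -> "chart.png", ensure_extension("photo.jpeg", "jpg") -> "photo.jpg"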
def wget (url, cachedir, blocksize=4*1000, user_agent='Mozilla/5.0'):
# if type(url) == unicode:
# url = url.encode("utf-8")
try:
# fin = urlopen(url)
fin = urlopen(Request(url, headers={'User-Agent': user_agent}))
ct = fin.info().get("content-type")
filename = os.path.basename(urlunquote(urlparse(url).path))
if ct in EXT:
filename = ensure_extension(filename, EXT[ct])
count = 0
# path = os.path.join(cachedir, "tmp."+EXT[ct])
path = os.path.join(cachedir, filename)
with open(path, "wb") as fout:
while True:
data = fin.read(blocksize)
if not data:
break
fout.write(data)
count += len(data)
if count > 0:
return path
# print ("[wget] skipping {0} content-type {1}".format(url, ct), file=sys.stderr)
# return None
except HTTPError as e:
print ("[wget]: skipping {0} HTTP ERROR {1}".format(url, e.code), file=sys.stderr)
return None
except URLError as e:
print ("[wget]: skipping {0} URL ERROR {1}".format(url, e.reason), file=sys.stderr)
return None
# CATCH ALL EXCEPTIONS (dangerous)
# except Exception as e:
# print ("[wget]: skipping {0} An exception occured {1}".format(url, e), file=sys.stderr)
# return None
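# NB: youtube_dl below locates the downloaded file by parsing youtube-dl's
# "[download] Destination: ..." / "... has already been downloaded" console output;
# a change in that output format would break the filename detection.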
def youtube_dl (url, cachedir, format=None, youtube_dl_path="youtube-dl"):
args = [youtube_dl_path]
if format != None:
args.append("-f")
args.append(format)
args.append(url)
p = subprocess.Popen(args, cwd=cachedir, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
output, _ = p.communicate()
output = output.decode("utf-8")
m = re.search(r"^\[download\] Destination: (.+?)\s*$", output, re.M)
if m:
filename = m.group(1)
# print ("DOWNLOADED \"{0}\"".format(filename), file=sys.stderr)
return os.path.join(cachedir, filename)
else:
m = re.search(r"^\[download\] (.+?) has already been downloaded", output, re.M)
if m:
filename = m.group(1)
# print ("ALREADY DOWNLOADED \"{0}\"".format(filename))
return os.path.join(cachedir, filename)
else:
# print ("DOWNLOAD FAILURE", file=sys.stderr)
return None
def download_url_to_file (href, cachedir):
""" returns path to file """
if "mixcloud.com/" in href:
print ("[mixcloud] {0}".format(href), file=sys.stderr)
return youtube_dl(href, cachedir)
elif "vimeo.com/" in href:
print ("[vimeo] {0}".format(href), file=sys.stderr)
return youtube_dl(href, cachedir, format="http-360p")
elif "youtube.com/" in href:
print ("[youtube] {0}".format(href), file=sys.stderr)
return youtube_dl(href, cachedir, format="18")
elif "youtu.be/" in href:
print ("[youtube] {0}".format(href), file=sys.stderr)
return youtube_dl(href, cachedir, format="18")
else:
return wget(href, cachedir)
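# Usage sketch (any writable directory can serve as cachedir):
#   path = download_url_to_file("https://example.org/poster.pdf", "/tmp/scrape-cache")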
......@@ -175,6 +175,14 @@ function mouseover_href (href, src) {
}
function show_external (href) {
console.log("show_external", href);
$.fancybox.open({src: href, type: 'iframe'});
// document.getElementById("external")
// .contentWindow
// .postMessage({msg: "showurl", href: href, type:'iframe'}, "*");
// document.getElementById("externalframe").classList.add("visible");
return;
document.getElementById("external").src = href;
var xu = document.getElementById("externalurl");
// defer this to onload
......
......@@ -37,7 +37,7 @@ m/p/wordmap/dist.js: m/p/wordmap/wordmap.js m/p/wordmap/svgcamera.js
################################
m/data/wikidump.json: web.touch
python3 m/scripts/mediawikidump.py --allpages --subcats Category:Menu --path m/wiki --output $@
python3 m/scripts/mediawikidump.py --namespace "Page web" --allpages --subcats Category:Menu --path m/wiki --output $@
# python3 m/scripts/mediawikidump.py --allpages --allcats --path m/wiki --output $@
forcedump:
......