Commit a88550bb by murtaugh

rewrite redirected links

parent 5dae6f7f
#!/usr/bin/env python3
import argparse, json, sys, os
from urllib.parse import urljoin
from objectify_ld import objectify_ld
from mediawikidump import myurlquote
import html5lib
from xml.etree import ElementTree as ET
def resolve_redirects(d, index):
    """Follow a record's "redirects_to" chain and return the final record.

    Args:
        d: Mapping-like record. While it contains a "redirects_to" key, the
            first entry of that value is followed.
        index: Unused here; kept so existing callers keep working.

    Returns:
        The first record reached that has no (or an empty) "redirects_to".
        If the chain loops back onto a record already visited, the starting
        record is returned unchanged so callers do not hang on a redirect
        cycle.
    """
    start = d
    # Track visited records by identity: records may not be hashable.
    seen = {id(d)}
    while "redirects_to" in d:
        targets = d["redirects_to"]
        if not targets:
            # Nothing to follow; treat the current record as final.
            break
        d = targets[0]
        if id(d) in seen:
            # Redirect cycle (e.g. A -> B -> A): there is no final target.
            return start
        seen.add(id(d))
    return d
def rewrite_redirected_links(t, index, url_for_file):
    """Point links that hit a redirecting page directly at the final target.

    Every ``<a href>`` under *t* is resolved against *url_for_file*. When the
    absolute URL is a key of *index* and following its redirects ends at a
    record with a different ``'url'``, the element's ``href`` is replaced in
    place by that URL, expressed relative to the directory of *url_for_file*.
    Each rewrite is logged to stderr.

    Args:
        t: Parsed document (ElementTree element or tree) that is modified in
            place.
        index: Mapping from absolute URL to a record with a ``'url'`` entry
            and, for redirects, a ``'redirects_to'`` entry.
        url_for_file: URL of the document itself; base for resolving and for
            re-relativizing links.

    Returns:
        The number of links that were rewritten.
    """
    # URL paths always use "/" separators, so use posixpath rather than
    # os.path, whose separator depends on the platform.
    import posixpath

    base_dir = posixpath.split(url_for_file)[0]
    count = 0
    for a in t.findall(".//a[@href]"):
        oldhref = a.attrib.get("href")
        href = urljoin(url_for_file, oldhref)
        if href not in index:
            continue
        d = resolve_redirects(index[href], index)
        if d['url'] == href:
            # Not a redirect (or it resolves to itself): leave the link alone.
            continue
        newhref = posixpath.relpath(d['url'], base_dir)
        print ("[rewrite_redirected_links] {0}: {1} --> {2}".format(url_for_file, oldhref, newhref), file=sys.stderr)
        a.attrib['href'] = newhref
        count += 1
    return count
if __name__ == "__main__":
    # Command line: a JSON data file plus one or more HTML files that are
    # patched in place when they contain links to redirected pages.
    ap = argparse.ArgumentParser("")
    ap.add_argument("--data", default="m/data/merge.json")
    # NOTE: --basepath is accepted but not used anywhere below.
    ap.add_argument("--basepath", default="m/wiki/")
    ap.add_argument('input', nargs="+")
    args = ap.parse_args()

    # Read and write as UTF-8 explicitly; otherwise the locale's default
    # encoding is used and non-ASCII content can fail to decode or be
    # re-encoded differently on write.
    with open(args.data, encoding="utf-8") as f:
        data = json.load(f)
    index = objectify_ld(data, reverse=True, reverseprefix="rev_")

    for n in args.input:
        # The file's URL is its quoted path with a leading slash; it is the
        # base for resolving the relative links found in the file.
        url_for_file = "/" + myurlquote(n)
        with open(n, encoding="utf-8") as f:
            t = html5lib.parse(f.read(), namespaceHTMLElements=False)
        # Only rewrite the file on disk when at least one link changed.
        if rewrite_redirected_links(t, index, url_for_file):
            with open(n, "w", encoding="utf-8") as f:
                print ("<!DOCTYPE html>", file=f)
                print (ET.tostring(t, method="html", encoding="unicode"), file=f)
......@@ -8,7 +8,7 @@
#skin_map=$(skins:%=m/skins/%/map.html)
all: accueil m/index.html m/wiki/index/index.html m/wiki/index.json backlinks m/data/main.json
all: accueil m/index.html m/wiki/index/index.html m/wiki/index.json rewrite_redirects backlinks m/data/main.json
#skins: $(skin_index) $(skin_map)
......@@ -22,6 +22,9 @@ clean:
rm -rf m/wiki/*
# rm -rf m/feeds/*
feeds:
python3 m/scripts/mediawikifeeder.py --user Ergbot --password mettezlesmotsquimanquent feed
site: zapzeros $(data) $(agenda) m/wiki m/menu.html
doc: $(dochtml)
......@@ -167,6 +170,16 @@ backlinks: m/data/merge04.json m/templates/backlinks.html
m/wiki/Actualités/*.html \
m/wiki/Catégorie/*.html
# rewrite_redirects: run m/scripts/rewrite_redirected_links.py over the wiki
# HTML files matched by the globs below, using m/data/merge02.json as --data.
# Declared phony because the rule produces no file named after the target.
.PHONY: rewrite_redirects
rewrite_redirects: m/data/merge02.json
python3 m/scripts/rewrite_redirected_links.py \
--data m/data/merge02.json \
m/wiki/*.html \
m/wiki/Page_web/*.html \
m/wiki/Actualités/*.html \
m/wiki/Catégorie/*.html
m/data/merge02.json: m/data/merge.json
python3 m/scripts/pp_absurls.py $< /m/wiki/ > $@
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment