From d21cc4b21ea1f78fc2cb958baffac98b50645d6f Mon Sep 17 00:00:00 2001
From: Michael Murtaugh <mm@automatist.org>
Date: Fri, 13 Nov 2015 11:03:57 +0100
Subject: [PATCH] Makefile friendliness

---
 README.md                                  |  8 ++
 etherdump/commands/common.py               | 25 ++++++
 etherdump/commands/creatediffhtml.py       | 38 +++++++++
 etherdump/commands/dumpcsv.py              | 28 ++++++-
 etherdump/commands/gethtml.py              | 34 ++++++++
 etherdump/commands/{text.py => gettext.py} |  8 +-
 etherdump/commands/showmeta.py             | 32 ++++++++
 etherdump/commands/sync.py                 | 96 ++++++++++++++++++++++
 8 files changed, 261 insertions(+), 8 deletions(-)
 create mode 100644 etherdump/commands/common.py
 create mode 100644 etherdump/commands/creatediffhtml.py
 create mode 100644 etherdump/commands/gethtml.py
 rename etherdump/commands/{text.py => gettext.py} (84%)
 create mode 100644 etherdump/commands/showmeta.py
 create mode 100644 etherdump/commands/sync.py

diff --git a/README.md b/README.md
index 36d82f1..22ebd71 100644
--- a/README.md
+++ b/README.md
@@ -43,3 +43,11 @@ subcommands
 To get help on a subcommand:
 
 	etherdump revisionscount --help
+
+TODO
+--------
+* Modify tools to work with make
+    * Sync command
+    * Dump command that works on a single page
+    * Post-processing as separable filters (such as linkify)
+* Support for migrating (what dump formats exist that would allow pushing to another instance?)
diff --git a/etherdump/commands/common.py b/etherdump/commands/common.py
new file mode 100644
index 0000000..b8d6194
--- /dev/null
+++ b/etherdump/commands/common.py
@@ -0,0 +1,25 @@
+import re, os
+from urllib import quote_plus, unquote_plus
+
+
+groupnamepat = re.compile(r"^g\.(\w+)\$")
+def splitpadname (padid):
+    m = groupnamepat.match(padid)
+    if m:
+        return(m.group(1), padid[m.end():])
+    else:
+        return (u"", padid)
+
+def padpath (padid, pub_path=u"", group_path=u""):
+    g, p = splitpadname(padid)
+    if type(g) == unicode:
+        g = g.encode("utf-8")
+    if type(p) == unicode:
+        p = p.encode("utf-8")
+    p = quote_plus(p)
+    # p = p.replace(" ", "_")
+    # p = p.replace("*", "-")
+    if g:
+        return os.path.join(group_path, g, p)
+    else:
+        return os.path.join(pub_path, p)
diff --git a/etherdump/commands/creatediffhtml.py b/etherdump/commands/creatediffhtml.py
new file mode 100644
index 0000000..66be578
--- /dev/null
+++ b/etherdump/commands/creatediffhtml.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser
+import json
+from urllib import urlencode
+from urllib2 import urlopen, HTTPError, URLError
+
+
+def main(args):
+	p = ArgumentParser("")
+	p.add_argument("padid", help="the padid")
+	p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+	p.add_argument("--showurl", default=False, action="store_true")
+	p.add_argument("--format", default="text", help="output format, can be: text, json; default: text")
+	p.add_argument("--rev", type=int, default=None, help="revision, default: latest")
+	args = p.parse_args(args)
+
+	with open(args.padinfo) as f:
+		info = json.load(f)
+	apiurl =  "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+	data = {}
+	data['apikey'] = info['apikey']
+	data['padID'] = args.padid
+	data['startRev'] = "0"
+	if args.rev != None:
+		data['rev'] = args.rev
+	requesturl = apiurl+'createDiffHTML?'+urlencode(data)
+	if args.showurl:
+		print requesturl
+	else:
+		try:
+			results = json.load(urlopen(requesturl))['data']
+			if args.format == "json":
+				print json.dumps(results)
+			else:
+				print results['html'].encode("utf-8")
+		except HTTPError as e:
+			pass
\ No newline at end of file
diff --git a/etherdump/commands/dumpcsv.py b/etherdump/commands/dumpcsv.py
index d390b94..4343ecb 100644
--- a/etherdump/commands/dumpcsv.py
+++ b/etherdump/commands/dumpcsv.py
@@ -6,6 +6,7 @@ from datetime import datetime
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from csv import writer 
+from math import ceil, floor
 
 """
 Dumps a CSV of all pads with columns
@@ -32,6 +33,7 @@ def main (args):
     p = ArgumentParser("")
     p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
     p.add_argument("--format", default="csv", help="output format: csv (default), json")
+    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False")
     args = p.parse_args(args)
 
     with open(args.padinfo) as f:
@@ -41,11 +43,23 @@ def main (args):
     data['apikey'] = info['apikey']
     requesturl = apiurl+'listAllPads?'+urlencode(data)
 
-    results = jsonload(requesturl)['data']['padIDs']
-    results.sort()
+    padids = jsonload(requesturl)['data']['padIDs']
+    padids.sort()
+    numpads = len(padids)
+    maxmsglen = 0
+    count = 0
     out.writerow(("padid", "groupid", "lastedited", "revisions", "author_ids"))
-    for padid in results:
-        print (u"{0}".format(padid), file=sys.stderr)
+    for i, padid in enumerate(padids):
+        p = (float(i) / numpads)
+        percentage = int(floor(p*100))
+        bars = int(ceil(p*20))
+        bar = ("*"*bars) + ("-"*(20-bars))
+        msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), numpads, padid)
+        if len(msg) > maxmsglen:
+            maxmsglen = len(msg)
+        sys.stderr.write("\r{0}".format(" "*maxmsglen))
+        sys.stderr.write(msg.encode("utf-8"))
+        sys.stderr.flush()
         m = groupnamepat.match(padid)
         if m:
             groupname = m.group(1)
@@ -56,10 +70,16 @@ def main (args):
 
         data['padID'] = padid.encode("utf-8")
         revisions = jsonload(apiurl+'getRevisionsCount?'+urlencode(data))['data']['revisions']
+        if (revisions == 0) and not args.zerorevs:
+            continue
+
+
         lastedited_raw = jsonload(apiurl+'getLastEdited?'+urlencode(data))['data']['lastEdited']
         lastedited_iso = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
         author_ids = jsonload(apiurl+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
         author_ids = u" ".join(author_ids).encode("utf-8")
         out.writerow((padidnogroup.encode("utf-8"), groupname.encode("utf-8"), revisions, lastedited_iso, author_ids))
+        count += 1
 
+    print("\nWrote {0} rows...".format(count), file=sys.stderr)
 
diff --git a/etherdump/commands/gethtml.py b/etherdump/commands/gethtml.py
new file mode 100644
index 0000000..419c629
--- /dev/null
+++ b/etherdump/commands/gethtml.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser
+import json
+from urllib import urlencode
+from urllib2 import urlopen, HTTPError, URLError
+
+
+def main(args):
+	p = ArgumentParser("")
+	p.add_argument("padid", help="the padid")
+	p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+	p.add_argument("--showurl", default=False, action="store_true")
+	p.add_argument("--format", default="text", help="output format, can be: text, json; default: text")
+	p.add_argument("--rev", type=int, default=None, help="revision, default: latest")
+	args = p.parse_args(args)
+
+	with open(args.padinfo) as f:
+		info = json.load(f)
+	apiurl =  "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+	data = {}
+	data['apikey'] = info['apikey']
+	data['padID'] = args.padid
+	if args.rev != None:
+		data['rev'] = args.rev
+	requesturl = apiurl+'getHTML?'+urlencode(data)
+	if args.showurl:
+		print requesturl
+	else:
+		results = json.load(urlopen(requesturl))['data']
+		if args.format == "json":
+			print json.dumps(results)
+		else:
+			print results['html'].encode("utf-8")
diff --git a/etherdump/commands/text.py b/etherdump/commands/gettext.py
similarity index 84%
rename from etherdump/commands/text.py
rename to etherdump/commands/gettext.py
index c0f50dc..7e806e5 100644
--- a/etherdump/commands/text.py
+++ b/etherdump/commands/gettext.py
@@ -12,6 +12,7 @@ def main(args):
 	p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
 	p.add_argument("--showurl", default=False, action="store_true")
 	p.add_argument("--format", default="text", help="output format, can be: text, json; default: text")
+	p.add_argument("--rev", type=int, default=None, help="revision, default: latest")
 	args = p.parse_args(args)
 
 	with open(args.padinfo) as f:
@@ -19,7 +20,9 @@ def main(args):
 	apiurl =  "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
 	data = {}
 	data['apikey'] = info['apikey']
-	data['padID'] = args.padid.encode("utf-8")
+	data['padID'] = args.padid # is utf-8 encoded
+	if args.rev != None:
+		data['rev'] = args.rev
 	requesturl = apiurl+'getText?'+urlencode(data)
 	if args.showurl:
 		print requesturl
@@ -29,6 +32,3 @@ def main(args):
 			print json.dumps(results)
 		else:
 			print results['text'].encode("utf-8")
-
-	# To save to file run:
-	# python gettext.py > copy.txt
diff --git a/etherdump/commands/showmeta.py b/etherdump/commands/showmeta.py
new file mode 100644
index 0000000..9d0053a
--- /dev/null
+++ b/etherdump/commands/showmeta.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from argparse import ArgumentParser
+import json, sys, re
+from common import *
+
+"""
+Extract and output selected fields of metadata
+"""
+
+def main (args):
+    p = ArgumentParser("")
+    p.add_argument("--path", default=None, help="read from a meta.json file")
+    p.add_argument("--padid", default=None, help="read meta for this padid")
+    p.add_argument("--format", default="{padid}", help="format str, default: {padid}")
+    args = p.parse_args(args)
+
+    path = args.path
+    if not path and args.padid:
+        path = padpath(args.padid) + ".meta.json"
+
+    if not path:
+        print ("Must specify either --path or --padid")
+        sys.exit(-1)
+
+    with open(path) as f:
+        meta = json.load(f)
+
+    formatstr = args.format.decode("utf-8")
+    formatstr = re.sub(ur"{(\w+)}", r"{0[\1]}", formatstr)
+    print (formatstr.format(meta).encode("utf-8"))
+
diff --git a/etherdump/commands/sync.py b/etherdump/commands/sync.py
new file mode 100644
index 0000000..ba69f9c
--- /dev/null
+++ b/etherdump/commands/sync.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from argparse import ArgumentParser
+import sys, json, re, os
+from datetime import datetime
+from urllib import urlencode
+from urllib2 import urlopen, HTTPError, URLError
+from math import ceil, floor
+from common import *
+
+"""
+sync(meta):
+    Update meta data files for those that have changed.
+    Check for changed pads by looking at revisions & comparing to existing
+
+"""
+
+def jsonload (url):
+    f = urlopen(url)
+    data = f.read()
+    f.close()
+    return json.loads(data)
+
+def load_padinfo(p):
+    with open(p) as f:
+        info = json.load(f)
+    info['api'] = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+    return info
+
+
+def main (args):
+    p = ArgumentParser("")
+    p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False")
+    p.add_argument("--pub", default="pub", help="pub path for output, default: pub")
+    p.add_argument("--group", default="g", help="group path for output, default: g")
+    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
+    args = p.parse_args(args)
+
+    info = load_padinfo(args.padinfo)
+    data = {}
+    data['apikey'] = info['apikey']
+    padids = jsonload(info['api']+'listAllPads?'+urlencode(data))['data']['padIDs']
+    padids.sort()
+    numpads = len(padids)
+    maxmsglen = 0
+    count = 0
+    for i, padid in enumerate(padids):
+        if args.skip != None and i<args.skip:
+            continue
+        p = (float(i) / numpads)
+        percentage = int(floor(p*100))
+        bars = int(ceil(p*20))
+        bar = ("*"*bars) + ("-"*(20-bars))
+        msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), numpads, padid)
+        if len(msg) > maxmsglen:
+            maxmsglen = len(msg)
+        sys.stderr.write("\r{0}".format(" "*maxmsglen))
+        sys.stderr.write(msg.encode("utf-8"))
+        sys.stderr.flush()
+        data['padID'] = padid.encode("utf-8")
+        p = padpath(padid, args.pub, args.group)
+        metapath = p + ".meta.json"
+        revisions = None
+        if os.path.exists(metapath):
+            with open(metapath) as f:
+                meta = json.load(f)
+            revisions = jsonload(info['api']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
+            if meta['revisions'] == revisions:
+                continue
+        
+        meta = {'padid': padid.encode("utf-8")}
+        if revisions == None:
+            meta['revisions'] = jsonload(info['api']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
+        else:
+            meta['revisions' ] = revisions            
+
+        if (meta['revisions'] == 0) and (not args.zerorevs):
+            # print("Skipping zero revs", file=sys.stderr)
+            continue
+
+        count += 1
+        # todo: load more metadata!
+        meta['lastedited_raw'] = int(jsonload(info['api']+'getLastEdited?'+urlencode(data))['data']['lastEdited'])
+        meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw'])/1000).isoformat()
+        meta['author_ids'] = jsonload(info['api']   +'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
+
+        # save it
+        try:
+            os.makedirs(os.path.split(metapath)[0])
+        except OSError:
+            pass
+        with open(metapath, "w") as f:
+            json.dump(meta, f)
+
+    print("\nWrote {0} files...".format(count), file=sys.stderr)
-- 
GitLab