Commit 491afc46 authored by alexandre
Browse files

WIP: testing parsing and storing RDFa with RDFlib

parent 13da1aae
from django.core.management.base import BaseCommand
from etherpadlite.models import Pad
from django.contrib.sites.models import Site
from django.urls import reverse
from aasniff import AAApp
class Conf:
    """Settings object handed to ``AAApp`` as its ``conf`` argument."""

    # Sniffer names, presumably resolved by aasniff -- confirm against AAApp.
    SNIFFERS = ['HttpSniffer', 'HtmlSniffer']

    # Store backend settings: engine name and store name.
    STORE = dict(ENGINE='sqlite', NAME='aasniff.sqlite')
class Command(BaseCommand):
    """Management command that indexes the read-mode page of every pad.

    For each ``Pad`` the absolute URL of its ``pad-read`` view is built from
    the first configured ``Site`` domain and handed to ``AAApp.index``.
    """

    args = ''
    help = 'Indexes pages'

    def handle(self, *args, **options):
        """Index every pad, reporting success or failure per URL.

        Returns:
            An error message string when no site domain is configured
            (Django writes a returned string to stdout); otherwise ``None``.
        """
        # A single query: ``first()`` yields None on an empty table, so no
        # separate ``count()`` round-trip is needed.
        site = Site.objects.first()
        domain = site.domain if site is not None else None
        if not domain:
            return "No site domain settings found"

        host = f"http://{domain}"
        app = AAApp(conf=Conf)

        for pad in Pad.objects.all():
            path = reverse(
                'pad-read',
                kwargs={'mode': 'r', 'slug': pad.display_slug},
            )
            url = f"{host}{path}"
            try:
                app.index(url)
            except Exception as exc:
                # Best-effort indexing: keep going with the next pad, but do
                # not trap KeyboardInterrupt/SystemExit and do say why.
                print(f"fail at indexing {url}: {exc!r}")
            else:
                print(f"indexed {url}")
from django.core.management.base import BaseCommand
from etherpadlite.models import Pad
from django.contrib.sites.models import Site
from django.urls import reverse
from aasniff import AAApp
import markdown
from markdown.extensions.toc import TocExtension
class Conf:
    """Configuration class passed to ``AAApp(conf=...)``."""

    # Names of the sniffers to enable (semantics defined by aasniff -- verify).
    SNIFFERS = [
        'HttpSniffer',
        'HtmlSniffer',
    ]

    # Store settings: engine identifier plus store name.
    STORE = dict(
        ENGINE='sqlite',
        NAME='aasniff.sqlite',
    )
class Command(BaseCommand):
    """Management command that renders every pad's Markdown text to HTML.

    Work in progress: the rendered document is only printed for now; the
    RDFa parsing step is still commented out at the end of the loop.
    """

    args = ''
    help = 'Indexes pages'

    def handle(self, *args, **options):
        """Fetch each pad's text, convert it to HTML and print the document."""
        # ``Site`` is already imported at module level; no local re-import.
        domain = Site.objects.get_current().domain
        # Kept for the commented-out graph parsing step below.
        app = AAApp(conf=Conf)

        # The converter is loop-invariant: build it once and ``reset()`` it
        # before each document so per-document extension state (meta, TOC,
        # footnotes, ...) does not leak from one pad into the next.
        md = markdown.Markdown(
            extensions=['extra', 'meta', TocExtension(baselevel=2), 'attr_list'],
        )

        for pad in Pad.objects.all():
            # TODO: make a Pad method for that
            client = pad.epclient
            text = client.getText(pad.padid)['text']

            path = reverse(
                'pad-read',
                kwargs={'mode': 'r', 'slug': pad.display_slug},
            )
            # FIXME: handle https as well
            url = f"http://{domain}{path}"

            text = md.reset().convert(text)
            html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<title>Example Document</title>
</head>
<body>
{text}
</body>
</html>"""
            print(html)
            # print(f"parsing {url}")
            # app.graph.parse(data=html, format="rdfa", publicID=url)
from django.core.management.base import BaseCommand
from etherpadlite.models import Pad
from django.contrib.sites.models import Site
from django.urls import reverse
from aasniff import AAApp
import rdflib
class Conf:
    """Configuration supplied to ``AAApp`` through its ``conf`` keyword."""

    # Sniffers to load, listed by name (interpretation is up to aasniff -- confirm).
    SNIFFERS = ['HttpSniffer', 'HtmlSniffer']

    # Store definition: which engine to use and the store's name.
    STORE = {'ENGINE': 'sqlite', 'NAME': 'aasniff.sqlite'}
class Command(BaseCommand):
    """Management command that lists every ``dcterms:title`` in the graph.

    Runs a SPARQL query against ``app.graph`` and prints each title found.
    """

    args = ''
    help = 'Indexes pages'

    def handle(self, *args, **options):
        """Query the graph for subject/title pairs and print the titles."""
        app = AAApp(conf=Conf)
        # print(rt)
        # for quad in graph.quads():
        # print(quad)

        # Predicate to look up; injected into the query through initBindings.
        node = rdflib.URIRef("http://purl.org/dc/terms/title")

        # Prefixes made available to the SPARQL query.
        NS = {
            'bibo': rdflib.Namespace("http://purl.org/ontology/bibo/"),
            'bib': rdflib.Namespace("http://purl.org/net/biblio#"),
            'dc': rdflib.namespace.DC,
            'dcterms': rdflib.namespace.DCTERMS,
            'egr2': rdflib.Namespace("http://rdvocab.info/ElementsGr2/"),
            'foaf': rdflib.namespace.FOAF,
            'frbr': rdflib.Namespace("http://purl.org/vocab/frbr/core#"),
            'purl': rdflib.Namespace("http://purl.org/dc/terms/"),
            'rdf': rdflib.RDF,
            'rdvocab': rdflib.Namespace("http://RDVocab.info/elements/"),
            'stats': rdflib.Namespace("http://kavan.land/vocab/stats#"),
            'vocab': rdflib.Namespace("http://purl.org/vocab/frbr/core#"),
            'wemi': rdflib.Namespace("http://RDVocab.info/RDARelationshipsWEMI/"),
            'z': rdflib.Namespace("http://www.zotero.org/namespaces/export#"),
        }

        # The original query hard-coded dcterms:title and never mentioned
        # ?predicate, so the initBindings entry had no effect. Using the
        # variable makes the binding meaningful; since ``node`` is the
        # dcterms:title URI the result set is unchanged.
        as_predicate = app.graph.query(
            """
            SELECT DISTINCT ?subject ?title
            WHERE {
                ?subject ?predicate ?title.
            }
            """,
            initBindings={'predicate': node},
            initNs=NS,
        )

        print(as_predicate)
        for row in as_predicate:
            # Result rows expose the projected variables by name.
            print(row.title)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment