rdf_index.py 2.13 KB
Newer Older
1
2
3
4
5
6
7
from django.core.management.base import BaseCommand
from etherpadlite.models import Pad
from django.contrib.sites.models import Site
from django.urls import reverse
from aasniff import AAApp
import markdown
from markdown.extensions.toc import TocExtension
8
9
10
11
from django.test import Client
import html5lib
from rdflib.plugins.memory import IOMemory
import rdflib
alexandre's avatar
alexandre committed
12
from ... import settings as app_settings
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


def tidy(string):
    """Return *string* re-serialised as XML after an html5lib parse.

    The markup is parsed with html5lib into a DOM tree. The first
    ``<html>`` element is given an ``xmlns`` attribute pointing at the
    XHTML namespace when it does not already have one, and the whole
    document is then serialised with ``toxml()``.
    """
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=dom_builder).parse(string)

    # FIXME: remove this? we moved to rdflib
    # Redland crashes if no xmlns attribute is declared.
    # see: http://bugs.librdf.org/mantis/view.php?id=521
    # Let's fix it in the meantime...
    root = document.getElementsByTagName("html")[0]
    if not root.hasAttribute("xmlns"):
        root.setAttribute("xmlns", "http://www.w3.org/1999/xhtml")

    return document.toxml()
28
29
30
31
32
33
34
35


class Conf(object):
    """Configuration object handed to ``AAApp`` (see ``Command.handle``)."""

    # Names of the sniffers to enable.
    # NOTE(review): presumably resolved by aasniff by name -- confirm there.
    SNIFFERS = [
        'HttpSniffer',
        'HtmlSniffer',
    ]

    # RDF store setting, taken verbatim from the application settings module.
    STORE = app_settings.STORE
37
38
39
40
41
42
43
44
45
46
47

class Command(BaseCommand):
    """Management command that feeds the RDFa of every pad into the app graph."""

    args = ''
    help = 'Indexes pages'

    def handle(self, *args, **options):
        """Fetch each pad's read view and parse its RDFa into ``app.graph``.

        Every ``Pad`` is rendered through Django's test client (no real HTTP
        request is made), tidied into XML with ``tidy`` and parsed as RDFa.
        The public URL built from the current ``Site`` domain is used as the
        ``publicID`` of the parsed document.

        Pads whose page does not answer with status 200, or whose markup
        cannot be parsed, are reported on ``self.stderr`` and skipped so that
        one broken pad does not abort the whole run.
        """
        domain = Site.objects.get_current().domain

        app = AAApp(conf=Conf)

        for pad in Pad.objects.all():
            # The test client keeps cookies between requests, so a fresh
            # client per pad keeps every fetch independent of the others.
            c = Client()
            path = reverse('pad-read', kwargs={'mode': 'r', 'slug': pad.display_slug})
            url = "http://{}{}".format(domain, path)

            self.stdout.write("parsing {}".format(url))

            response = c.get(path)
            if response.status_code != 200:
                self.stderr.write(
                    "failed to parse {} (HTTP status {})".format(url, response.status_code)
                )
                continue

            try:
                app.graph.parse(data=tidy(response.content), format="rdfa", publicID=url)
            except Exception as exc:
                # Best effort: report the reason and carry on with the next
                # pad. Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit still stop the command.
                self.stderr.write("couldn't parse {}: {}".format(url, exc))

        # NOTE: earlier approach, kept for reference. Pages were parsed into a
        # temporary in-memory graph whose quads were then synced into
        # app.graph:
        #
        # store = IOMemory()
        # graph = rdflib.graph.ConjunctiveGraph(store=store)
        # ...
        # for quad in graph.quads():
        #     # avoids duplicate statements
        #     app.graph.remove(quad)
        #     # adds the new statements
        #     app.graph.add(quad)