rdf_index.py 2.1 KB
Newer Older
1
2
3
4
5
6
7
from django.core.management.base import BaseCommand
from etherpadlite.models import Pad
from django.contrib.sites.models import Site
from django.urls import reverse
from aasniff import AAApp
import markdown
from markdown.extensions.toc import TocExtension
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from django.test import Client
import html5lib
from rdflib.plugins.memory import IOMemory
import rdflib


def tidy(string):
    """Parse *string* as HTML with html5lib and return the document as XML.

    The markup is parsed into a DOM tree. If the first ``<html>`` element
    carries no ``xmlns`` attribute, the XHTML namespace is set on it before
    the tree is serialized with ``toxml()``.
    """
    tree_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=tree_builder).parse(string)

    # FIXME: remove this? we moved to rdflib
    # Redland crashes when no xmlns attribute is declared, see
    # http://bugs.librdf.org/mantis/view.php?id=521
    # so the attribute is added here in the meantime.
    html_root = document.getElementsByTagName("html")[0]
    needs_namespace = not html_root.hasAttribute("xmlns")
    if needs_namespace:
        html_root.setAttribute("xmlns", "http://www.w3.org/1999/xhtml")

    return document.toxml()
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


class Conf:
    """Configuration passed to ``AAApp`` by the indexing command."""

    # presumably the names of the aasniff sniffers to enable — confirm
    # against the aasniff package.
    SNIFFERS = ['HttpSniffer', 'HtmlSniffer']

    # presumably selects the backend and file name of the graph store —
    # confirm against the aasniff package.
    STORE = {'ENGINE': 'sqlite', 'NAME': 'aasniff.sqlite'}


class Command(BaseCommand):
    """Management command that indexes the RDFa markup of every pad.

    Each pad's ``pad-read`` view is rendered through Django's test client,
    tidied with ``tidy()`` and parsed as RDFa into the graph of an ``AAApp``
    instance configured with ``Conf``.
    """

    # Kept for backward compatibility.
    # NOTE(review): recent Django versions appear to ignore ``args`` — confirm.
    args = ''
    help = 'Indexes pages'

    def handle(self, *args, **options):
        """Fetch every pad's read view and parse it into ``app.graph``.

        Failures are reported on ``self.stderr`` and do not abort the run;
        the remaining pads are still processed.
        """
        domain = Site.objects.get_current().domain

        app = AAApp(conf=Conf)

        # NOTE: an earlier version parsed into a temporary in-memory
        # ConjunctiveGraph and then removed/re-added every quad on app.graph
        # to avoid duplicate statements. That step is disabled; statements
        # are parsed straight into app.graph.
        for pad in Pad.objects.all():
            # The test client is stateful (it stores cookies), so a fresh one
            # is used per pad, as before, to keep the requests independent.
            client = Client()
            path = reverse('pad-read', kwargs={'mode': 'r', 'slug': pad.display_slug})
            response = client.get(path)
            # Public URL of the page; used as the base IRI of its statements.
            url = f"http://{domain}{path}"

            self.stdout.write(f"parsing {url}")
            if response.status_code != 200:
                self.stderr.write(
                    f"failed to parse {url} (HTTP {response.status_code})"
                )
                continue

            try:
                app.graph.parse(data=tidy(response.content), format="rdfa", publicID=url)
            except Exception as exc:
                # Best effort: one broken page must not stop the indexing,
                # but Ctrl-C / SystemExit are no longer swallowed and the
                # cause of the failure is reported.
                self.stderr.write(f"couldn't parse {url}: {exc}")