index.py 8.34 KB
Newer Older
eric's avatar
eric committed
1
2
3
4
5
# -*- coding: utf-8 -*-

# Python imports

import os
6
import re
7
import sys
eric's avatar
eric committed
8
9
import codecs
import json
10
from urllib import error
11
from time import clock
eric's avatar
eric committed
12
13
14
15
16
17
18
19
20
21
22
23

# PyPi imports

import rdflib

# Django imports

from django.core.management.base import BaseCommand, CommandError
from django.template.loader import render_to_string

# Django Apps import

gijs's avatar
gijs committed
24
from django.contrib.sites.models import Site
eric's avatar
eric committed
25
from etherpadlite.models import Pad, PadAuthor
26
from ethertoff.settings import BACKUP_DIR
eric's avatar
eric committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

"""
We scrape all the pages, construct a graph, and ask the RDF store to return us all the metadata.

The result will look something like this:

+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| subject                                                              | predicate                                  | object                                            |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/creator           | Élodie Royer et Yoann Gourmel                     |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/title             | The Play/ザ・プレイ                                 |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/created           | 2012-06-02T00:00:00                               |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/1999/xhtml/vocab#license | http://creativecommons.org/licenses/by-nd/3.0/fr/ |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/title             | Utopia                                            |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/creator           | Bernadette Mayer                                  |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/1999/xhtml/vocab#license |                                                   |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/created           | 2014-10-02T00:00:00                               |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+

We then use python to construct a list that is easy to use in a template, something like:

[
  {
    "language": "fr",
    "author": "Mayer, Bernadette",
    "title": "Utopia",
    "href": "http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md",
    "authors": [
      "Mayer, Bernadette"
    ],
    "date": "2014-10-10T00:00:00"
  },
  {
    "license": "http://creativecommons.org/licenses/by-nd/3.0/fr/",
    "language": "fr",
    "author": "Gourmel, Yoann",
    "title": "The Play/ザ・プレイ",
    "href": "http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md",
    "authors": [
      "Royer, Élodie",
      "Gourmel, Yoann"
    ],
    "date": "2012-06-10T00:00:00"
  }
]

"""


# SPARQL query that simply fetches every triple (i.e. all the metadata) held
# in the graph, ordered by subject.
# The `dc:` prefix is declared but never referenced in the query body, and the
# `\n` escape only adds one extra blank line inside the query text.
sparql_query = """PREFIX dc: <http://purl.org/dc/elements/1.1/>
select ?subject ?predicate ?object
where { ?subject ?predicate ?object .\n
}
order by ?subject 
"""

# Maps predicate URIs to the shorter, friendlier key names used in the
# template. Predicates that are not listed here are ignored when the
# article dictionaries are built.
short_names = {
    "http://www.w3.org/1999/xhtml/vocab#license": "license",
    "http://purl.org/dc/terms/title": "title",
    "http://purl.org/dc/terms/creator": "authors",
    "http://purl.org/dc/terms/created": "date",
    "http://purl.org/dc/terms/language": "language",
    "http://purl.org/dc/terms/type": "type",
    "http://purl.org/dc/terms/identifier": "id",
    "http://commontag.org/ns#Tag": "tags",
    "http://purl.org/dc/terms/subject": "theme",
    "http://purl.org/dc/terms/contributor": "participants",
    "http://purl.org/dc/terms/Location": "place",
}

101
# Domain of the first configured Site; stays None when no Site exists,
# in which case snif() refuses to run.
# NOTE(review): this queries the database at import time.
HOST = None
if Site.objects.exists():
    site = Site.objects.all()[0]
    HOST = site.domain
105

eric's avatar
eric committed
106
107
108
109
110
111
112
def query_results_to_template_articles(query_results):
    """
    Transform the RDFLIB SPARQL query result into the rows that we want to
    use for the template.

    query_results is an iterable of (subject, predicate, object) triples.
    Triples belonging to the same subject must be adjacent (sparql_query
    orders by subject), because a new article is started every time the
    subject changes.

    Returns a list of dictionaries, one per subject, each holding "href"
    (the subject uri) plus one entry per predicate found in short_names.
    Articles without a title, and the article titled "About", are left out.
    The list is sorted by date, newest first; articles without a date come
    last.
    """
    # Predicates that can occur several times for one subject: their values
    # are collected in a list instead of overwriting each other.
    list_predicates = {
        "http://purl.org/dc/terms/creator",
        "http://commontag.org/ns#Tag",
        "http://purl.org/dc/terms/contributor",
        "http://purl.org/dc/terms/Location",
    }

    template_articles = []
    article = None
    current_uri = None

    for s, p, o in query_results:
        uri = s.strip()
        key = p.strip()
        value = o.strip()

        # A change of subject means the previous article is complete
        if uri != current_uri:
            if article is not None:
                template_articles.append(article)
            article = {"href": uri}
            current_uri = uri

        new_key = short_names.get(key)
        if new_key is None:
            # Predicate we have no short name for: not used in the template
            continue

        if key in list_predicates:
            article.setdefault(new_key, []).append(value)
        else:
            article[new_key] = value

    # The last article is still pending when the loop ends
    # (there is none at all if the query returned nothing)
    if article is not None:
        template_articles.append(article)

    # Ad hoc: on VJ14, we were getting a result:
    # {'date': u'2013-12-14T00:00:00',
    #  'href': u'http://video.constantvzw.org/VJ14/videoarchive/wolke/Wolke-JuliaRone.ogv',
    #  'license': u'Free Art Licence'}
    # Still don’t know why we get it—but then it balks because there is no title
    # So we first check if there is a title
    # Ad hoc: remove the about page (maybe have some meta info. that determine if a page can be sniffed?)
    template_articles = [
        a for a in template_articles
        if 'title' in a and a['title'] != "About"
    ]

    # Dates are strings, so the fallback has to be a string as well: comparing
    # str with int raises a TypeError on Python 3. The empty string sorts
    # before any date, which puts undated articles last once reversed.
    return sorted(template_articles, key=lambda a: a.get('date', ''), reverse=True)
eric's avatar
eric committed
158

159
def snif():
    """
    Scrape the metadata of all the published pads and write it to
    index.json in BACKUP_DIR.

    Every pad whose display slug ends in '.md' is fetched over http from
    HOST and parsed into one RDF graph. Pads answering with a
    '403 FORBIDDEN' are skipped; any other HTTP error is re-raised. The
    graph is then queried with sparql_query and the resulting list of
    articles is dumped as JSON.

    Returns a message for the user: either a short report with the number
    of pads and the time it took, or an explanation of why nothing was
    done when no site domain is configured.
    """
    # Imported here because time.clock() no longer exists since Python 3.8
    from time import perf_counter

    if not HOST:
        return "No site domain settings found"
    host = u"http://%s" % HOST

    start = perf_counter()

    g = rdflib.Graph()

    total = Pad.objects.count()
    for i, pad in enumerate(Pad.objects.all(), 1):
        url = host + pad.get_absolute_url()
        print("checking pad %s of %s: %s" % (i, total, pad.display_slug))
        print(url)
        # We only want to index the articles—
        # For now we can distinguish them because they have url’s
        # ending in ‘.md’
        if not pad.display_slug.endswith('.md'):
            print("no *.md extension, probably not meant for publication")
            continue
        try:
            g.parse(url)
        except error.HTTPError as e:
            if e.code != 403:
                raise
            # Some of the pads will not be public yet—
            # They give a ‘403 FORBIDDEN’ response
            # this is expected, and we don’t need to scrape them
            print("pad not public")
        else:
            print("succesfully parsed")

    d = query_results_to_template_articles(g.query(sparql_query))

    # ensure_ascii=False writes accented and non-latin characters as they are,
    # so the encoding has to be explicit instead of depending on the locale
    with open(os.path.join(BACKUP_DIR, "index.json"), 'w', encoding='utf-8') as f:
        json.dump(d, f, indent=2, ensure_ascii=False)
        # NOTE: rendering a static index.html is currently disabled:
        # with open(os.path.join(BACKUP_DIR, "index.html"), 'w') as f:
        #    f.write(render_to_string("home.html", {"articles" : d}).encode('utf-8'))

    duration = perf_counter() - start
    return "sniffed %s articles in %s seconds" % (total, duration)

eric's avatar
eric committed
205
206
207
208
209

class Command(BaseCommand):
    """
    Management command that builds the index of the articles’ metadata
    by running snif().
    """
    args = ''
    help = 'Create an index of all the articles’ metadata'

    def handle(self, *args, **options):
        """
        Announce the start of the indexing, run it, and return the
        message produced by snif().
        """
        # Printed here rather than in the class body, where it would run
        # as soon as the module is imported instead of when the command runs
        print("Starting to index, this might take some time...")
        return snif()