index.py 8.24 KB
Newer Older
eric's avatar
eric committed
1
2
3
4
5
6
# -*- coding: utf-8 -*-

# Python imports

import json
import os
from time import perf_counter
from urllib import error

try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; keep the name importable so the
    # module still loads there.
    clock = perf_counter

# PyPi imports

import rdflib

# Django imports

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.management.base import BaseCommand

# Django Apps import

from etherpadlite.models import Pad
eric's avatar
eric committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

"""
We scrape all the pages, construct a graph, and ask the RDF store to return us all the metadata.

The result will look something like this:

+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| subject                                                              | predicate                                  | object                                            |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/creator           | Élodie Royer et Yoann Gourmel                     |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/title             | The Play/ザ・プレイ                                 |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/created           | 2012-06-02T00:00:00                               |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/1999/xhtml/vocab#license | http://creativecommons.org/licenses/by-nd/3.0/fr/ |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/title             | Utopia                                            |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/creator           | Bernadette Mayer                                  |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/1999/xhtml/vocab#license |                                                   |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/created           | 2014-10-02T00:00:00                               |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+

We then use python to construct a list that is easy to use in a template, something like:

[
  {
    "language": "fr",
    "author": "Mayer, Bernadette",
    "title": "Utopia",
    "href": "http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md",
    "authors": [
      "Mayer, Bernadette"
    ],
    "date": "2014-10-10T00:00:00"
  },
  {
    "license": "http://creativecommons.org/licenses/by-nd/3.0/fr/",
    "language": "fr",
    "author": "Gourmel, Yoann",
    "title": "The Play/ザ・プレイ",
    "href": "http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md",
    "authors": [
      "Royer, Élodie",
      "Gourmel, Yoann"
    ],
    "date": "2012-06-10T00:00:00"
  }
]

"""


# SPARQL query that fetches every (subject, predicate, object) triple in the
# graph, ordered by subject so that all triples of one page arrive together.
# The `dc` prefix is declared but not referenced by the query body.
sparql_query = (
    "PREFIX dc: <http://purl.org/dc/elements/1.1/>\n"
    "select ?subject ?predicate ?object\n"
    "where { ?subject ?predicate ?object .\n"
    "\n"
    "}\n"
    "order by ?subject \n"
)

# Map between predicate uris and more easy names to use in the template
short_names = {
    "http://www.w3.org/1999/xhtml/vocab#license": "license",
    "http://purl.org/dc/terms/title": "title",
    "http://purl.org/dc/terms/creator": "authors",
    "http://purl.org/dc/terms/created": "date",
    "http://purl.org/dc/terms/language": "language",
    "http://purl.org/dc/terms/type": "type",
    "http://purl.org/dc/terms/identifier": "id",
    "http://commontag.org/ns#Tag": "tags",
    "http://purl.org/dc/terms/subject": "theme",
    "http://purl.org/dc/terms/contributor": "participants",
    "http://purl.org/dc/terms/Location": "place",
}

# Predicates whose values are collected into a list instead of stored as a
# single string.
_LIST_VALUED_PREDICATES = frozenset([
    "http://purl.org/dc/terms/creator",
    "http://commontag.org/ns#Tag",
    "http://purl.org/dc/terms/contributor",
    "http://purl.org/dc/terms/Location",
])


def query_results_to_template_articles(query_results):
    """
    Transform SPARQL result rows into a list of article dicts for the template.

    `query_results` is any iterable of (subject, predicate, object) rows whose
    members support `.strip()`. Consecutive rows sharing the same subject are
    folded into one dict holding the subject under "href" plus one entry per
    predicate found in `short_names`; other predicates are ignored. The rows
    are expected to arrive grouped by subject.

    Articles without a "title", or titled "About", are dropped. The result is
    sorted by "date", newest first; articles without a date come last.
    """
    template_articles = []
    article = None
    current_uri = None

    for s, p, o in query_results:
        uri = s.strip()
        key = p.strip()
        value = o.strip()

        # A new subject starts a new article; flush the previous one first.
        if uri != current_uri:
            if article is not None:
                template_articles.append(article)
            article = {"href": uri}
            current_uri = uri

        new_key = short_names.get(key)
        if new_key is None:
            continue

        if key in _LIST_VALUED_PREDICATES:
            article.setdefault(new_key, []).append(value)
        else:
            article[new_key] = value

    # Flush the last article; `article` is still None when there were no rows,
    # so empty input (including an exhausted iterator) yields an empty list.
    if article is not None:
        template_articles.append(article)

    # Ad hoc: on VJ14, we were getting a result:
    # {'date': u'2013-12-14T00:00:00',
    #  'href': u'http://video.constantvzw.org/VJ14/videoarchive/wolke/Wolke-JuliaRone.ogv',
    #  'license': u'Free Art Licence'}
    # Still don't know why we get it, but the template balks because there is
    # no title, so we only keep articles that have one.
    # Ad hoc: remove the about page (maybe have some meta info that determines
    # whether a page can be sniffed?)
    template_articles = [
        a for a in template_articles
        if 'title' in a and a['title'] != "About"
    ]

    # Use '' (not 0) for missing dates: Python 3 refuses to compare int with
    # str, and '' still sorts undated articles last once reversed.
    return sorted(template_articles, key=lambda a: a.get('date', ''), reverse=True)
eric's avatar
eric committed
149

150
def snif():
    """
    Scrape every pad ending in '.md' and write the metadata index to disk.

    The domain of the first `Site` object is used as the host. Each pad whose
    `display_slug` ends in '.md' is fetched over HTTP and parsed into one
    rdflib graph; pads answering 403 are skipped, any other HTTP error is
    re-raised. The graph is then queried with `sparql_query` and the resulting
    article list is dumped as JSON to `index.json` in `settings.BACKUP_DIR`.

    Returns a human-readable status string: either a summary with the elapsed
    time, or an error message when no site domain is configured.
    """
    host_name = None
    if Site.objects.count() > 0:
        host_name = Site.objects.all()[0].domain

    if not host_name:
        return "No site domain settings found"

    host = u"http://%s" % host_name

    # perf_counter measures elapsed wall-clock time; time.clock (used here
    # before) was removed in Python 3.8.
    start = perf_counter()

    g = rdflib.Graph()

    total = Pad.objects.count()
    for i, pad in enumerate(Pad.objects.all(), 1):
        url = host + pad.get_absolute_url()
        print("checking pad %s of %s: %s" % (i, total, pad.display_slug))
        print(url)

        # We only want to index the articles. For now we can distinguish
        # them because they have URLs ending in '.md'.
        if not pad.display_slug.endswith('.md'):
            print("no *.md extension, probably not meant for publication")
            continue

        try:
            g.parse(url)
        except error.HTTPError as e:
            if e.code != 403:
                raise
            # Some of the pads will not be public yet and give a
            # '403 FORBIDDEN' response. This is expected, and we don't need
            # to scrape them.
            print("pad not public")
        else:
            print("succesfully parsed")

    d = query_results_to_template_articles(g.query(sparql_query))

    # ensure_ascii=False writes non-ASCII characters as-is, so the file
    # encoding must be explicit rather than left to the process locale.
    index_path = os.path.join(settings.BACKUP_DIR, "index.json")
    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(d, f, indent=2, ensure_ascii=False)

    duration = perf_counter() - start
    # NOTE(review): this reports the total number of pads, not the number of
    # articles that ended up in the index -- confirm which one is intended.
    return "sniffed %s articles in %s seconds" % (total, duration)

eric's avatar
eric committed
202
203
204
205
206

class Command(BaseCommand):
    """Management command that rebuilds the article metadata index."""

    args = ''
    help = 'Create an index of all the articles’ metadata'

    def handle(self, *args, **options):
        """Run `snif()` and return its status string."""
        # Announce the start here rather than in the class body, where the
        # print would run as soon as the module is imported.
        print("Starting to index, this might take some time...")
        return snif()