index.py 8.33 KB
Newer Older
eric's avatar
eric committed
1
2
3
4
5
6
# -*- coding: utf-8 -*-

# Python imports

import os
import json
7
from urllib import error
alexandre's avatar
alexandre committed
8
9
10
11
12
try:
    from time import clock
except ImportError:
    # Python >= 3.8
    from time import perf_counter as clock
eric's avatar
eric committed
13
14
15
16
17
18
19

# PyPi imports

import rdflib

# Django imports

20
from django.core.management.base import BaseCommand
eric's avatar
eric committed
21
22
23

# Django Apps import

gijs's avatar
gijs committed
24
from django.contrib.sites.models import Site
25
from etherpadlite.models import Pad
alexandre's avatar
alexandre committed
26
from django.conf import settings
eric's avatar
eric committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

"""
We scrape all the pages, construct a graph, and ask the RDF store to return us all the metadata.

The result will look something like this:

+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| subject                                                              | predicate                                  | object                                            |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/creator           | Élodie Royer et Yoann Gourmel                     |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/title             | The Play/ザ・プレイ                                 |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/created           | 2012-06-02T00:00:00                               |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/1999/xhtml/vocab#license | http://creativecommons.org/licenses/by-nd/3.0/fr/ |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/title             | Utopia                                            |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/creator           | Bernadette Mayer                                  |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/1999/xhtml/vocab#license |                                                   |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/created           | 2014-10-02T00:00:00                               |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+

We then use python to construct a list that is easy to use in a template, something like:

[
  {
52
53
54
55
56
57
58
59
    "language": "fr", 
    "author": "Mayer, Bernadette", 
    "title": "Utopia", 
    "href": "http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md", 
    "authors": [
      "Mayer, Bernadette"
    ], 
    "date": "2014-10-10T00:00:00"
eric's avatar
eric committed
60
61
  }, 
  {
62
63
64
65
66
67
68
69
70
71
    "license": "http://creativecommons.org/licenses/by-nd/3.0/fr/", 
    "language": "fr", 
    "author": "Gourmel, Yoann", 
    "title": "The Play/ザ・プレイ", 
    "href": "http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md", 
    "authors": [
      "Royer, Élodie", 
      "Gourmel, Yoann"
    ], 
    "date": "2012-06-10T00:00:00"
eric's avatar
eric committed
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  }
]

"""


# SPARQL query that selects every (subject, predicate, object) triple in the
# graph, ordered by subject so that all the rows describing one page arrive
# consecutively (query_results_to_template_articles relies on that grouping).
# NOTE(review): the `dc` prefix is declared but not used in the query body.
sparql_query = """PREFIX dc: <http://purl.org/dc/elements/1.1/>
select ?subject ?predicate ?object
where { ?subject ?predicate ?object .\n
}
order by ?subject 
"""

# Map between predicate URIs and the shorter names used in the template
short_names = {
    "http://www.w3.org/1999/xhtml/vocab#license": "license",
    "http://purl.org/dc/terms/title": "title",
    "http://purl.org/dc/terms/creator": "authors",
    "http://purl.org/dc/terms/created": "date",
    "http://purl.org/dc/terms/language": "language",
    "http://purl.org/dc/terms/type": "type",
    "http://purl.org/dc/terms/identifier": "id",
    "http://commontag.org/ns#Tag": "tags",
    "http://purl.org/dc/terms/subject": "theme",
    "http://purl.org/dc/terms/contributor": "participants",
    "http://purl.org/dc/terms/Location": "place",
}

# Predicates that can occur several times for one article: their values are
# collected in a list instead of overwriting each other.
_list_predicates = frozenset([
    "http://purl.org/dc/terms/creator",
    "http://commontag.org/ns#Tag",
    "http://purl.org/dc/terms/contributor",
    "http://purl.org/dc/terms/Location",
])


def query_results_to_template_articles(query_results):
    """
    Transform the RDFLIB SPARQL query result into the rows that we want to use for the template

    ``query_results`` is any iterable of ``(subject, predicate, object)``
    triples whose members have a ``strip()`` method. Rows belonging to the
    same subject must be consecutive (the SPARQL query orders by subject).

    Returns a list of dicts, one per article, each holding ``href`` plus an
    entry for every predicate found in ``short_names``. Articles without a
    title, and the one titled "About", are left out. The list is sorted by
    ``date``, newest first; articles without a date come last.
    """
    template_articles = []
    article = None
    current_uri = None

    for s, p, o in query_results:
        uri = s.strip()
        key = p.strip()
        value = o.strip()

        # A new subject starts a new article; store the one we just finished
        if uri != current_uri:
            if article is not None:
                template_articles.append(article)
            article = {
                "href": uri
            }
            current_uri = uri

        new_key = short_names.get(key)
        if new_key is None:
            # Predicate we have no use for in the template
            continue

        # Some terms give us lists:
        if key in _list_predicates:
            article.setdefault(new_key, []).append(value)
        # Otherwise it is strings
        else:
            article[new_key] = value

    # Store the last article (there is none when the result was empty)
    if article is not None:
        template_articles.append(article)

    # Ad hoc: on VJ14, we were getting a result:
    # {'date': u'2013-12-14T00:00:00',
    #  'href': u'http://video.constantvzw.org/VJ14/videoarchive/wolke/Wolke-JuliaRone.ogv',
    #  'license': u'Free Art Licence'}
    # Still don’t know why we get it—but then it balks because there is no title
    # So we first check if there is a title
    # Ad hoc: remove the about page (maybe have some meta info. that determine if a page can be sniffed?)
    template_articles = [a for a in template_articles if 'title' in a and a['title'] != "About"]

    # Dates are strings: fall back to an empty string (not 0) for undated
    # articles, because str and int cannot be compared on Python 3. The empty
    # string sorts before any date, so undated articles end up last.
    return sorted(template_articles, key=lambda a: a.get('date', ''), reverse=True)
eric's avatar
eric committed
153

154
def snif():
    """
    Scrape the metadata of all the published pads and write it to index.json

    Every pad whose slug ends in ‘.md’ is fetched over HTTP from the domain of
    the first configured Site and parsed into one RDF graph. The graph is then
    queried with ``sparql_query`` and the resulting list of articles is dumped
    as JSON into ``settings.BACKUP_DIR``.

    Returns a human readable status message. Any HTTP error other than a
    ‘403 FORBIDDEN’ is re-raised.
    """
    site = Site.objects.first()
    if site is None or not site.domain:
        return "No site domain settings found"

    host = u"http://%s" % site.domain

    start = clock()

    g = rdflib.Graph()

    total = Pad.objects.count()
    for i, pad in enumerate(Pad.objects.all(), 1):
        url = host + pad.get_absolute_url()
        print("checking pad %s of %s: %s" % (i, total, pad.display_slug))
        print(url)
        # We only want to index the articles—
        # For now we can distinguish them because they have url’s
        # ending in ‘.md’
        if not pad.display_slug.endswith('.md'):
            print("no *.md extension, probably not meant for publication")
            continue
        try:
            g.parse(url)
        except error.HTTPError as e:
            if e.code == 403:
                # Some of the pads will not be public yet—
                # They gives a ‘403 FORBIDDEN’ response
                # this is expected, and we don’t need to scrape them
                print("pad not public")
                continue
            raise
        else:
            print("succesfully parsed")

    d = query_results_to_template_articles(g.query(sparql_query))

    # The metadata contains non-ASCII text and is dumped with
    # ensure_ascii=False, so the encoding must not depend on the locale
    with open(os.path.join(settings.BACKUP_DIR, "index.json"), 'w', encoding='utf-8') as f:
        json.dump(d, f, indent=2, ensure_ascii=False)

    duration = clock() - start
    # Report the articles that ended up in the index, not the number of pads
    return "sniffed %s articles in %s seconds" % (len(d), duration)

eric's avatar
eric committed
206
207
208
209
210

class Command(BaseCommand):
    """Management command that builds the index of the articles’ metadata."""
    args = ''
    help = 'Create an index of all the articles’ metadata'

    def handle(self, *args, **options):
        """
        Announce the start of the indexing, run it, and return the status
        message given by ``snif()``.
        """
        # Emitted here instead of in the class body, where it would run as
        # soon as this module is imported rather than when the command runs
        self.stdout.write("Starting to index, this might take some time...")
        return snif()