index.py 8.42 KB
Newer Older
eric's avatar
eric committed
1
2
3
4
5
# -*- coding: utf-8 -*-

# Python imports

import os
6
import re
7
import sys
eric's avatar
eric committed
8
9
import codecs
import json
10
from urllib import error
11
from time import clock
eric's avatar
eric committed
12
13
14
15
16
17
18
19
20
21
22
23

# PyPi imports

import rdflib

# Django imports

from django.core.management.base import BaseCommand, CommandError
from django.template.loader import render_to_string

# Django Apps import

24
#from django.contrib.sites.models import Site
eric's avatar
eric committed
25
from etherpadlite.models import Pad, PadAuthor
26
from ethertoff.settings import BACKUP_DIR
eric's avatar
eric committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

"""
We scrape all the pages, construct a graph, and ask the RDF store to return us all the metadata.

The result will look something like this:

+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| subject                                                              | predicate                                  | object                                            |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/creator           | Élodie Royer et Yoann Gourmel                     |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/title             | The Play/ザ・プレイ                                 |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://purl.org/dc/terms/created           | 2012-06-02T00:00:00                               |
| http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md | http://www.w3.org/1999/xhtml/vocab#license | http://creativecommons.org/licenses/by-nd/3.0/fr/ |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/title             | Utopia                                            |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/creator           | Bernadette Mayer                                  |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/1999/xhtml/vocab#license |                                                   |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://purl.org/dc/terms/created           | 2014-10-02T00:00:00                               |
| http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md              | http://www.w3.org/ns/md#item               | http://www.w3.org/1999/02/22-rdf-syntax-ns#nil    |
+----------------------------------------------------------------------+--------------------------------------------+---------------------------------------------------+

We then use python to construct a list that is easy to use in a template, something like:

[
  {
52
53
54
55
56
57
58
59
    "language": "fr", 
    "author": "Mayer, Bernadette", 
    "title": "Utopia", 
    "href": "http://127.0.0.1:8000/r/B_Bernadette-Mayer_Utopia_FR.md", 
    "authors": [
      "Mayer, Bernadette"
    ], 
    "date": "2014-10-10T00:00:00"
eric's avatar
eric committed
60
61
  }, 
  {
62
63
64
65
66
67
68
69
70
71
    "license": "http://creativecommons.org/licenses/by-nd/3.0/fr/", 
    "language": "fr", 
    "author": "Gourmel, Yoann", 
    "title": "The Play/ザ・プレイ", 
    "href": "http://127.0.0.1:8000/r/06_Elodie_Royer_Yoann_Gourmel_The_Play_FR.md", 
    "authors": [
      "Royer, Élodie", 
      "Gourmel, Yoann"
    ], 
    "date": "2012-06-10T00:00:00"
eric's avatar
eric committed
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  }
]

"""


# SPARQL query that simply fetches every triple in the graph (i.e. all the
# metadata), ordered by subject so that the triples describing one page
# come back next to each other.
sparql_query = (
    "PREFIX dc: <http://purl.org/dc/elements/1.1/>\n"
    "select ?subject ?predicate ?object\n"
    "where { ?subject ?predicate ?object .\n"
    "\n"
    "}\n"
    "order by ?subject \n"
)

# Lookup table from full predicate URIs to the shorter, friendlier key names
# used in the template. Most predicates share the Dublin Core terms prefix,
# so it is spelled out only once.
_DCTERMS = "http://purl.org/dc/terms/"

short_names = {
    "http://www.w3.org/1999/xhtml/vocab#license": "license",
    _DCTERMS + "title": "title",
    _DCTERMS + "creator": "authors",
    _DCTERMS + "created": "date",
    _DCTERMS + "language": "language",
    _DCTERMS + "type": "type",
    _DCTERMS + "identifier": "id",
    "http://commontag.org/ns#Tag": "tags",
    _DCTERMS + "subject": "theme",
    _DCTERMS + "contributor": "participants",
    _DCTERMS + "Location": "place",
}

101
# Domain name of the site whose pads get scraped. snif() prefixes it with
# "http://" and returns early with an error message while it is unset.
# In this file the only code that would set it, the Site lookup right after
# this assignment, is commented out, so it stays None.
HOST = None
102
103
104
#if Site.objects.count() > 0:
    #site = Site.objects.all()[0]
    #HOST = site.domain
105

eric's avatar
eric committed
106
107
108
109
110
111
112
def query_results_to_template_articles(query_results):
    """
    Transform the RDFLIB SPARQL query result into the rows that we want to use for the template.

    ``query_results`` is an iterable of (subject, predicate, object) triples.
    Triples that share a subject must be adjacent (the SPARQL query orders by
    subject): a new article is started every time the subject changes.

    Each article is a dict with an "href" key holding the subject URI, plus one
    key per predicate listed in ``short_names``. Predicates not listed there
    are ignored. Creator, tag, contributor and location values are collected
    in lists; for every other predicate the last value seen wins.

    Returns the list of articles, most recent "date" first. Articles without a
    title, and the one titled "About", are left out; articles without a date
    are sorted last.
    """
    # Predicates that may occur several times per article and therefore
    # produce a list instead of a single string.
    list_valued = (
        "http://purl.org/dc/terms/creator",
        "http://commontag.org/ns#Tag",
        "http://purl.org/dc/terms/contributor",
        "http://purl.org/dc/terms/Location",
    )

    template_articles = []
    article = None
    current_uri = None

    for s, p, o in query_results:
        print(s, p, o)
        # str() replaces the Python 2 only unicode() builtin.
        uri = str(s).strip()
        key = str(p).strip()
        value = str(o).strip()

        if uri != current_uri:
            # The subject changed: store the finished article, start a new one.
            if article is not None:
                template_articles.append(article)
            article = {"href": uri}
            current_uri = uri

        if key not in short_names:
            continue

        new_key = short_names[key]
        if key in list_valued:
            article.setdefault(new_key, []).append(value)
        else:
            article[new_key] = value

    # Store the last article; ``article`` is still None for an empty result.
    if article is not None:
        template_articles.append(article)

    # Ad hoc: on VJ14, we were getting a result:
    # {'date': u'2013-12-14T00:00:00',
    #  'href': u'http://video.constantvzw.org/VJ14/videoarchive/wolke/Wolke-JuliaRone.ogv',
    #  'license': u'Free Art Licence'}
    # Still don’t know why we get it—but then it balks because there is no title
    # So we first check if there is a title
    # Ad hoc: remove the about page (maybe have some meta info. that determine if a page can be sniffed?)
    template_articles = [
        a for a in template_articles
        if 'title' in a and a['title'] != "About"
    ]

    # Dates are ISO formatted strings, so the fallback for undated articles has
    # to be a string as well: Python 3 refuses to compare str with int. The
    # empty string sorts before any date, i.e. last once reversed.
    return sorted(template_articles, key=lambda a: a.get('date', ''), reverse=True)
eric's avatar
eric committed
159

160
def snif():
    """
    Scrape the metadata of all the article pads and write it to ``index.json``.

    Every pad whose display slug ends in ‘.md’ is fetched over HTTP from HOST
    and parsed into one RDF graph. The graph is then queried for all its
    triples, which are turned into a list of articles and saved as JSON in
    BACKUP_DIR.

    Returns a human readable status message: either a summary of the run, or
    an error message when HOST is not set (nothing is scraped in that case).

    Raises urllib.error.HTTPError when a pad answers with an HTTP error other
    than ‘403 FORBIDDEN’.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    from time import perf_counter

    if not HOST:
        return "No site domain settings found"
    host = "http://%s" % HOST

    start = perf_counter()

    g = rdflib.Graph()

    total = Pad.objects.count()
    for i, pad in enumerate(Pad.objects.all(), 1):
        print("checking pad %s of %s: %s" % (i, total, pad.display_slug))
        # We only want to index the articles—
        # For now we can distinguish them because they have url’s
        # ending in ‘.md’
        if not pad.display_slug.endswith('.md'):
            print("no *.md extension, probably not meant for publication")
            continue
        try:
            g.parse(host + pad.get_absolute_url())
        except error.HTTPError as e:
            if e.code == 403:
                # Some of the pads will not be public yet—
                # They gives a ‘403 FORBIDDEN’ response
                # this is expected, and we don’t need to scrape them
                print("pad not public")
                continue
            raise
        print("succesfully parsed")

    d = query_results_to_template_articles(g.query(sparql_query))

    # Let the file object do the encoding: a text mode file does not accept
    # bytes, and the platform's default encoding may not be UTF-8.
    with open(os.path.join(BACKUP_DIR, "index.json"), 'w', encoding='utf-8') as f:
        f.write(json.dumps(d, indent=2, ensure_ascii=False))
        # with open(os.path.join(BACKUP_DIR, "index.html"), 'w') as f:
        #    f.write(render_to_string("home.html", {"articles" : d}).encode('utf-8'))

    duration = perf_counter() - start
    return "sniffed %s articles in %s seconds" % (total, duration)

eric's avatar
eric committed
205
206
207
208
209

class Command(BaseCommand):
    """
    Management command that builds the index of the articles’ metadata.
    """
    args = ''
    help = 'Create an index of all the articles’ metadata'

    def handle(self, *args, **options):
        """
        Run the indexer and return its status message.
        """
        # Announce the start here rather than in the class body, where the
        # message would be printed every time this module gets imported.
        print("Starting to index, this might take some time...")
        return snif()