parse.py 5.21 KB
Newer Older
gijs's avatar
gijs committed
1
2
3
4
import markdown
import os.path
import urllib

gijs's avatar
gijs committed
5
from .models import modelFor, collectionFor, UnknownContentTypeError, knownContentTypes, resolveReferences, knownContentType
6
from .utils import info, debug, warn, keyFilter
gijs's avatar
gijs committed
7
8
9
10

from markdown.extensions.toc import TocExtension
from py_etherpad import EtherpadLiteClient

11
from django.core.management.base import BaseCommand
gijs's avatar
gijs committed
12
13
14
from django.utils.safestring import mark_safe
from etherpadlite.models import Pad

15
16
from .settings import DEFAULT_CONTENT_TYPE

alexandre's avatar
alexandre committed
17
from django.conf import settings
gijs's avatar
gijs committed
18

19
20
from generator.extract_meta import extract_meta

21
22
"""
  
gijs's avatar
gijs committed
23
  Loop through all the pads and 'parse' them as markdown.
24
25
26
27
28
  This should return both the content and a dictionary for the metadata

  From this information a model is constructed. The metadata is further
  parsed depending on the field type.

29
30
  Links will try to look up their targets. If the target pad isn't parsed yet
  a stub is created to be filled later in the process. 
31
32
33
34

  TODO: decouple metadata parsing and linking, to make sure all data is seen
  before linking is performed.

35
36
37
38
  If both keys and labels are used to address models, then depending on the
  order in which they are encountered we might create one instance for the
  label and another for the key, especially when the label / title is later
  changed.

39
40
"""

gijs's avatar
gijs committed
41
42
def _etherpad_pad_id (pad):
  """
    Return the id used to request the text of `pad` from the Etherpad API.

    Public pads use their `publicpadid`. Other pads use
    `<groupID>$<name>`, where every PAD_NAMESPACE_SEPARATOR in the name is
    replaced by an underscore and the result is url-quoted.
  """
  # `import urllib` on its own does not guarantee that the `urllib.parse`
  # submodule is loaded, so import what we need explicitly.
  from urllib.parse import quote

  if pad.is_public:
    return pad.publicpadid

  quotedName = quote(pad.name.replace(settings.PAD_NAMESPACE_SEPARATOR, '_'))
  return pad.group.groupID + '$' + quotedName


def _classify_pad (meta, pad):
  """
    Derive the content type, key and label for a pad from its metadata.

    If the first metadata entry names a known content type, that entry
    decides: its name is the content type, its filtered value the key and
    its raw value the label. Otherwise the `type` entry is used (falling
    back on DEFAULT_CONTENT_TYPE), the key is extracted by the model class
    and there is no label.

    Note: an outdated `biography` type is rewritten to `produser` in `meta`
    itself.

    Returns a tuple (contentType, key, label).
  """
  # The caller adds `pk` before calling, so there is always a first entry.
  firstMetaKey, firstMetaValue = next(iter(meta.items()))

  if knownContentType(firstMetaKey):
    contentType = firstMetaKey

    if 'type' in meta:
      warn('Both valid contenttype present in the first row ({0}) as well as a type declaration ({1}), using {0}'.format(contentType, meta['type'][0]), pad.display_slug)

    return contentType, keyFilter(firstMetaValue), firstMetaValue

  if 'type' in meta:
    if meta['type'] == ['biography']:
      warn("Outdated contenttype biography. for pad: {}".format(pad.display_slug))
      meta['type'] = ['produser']
    contentType = meta['type'][0]
  else:
    debug("No contenttype found, applied default contenttype for pad: {}".format(pad.display_slug))
    contentType = DEFAULT_CONTENT_TYPE

  return contentType, modelFor(contentType).extractKey(meta), None


def _render_content (model):
  """
    Replace the content of `model` with its rendered, safe-marked HTML.

    Inline references are resolved first, then the result is converted
    from markdown. Models without content are left untouched.
  """
  if not model.content:
    return

  # The second return value holds the collected references, unused here.
  content, _ = resolveReferences(model)

  # A fresh Markdown instance per model, no state is shared between pads.
  md = markdown.Markdown(extensions=['extra', TocExtension(baselevel=2), 'attr_list'])
  model.content = mark_safe(md.convert(content))


def parse_pads ():
  """
    Read every pad, build a model for each markdown pad and render it.

    The first pass fetches the text of each pad and, for pads with a
    `.md` or `.markdown` extension, extracts metadata and content and
    instantiates a model in the collection of its content type. Pads
    that can not be fetched are skipped with a warning, pads with an unknown
    content type are skipped with a debug message.

    The second pass, run once all models exist, resolves the links of each
    model and renders its content.

    Returns the list of instantiated models.
  """
  epclient = None
  models = []

  for pad in Pad.objects.all():
    if not epclient:
      # The client is built from the server of the first pad and reused
      # for all following pads.
      # NOTE(review): pads living on another server would be requested
      # through this same client. Confirm all pads share one server.
      epclient = EtherpadLiteClient(pad.server.apikey, pad.server.apiurl)

    name, extension = os.path.splitext(pad.display_slug)
    padID = _etherpad_pad_id(pad)

    info('Reading {}'.format(pad.display_slug))

    try:
      source = epclient.getText(padID)['text']
    except ValueError:
      # A ValueError from the client is treated as a missing pad.
      warn('Could not find pad {}'.format(pad.display_slug))
      continue

    if extension in ['.md', '.markdown']:
      meta, content = extract_meta(source)

      try:
        meta['pk'] = pad.pk
        contentType, key, label = _classify_pad(meta, pad)
        collection = collectionFor(contentType)

        debug('Extracted key: {}'.format(key))
        model = collection.instantiate(key=key, label=label, metadata=meta, content=content, source_path=pad.display_slug)
        models.append(model)
      except UnknownContentTypeError as e:
        debug('Skipped `{}`'.format(name))
        debug(e)

    info('Read {}'.format(pad.display_slug))

  # Executing links. Done in a second pass so every model exists before
  # links are resolved.
  for model in models:
    model.resolveLinks()
    _render_content(model)

  return models
137

gijs's avatar
gijs committed
138
139
140
141
142
143
144
145
146
147
class Command(BaseCommand):
  """
    Command that parses all pads and logs what was found for the produsers.
  """
  args = ''
  help = 'Generate a static interpretation of the pads'


  def handle(self, *args, **options):
    """
      Parse the pads, then log every attribute of every produser model.
    """
    parse_pads()

    # Walk all (model, attribute name) pairs lazily, model by model.
    attributes = (
      (entry, attributeName)
      for entry in collectionFor('produser').models
      for attributeName in dir(entry)
    )

    for entry, attributeName in attributes:
      info(getattr(entry, attributeName))

    # Debugging aids:
    # print(collectionFor('produser').models)
    # print(collectionFor('event').models, collectionFor('event').models[0].metadata, collectionFor('event').models[0].metadata['produser'].metadata)