88c748cecf532cb8873fd4651a29838738d83d6d
[mediagoblin.git] / mediagoblin / storage.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011 Free Software Foundation, Inc
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import os
18 import re
19 import shutil
20 import urlparse
21 import uuid
22 import cloudfiles
23
24 from werkzeug.utils import secure_filename
25
26 from mediagoblin import util
27
28 ########
29 # Errors
30 ########
31
32
class Error(Exception):
    """Base class for all exceptions defined by the storage module."""


class InvalidFilepath(Error):
    """A listy filepath component could not be cleaned into a usable name."""


class NoWebServing(Error):
    """A file URL was requested from a storage that has no base URL set."""


# The second base below is looked up *before* this class statement rebinds
# the name, so it refers to the builtin NotImplementedError.  Deriving from
# both keeps ``except Error`` working while also letting code outside this
# module catch the exception with the builtin ``except NotImplementedError``
# (which the previous, shadowing-only definition silently defeated).
class NotImplementedError(Error, NotImplementedError):
    """A storage backend does not implement the requested feature."""
47
48
49 ###############################################
50 # Storage interface & basic file implementation
51 ###############################################
52
class StorageInterface(object):
    """
    Interface for the storage API.

    This interface doesn't actually provide behavior, but it defines
    what kind of storage patterns subclasses should provide.

    It is important to note that the storage API idea of a "filepath"
    is actually like ['dir1', 'dir2', 'file.jpg'], so keep that in
    mind while reading method documentation.

    You should set up your __init__ method with whatever keyword
    arguments are appropriate to your storage system, but you should
    also passively accept all extraneous keyword arguments like:

      def __init__(self, **kwargs):
          pass

    See BasicFileStorage as a simple implementation of the
    StorageInterface.
    """

    # Whether this file store is on the local filesystem.
    local_storage = False

    def __raise_not_implemented(self):
        """
        Raise an error about some component not implemented by a
        subclass of this interface.

        Raises:
          NotImplementedError, always.
        """
        raise NotImplementedError(
            "This feature not implemented in this storage API implementation.")

    def file_exists(self, filepath):
        """
        Return a boolean asserting whether or not file at filepath
        exists in our storage system.

        Returns:
          True / False depending on whether file exists or not.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def get_file(self, filepath, mode='r'):
        """
        Return a file-like object for reading/writing from this filepath.

        Should create directories, buckets, whatever, as necessary.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def delete_file(self, filepath):
        """
        Delete or dereference the file at filepath.

        This might need to delete directories, buckets, whatever, for
        cleanliness.  (Be sure to avoid race conditions on that though)
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def file_url(self, filepath):
        """
        Get the URL for this file.  This assumes our storage has been
        mounted with some kind of URL which makes this possible.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def get_unique_filepath(self, filepath):
        """
        If a filename at filepath already exists, generate a new name.

        Eg, if the filename doesn't exist:
        >>> storage_handler.get_unique_filepath(['dir1', 'dir2', 'fname.jpg'])
        [u'dir1', u'dir2', u'fname.jpg']

        But if a file does exist, let's get one back with a uuid tacked on:
        >>> storage_handler.get_unique_filepath(['dir1', 'dir2', 'fname.jpg'])
        [u'dir1', u'dir2', u'd02c3571-dd62-4479-9d62-9e3012dada29-fname.jpg']

        Returns:
          The cleaned filepath, with a uuid prefixed to its last
          component if file_exists() reported a collision.
        """
        # Make sure we have a clean filepath to start with, since
        # we'll be possibly tacking on stuff to the filename.
        filepath = clean_listy_filepath(filepath)

        if self.file_exists(filepath):
            return filepath[:-1] + ["%s-%s" % (uuid.uuid4(), filepath[-1])]
        else:
            return filepath

    def get_local_path(self, filepath):
        """
        If this is a local_storage implementation, give us a link to
        the local filesystem reference to this file.

        >>> storage_handler.get_local_path(['foo', 'bar', 'baz.jpg'])
        u'/path/to/mounting/foo/bar/baz.jpg'
        """
        # Subclasses should override this method, if applicable.
        self.__raise_not_implemented()

    def copy_locally(self, filepath, dest_path):
        """
        Copy this file locally.

        A basic working method for this is provided that should
        function both for local_storage systems and remote storage
        systems, but if more efficient systems for copying locally
        apply to your system, override this method with something more
        appropriate.

        Arguments:
         - filepath: listy filepath of the file inside this storage.
         - dest_path: local filesystem path to write the copy to.
        """
        if self.local_storage:
            shutil.copy(
                self.get_local_path(filepath), dest_path)
        else:
            with self.get_file(filepath, 'rb') as source_file:
                # open() rather than the Python-2-only file() constructor.
                with open(dest_path, 'wb') as dest_file:
                    # A single read() on purpose: this only relies on the
                    # backend's object offering a no-argument read(), not
                    # on chunked, position-tracking reads.
                    dest_file.write(source_file.read())
173
174
class BasicFileStorage(StorageInterface):
    """
    Basic local filesystem implementation of storage API
    """

    local_storage = True

    def __init__(self, base_dir, base_url=None, **kwargs):
        """
        Keyword arguments:
         - base_dir: Base directory things will be served out of.  MUST
           be an absolute path.
         - base_url: URL files will be served from

        Any other keyword arguments are accepted and ignored.
        """
        self.base_dir = base_dir
        self.base_url = base_url

    def _resolve_filepath(self, filepath):
        """
        Transform the given filepath into a local filesystem filepath.

        The listy filepath is cleaned first, then joined onto base_dir.
        """
        return os.path.join(
            self.base_dir, *clean_listy_filepath(filepath))

    def file_exists(self, filepath):
        """Return whether the resolved local path exists."""
        return os.path.exists(self._resolve_filepath(filepath))

    def get_file(self, filepath, mode='r'):
        """
        Open the file at filepath in the given mode, creating any
        missing parent directories first.
        """
        # Make directories if necessary
        if len(filepath) > 1:
            directory = self._resolve_filepath(filepath[:-1])
            if not os.path.exists(directory):
                try:
                    os.makedirs(directory)
                except OSError:
                    # Someone else may have created the directory between
                    # the exists() check and makedirs(); that is fine.
                    # Anything else is a real failure, so re-raise it.
                    if not os.path.isdir(directory):
                        raise

        # Grab and return the file in the mode specified
        return open(self._resolve_filepath(filepath), mode)

    def delete_file(self, filepath):
        """Remove the file at filepath from the local filesystem."""
        # TODO: Also delete unused directories if empty (safely, with
        #   checks to avoid race conditions).
        os.remove(self._resolve_filepath(filepath))

    def file_url(self, filepath):
        """
        Return the URL for filepath, joined onto base_url.

        Raises:
          NoWebServing if no base_url was configured.
        """
        if not self.base_url:
            raise NoWebServing(
                "base_url not set, cannot provide file urls")

        return urlparse.urljoin(
            self.base_url,
            '/'.join(clean_listy_filepath(filepath)))

    def get_local_path(self, filepath):
        """Return the local filesystem path for filepath."""
        return self._resolve_filepath(filepath)
228
229
class CloudFilesStorage(StorageInterface):
    """
    Storage API implementation that keeps files as objects in a
    CloudFiles container, accessed through the ``cloudfiles`` library.
    """

    def __init__(self, **kwargs):
        """
        Keyword arguments (all read with kwargs.get, so all optional here):
         - cloudfiles_container: name of the container to use
         - cloudfiles_user: username for the connection
         - cloudfiles_api_key: API key for the connection
         - cloudfiles_host: only checked to print a notice when unset
         - cloudfiles_use_servicenet: 'true' or True enables servicenet
        """
        self.param_container = kwargs.get('cloudfiles_container')
        self.param_user = kwargs.get('cloudfiles_user')
        self.param_api_key = kwargs.get('cloudfiles_api_key')
        self.param_host = kwargs.get('cloudfiles_host')
        self.param_use_servicenet = kwargs.get('cloudfiles_use_servicenet')

        # NOTE(review): param_host is never handed to get_connection()
        # below, so setting it currently only silences this notice.
        if not self.param_host:
            print('No CloudFiles host URL specified, '
                  'defaulting to Rackspace US')

        self.connection = cloudfiles.get_connection(
            username=self.param_user,
            api_key=self.param_api_key,
            servicenet=self.param_use_servicenet in ('true', True))

        # Fetch the container, creating it when it does not exist yet.
        # (The previous code compared the container *name* against a
        # container object, so a missing container raised instead of
        # being created.)
        try:
            self.container = self.connection.get_container(
                self.param_container)
        except cloudfiles.errors.NoSuchContainer:
            self.container = self.connection.create_container(
                self.param_container)

        # Published unconditionally: the previous comparison never held,
        # so existing containers were always (re)published as well.
        self.container.make_public(
            ttl=60 * 60 * 2)

    def _resolve_filepath(self, filepath):
        """Turn a listy filepath into a '/'-joined, cleaned object name."""
        return '/'.join(
            clean_listy_filepath(filepath))

    def file_exists(self, filepath):
        """Return True if an object for filepath exists in the container."""
        try:
            self.container.get_object(
                self._resolve_filepath(filepath))
        except cloudfiles.errors.NoSuchObject:
            return False
        return True

    def get_file(self, filepath, mode='r'):
        """
        Return the container object for filepath, creating it if it
        does not exist.  The mode argument is accepted but not used.
        """
        try:
            obj = self.container.get_object(
                self._resolve_filepath(filepath))
        except cloudfiles.errors.NoSuchObject:
            obj = self.container.create_object(
                self._resolve_filepath(filepath))

        return obj

    def delete_file(self, filepath):
        """Delete the container object for filepath."""
        # TODO: Also delete unused directories if empty (safely, with
        #   checks to avoid race conditions).
        # Resolve to the same object name every other method uses; the
        # raw listy filepath is not a valid object name.
        self.container.delete_object(
            self._resolve_filepath(filepath))

    def file_url(self, filepath):
        """
        Return the public URI of the object for filepath.

        Goes through get_file(), so the object is created if missing.
        """
        return self.get_file(filepath).public_uri()
287
288
class MountStorage(StorageInterface):
    """
    Experimental "Mount" virtual Storage Interface

    This isn't an interface to some real storage, instead
    it's a redirecting interface, that redirects requests
    to other "StorageInterface"s.
    For example, requests for ["store1", "a"] to first
    storage with the path ["a"], etc.

    To set this up, you currently need to call the mount()
    method with the target path and a backend, that shall
    be available under that target path.
    You have to mount things in a sensible order,
    especially you can't mount ["a", "b"] before ["a"].
    """
    def __init__(self, **kwargs):
        # Tree of nested dicts.  A string key leads to the sub-table for
        # that path component; the key None holds the backend mounted at
        # that node.
        self.mounttab = {}

    def mount(self, dirpath, backend):
        """
        Mount a new backend under dirpath

        Arguments:
         - dirpath: listy path to mount at (it is cleaned first).
         - backend: the storage that requests below dirpath go to.

        Raises:
          AssertionError if dirpath, or a longer path below it, is
          already mounted.
        """
        new_ent = clean_listy_filepath(dirpath)

        # Only the deepest existing table node and the path components
        # that still have to be created below it are needed here.
        table, rem_2 = self._resolve_to_backend(new_ent, True)[2:]

        assert (len(rem_2) > 0) or (None not in table), \
            "That path is already mounted"
        assert (len(rem_2) > 0) or (len(table) == 0), \
            "A longer path is already mounted here"

        for part in rem_2:
            table[part] = {}
            table = table[part]
        table[None] = backend

    def _resolve_to_backend(self, filepath, extra_info=False):
        """
        extra_info = True is for internal use!

        Normally, returns the backend and the filepath inside that backend.

        With extra_info = True it returns the last directory node and the
        remaining filepath from there in addition.
        """
        table = self.mounttab
        # Work on a private list copy: components are popped off below.
        filepath = list(filepath)
        res_fp = None
        while True:
            new_be = table.get(None)
            # Remember the deepest backend seen so far; the very first
            # pass always records a result, even when nothing is mounted
            # at the root (the backend is then None).
            if (new_be is not None) or res_fp is None:
                res_be = new_be
                res_fp = filepath[:]
                res_extra = (table, filepath[:])
            if len(filepath) == 0:
                break
            query = filepath.pop(0)
            entry = table.get(query)
            if entry is not None:
                table = entry
                res_extra = (table, filepath[:])
            else:
                # No deeper node; res_extra keeps the remainder that
                # still includes the component we failed to find.
                break
        if extra_info:
            return (res_be, res_fp) + res_extra
        else:
            return (res_be, res_fp)

    def resolve_to_backend(self, filepath):
        """
        Return (backend, filepath inside that backend) for filepath.

        Raises:
          Error if no backend is mounted for the path.
        """
        backend, filepath = self._resolve_to_backend(filepath)
        if backend is None:
            raise Error("Path not mounted")
        return backend, filepath

    def _repr_lines(self, table, indent):
        """Return the repr lines for table and, recursively, its sub-tables."""
        lines = []
        prefix = " " * len(indent)
        backend = table.get(None)
        if backend is not None:
            lines.append(prefix + repr(indent) + ": " + repr(backend))
        for key, subtable in table.items():
            if key is None:
                continue
            lines.append(prefix + repr(key) + ":")
            lines.extend(self._repr_lines(subtable, indent + [key]))
        return lines

    def __repr__(self, table=None, indent=None):
        """
        Return a multi-line description of the mount table.

        The table and indent arguments are kept for backward
        compatibility with the old recursive implementation: called
        with a sub-table, the list of lines for it is returned instead
        of a string.
        """
        if indent is None:
            indent = []
        res = []
        if table is None:
            res.append("MountStorage<")
            table = self.mounttab
        res.extend(self._repr_lines(table, indent))
        if table is self.mounttab:
            res.append(">")
            return "\n".join(res)
        return res

    def file_exists(self, filepath):
        """Delegate file_exists to the backend mounted for filepath."""
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.file_exists(filepath)

    def get_file(self, filepath, mode='r'):
        """Delegate get_file to the backend mounted for filepath."""
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.get_file(filepath, mode)

    def delete_file(self, filepath):
        """Delegate delete_file to the backend mounted for filepath."""
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.delete_file(filepath)

    def file_url(self, filepath):
        """Delegate file_url to the backend mounted for filepath."""
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.file_url(filepath)

    def get_local_path(self, filepath):
        """Delegate get_local_path to the backend mounted for filepath."""
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.get_local_path(filepath)

    def copy_locally(self, filepath, dest_path):
        """
        Need to override copy_locally, because the local_storage
        attribute is not correct.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        backend.copy_locally(filepath, dest_path)
411 backend, filepath = self.resolve_to_backend(filepath)
412 backend.copy_locally(filepath, dest_path)
413
414
415 ###########
416 # Utilities
417 ###########
418
def clean_listy_filepath(listy_filepath):
    """
    Sanitize every component of a listy filepath.

    Each component is passed through secure_filename() and converted
    to unicode:

    >>> clean_listy_filepath([u'/dir1/', u'foo/../nasty', u'linooks.jpg'])
    [u'dir1', u'foo_.._nasty', u'linooks.jpg']

    Args:
    - listy_filepath: a list of filepath components, mediagoblin
      storage API style (like ['dir1', 'dir2', 'filename.jpg']).

    Returns:
      A new list holding the cleaned components as unicode objects.

    Raises:
      InvalidFilepath if any component is empty after cleaning.
    """
    cleaned_filepath = []
    for component in listy_filepath:
        cleaned_filepath.append(unicode(secure_filename(component)))

    # Checked only after everything has been cleaned, so a component
    # that cleans down to nothing is reported once the whole path has
    # been processed.
    if any(component == u'' for component in cleaned_filepath):
        raise InvalidFilepath(
            "A filename component could not be resolved into a usable name.")

    return cleaned_filepath
444
445
def storage_system_from_config(paste_config, storage_prefix):
    """
    Build a storage system out of the paste app config.

    Every config key of the form "${storage_prefix}_FOO" is collected
    and handed to the storage class as keyword argument FOO.  The
    special key "${storage_prefix}_storage_class" is not passed on;
    it names the import path of the storage class to instantiate and
    defaults to "mediagoblin.storage:BasicFileStorage".

    Arguments:
     - paste_config: dictionary of config parameters
     - storage_prefix: the storage system we're setting up / will be
       getting keys/arguments from.  For example 'publicstore' will
       grab all arguments that are like 'publicstore_FOO'.

    Returns:
      An instantiated storage system.

    Example:
      storage_system_from_config(
        {'publicstore_base_url': '/media/',
         'publicstore_base_dir': '/var/whatever/media/'},
        'publicstore')

    Will return:
      BasicFileStorage(
        base_url='/media/',
        base_dir='/var/whatever/media/')
    """
    prefix_re = re.compile('^%s_(.+)$' % re.escape(storage_prefix))

    # Strip the prefix off every matching key; other keys are ignored.
    config_params = {}
    for key, value in paste_config.iteritems():
        matched = prefix_re.match(key)
        if matched:
            config_params[matched.group(1)] = value

    # The class path is configuration for us, not for the storage class.
    class_path = config_params.pop(
        'storage_class', "mediagoblin.storage:BasicFileStorage")

    storage_class = util.import_component(class_path)
    return storage_class(**config_params)
488 storage_class = util.import_component(storage_class)
489 return storage_class(**config_params)