Updates version number in docs
[mediagoblin.git] / mediagoblin / storage.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import os
18 import shutil
19 import urlparse
20 import uuid
21 import cloudfiles
22 import mimetypes
23 import tempfile
24
25 from werkzeug.utils import secure_filename
26
27 from mediagoblin import util
28
29 ########
30 # Errors
31 ########
32
33
class Error(Exception):
    """Base class for the storage exceptions defined in this module."""
36
37
class InvalidFilepath(Error):
    """
    Raised when a component of a listy filepath cleans down to an
    empty, unusable name.
    """
40
41
class NoWebServing(Error):
    """
    Raised when a URL is requested from a storage system that has no
    base URL configured to serve files from.
    """
44
45
class NotImplementedError(Error):
    """
    Raised when a storage backend does not implement part of the
    storage API.

    Note that inside this module the name shadows Python's builtin
    NotImplementedError; this class derives from Error instead.
    """
48
49
50 ###############################################
51 # Storage interface & basic file implementation
52 ###############################################
53
class StorageInterface(object):
    """
    Interface for the storage API.

    This interface doesn't actually provide behavior, but it defines
    what kind of storage patterns subclasses should provide.

    It is important to note that the storage API idea of a "filepath"
    is actually like ['dir1', 'dir2', 'file.jpg'], so keep that in
    mind while reading method documentation.

    You should set up your __init__ method with whatever keyword
    arguments are appropriate to your storage system, but you should
    also passively accept all extraneous keyword arguments like:

      def __init__(self, **kwargs):
          pass

    See BasicFileStorage as a simple implementation of the
    StorageInterface.
    """

    # Whether this file store is on the local filesystem.
    local_storage = False

    def __raise_not_implemented(self):
        """
        Raise an error about some component not implemented by a
        subclass of this interface.
        """
        raise NotImplementedError(
            "This feature not implemented in this storage API implementation.")

    def file_exists(self, filepath):
        """
        Return a boolean asserting whether or not file at filepath
        exists in our storage system.

        Returns:
         True / False depending on whether file exists or not.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def get_file(self, filepath, mode='r'):
        """
        Return a file-like object for reading/writing from this filepath.

        Should create directories, buckets, whatever, as necessary.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def delete_file(self, filepath):
        """
        Delete or dereference the file at filepath.

        This might need to delete directories, buckets, whatever, for
        cleanliness.  (Be sure to avoid race conditions on that though)
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def file_url(self, filepath):
        """
        Get the URL for this file.  This assumes our storage has been
        mounted with some kind of URL which makes this possible.
        """
        # Subclasses should override this method.
        self.__raise_not_implemented()

    def get_unique_filepath(self, filepath):
        """
        If a filename at filepath already exists, generate a new name.

        Eg, if the filename doesn't exist:
        >>> storage_handler.get_unique_filepath(['dir1', 'dir2', 'fname.jpg'])
        [u'dir1', u'dir2', u'fname.jpg']

        But if a file does exist, let's get one back with a uuid tacked on:
        >>> storage_handler.get_unique_filepath(['dir1', 'dir2', 'fname.jpg'])
        [u'dir1', u'dir2', u'd02c3571-dd62-4479-9d62-9e3012dada29-fname.jpg']
        """
        # Make sure we have a clean filepath to start with, since
        # we'll be possibly tacking on stuff to the filename.
        filepath = clean_listy_filepath(filepath)

        if self.file_exists(filepath):
            # Only the final component (the filename) gets the prefix.
            return filepath[:-1] + ["%s-%s" % (uuid.uuid4(), filepath[-1])]
        else:
            return filepath

    def get_local_path(self, filepath):
        """
        If this is a local_storage implementation, give us a link to
        the local filesystem reference to this file.

        >>> storage_handler.get_local_path(['foo', 'bar', 'baz.jpg'])
        u'/path/to/mounting/foo/bar/baz.jpg'
        """
        # Subclasses should override this method, if applicable.
        self.__raise_not_implemented()

    def copy_locally(self, filepath, dest_path):
        """
        Copy this file locally.

        A basic working method for this is provided that should
        function both for local_storage systems and remote storage
        systems, but if more efficient systems for copying locally
        apply to your system, override this method with something more
        appropriate.

        Arguments:
         - filepath: listy filepath of the file inside this storage.
         - dest_path: local filesystem path to write the copy to.
        """
        if self.local_storage:
            shutil.copy(
                self.get_local_path(filepath), dest_path)
        else:
            # The whole source is read in one go; get_file() only
            # promises a file-like object, so nothing fancier than a
            # plain read() is assumed of it here.
            with self.get_file(filepath, 'rb') as source_file:
                # open() rather than the deprecated file() builtin,
                # matching how the rest of this module opens files.
                with open(dest_path, 'wb') as dest_file:
                    dest_file.write(source_file.read())
174
175
class BasicFileStorage(StorageInterface):
    """
    Basic local filesystem implementation of storage API
    """

    local_storage = True

    def __init__(self, base_dir, base_url=None, **kwargs):
        """
        Keyword arguments:
         - base_dir: Base directory things will be served out of.  MUST
           be an absolute path.
         - base_url: URL files will be served from

        Any other keyword arguments are accepted and ignored.
        """
        self.base_dir = base_dir
        self.base_url = base_url

    def _resolve_filepath(self, filepath):
        """
        Transform the given filepath into a local filesystem filepath.
        """
        return os.path.join(
            self.base_dir, *clean_listy_filepath(filepath))

    def file_exists(self, filepath):
        """
        Return whether a file exists at filepath under base_dir.
        """
        return os.path.exists(self._resolve_filepath(filepath))

    def get_file(self, filepath, mode='r'):
        """
        Open the file at filepath in the given mode and return it.

        The parent directories are created first when filepath has
        more than one component.
        """
        # Imported here so this method does not depend on a change to
        # the module-level import block.
        import errno

        # Make directories if necessary.  Attempt the creation and
        # tolerate "already exists" rather than checking beforehand:
        # the check-then-create form can fail when two callers race to
        # create the same directory.
        if len(filepath) > 1:
            directory = self._resolve_filepath(filepath[:-1])
            try:
                os.makedirs(directory)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise

        # Grab and return the file in the mode specified
        return open(self._resolve_filepath(filepath), mode)

    def delete_file(self, filepath):
        """
        Remove the file at filepath from the local filesystem.
        """
        # TODO: Also delete unused directories if empty (safely, with
        # checks to avoid race conditions).
        os.remove(self._resolve_filepath(filepath))

    def file_url(self, filepath):
        """
        Return the URL for filepath, built on top of base_url.

        Raises NoWebServing if no base_url was configured.
        """
        if not self.base_url:
            raise NoWebServing(
                "base_url not set, cannot provide file urls")

        # urljoin resolves the path relative to base_url, so a
        # base_url without a trailing slash has its last segment
        # replaced instead of extended.
        return urlparse.urljoin(
            self.base_url,
            '/'.join(clean_listy_filepath(filepath)))

    def get_local_path(self, filepath):
        """
        Return the local filesystem path of filepath.
        """
        return self._resolve_filepath(filepath)
229
230
231 # ----------------------------------------------------
232 # OpenStack/Rackspace Cloud's Swift/CloudFiles support
233 # ----------------------------------------------------
234
class CloudFilesStorage(StorageInterface):
    """
    Storage API implementation that keeps files as objects in a
    CloudFiles container, through the python-cloudfiles library.
    """
    def __init__(self, **kwargs):
        """
        Keyword arguments:
         - cloudfiles_container: name of the container to store into
         - cloudfiles_user: username for the connection
         - cloudfiles_api_key: API key for the connection
         - cloudfiles_host: only checked for presence here (a notice
           is printed when it is missing); it is not passed on to the
           connection.
         - cloudfiles_use_servicenet: ServiceNet is used when this is
           True or the string 'true'.
        """
        self.param_container = kwargs.get('cloudfiles_container')
        self.param_user = kwargs.get('cloudfiles_user')
        self.param_api_key = kwargs.get('cloudfiles_api_key')
        self.param_host = kwargs.get('cloudfiles_host')
        self.param_use_servicenet = kwargs.get('cloudfiles_use_servicenet')

        if not self.param_host:
            print('No CloudFiles host URL specified, '
                  'defaulting to Rackspace US')

        self.connection = cloudfiles.get_connection(
            username=self.param_user,
            api_key=self.param_api_key,
            # Config files hand this over as the string 'true'.
            servicenet=self.param_use_servicenet in ('true', True))

        # The previous code compared the container *name* against the
        # Container object returned by get_container().  That was never
        # equal, so the create-and-publish branch always ran -- but only
        # after a get_container() call that fails outright when the
        # container is missing.  Keep the create-and-publish behaviour
        # and drop the lookup.
        # NOTE(review): relies on create_container() returning the
        # existing container when one is already there -- confirm
        # against the python-cloudfiles version in use.
        self.container = self.connection.create_container(
            self.param_container)
        self.container.make_public(
            ttl=60 * 60 * 2)

        self.container_uri = self.container.public_uri()

    def _resolve_filepath(self, filepath):
        """
        Transform the listy filepath into a '/'-joined object name.
        """
        return '/'.join(
            clean_listy_filepath(filepath))

    def file_exists(self, filepath):
        """
        Return whether an object for filepath exists in the container.
        """
        try:
            self.container.get_object(
                self._resolve_filepath(filepath))
        except cloudfiles.errors.NoSuchObject:
            return False
        return True

    def get_file(self, filepath, *args, **kwargs):
        """
        Return a file-like wrapper around the object at filepath,
        creating the object if it does not exist yet.

        - Doesn't care about the "mode" argument
        """
        try:
            obj = self.container.get_object(
                self._resolve_filepath(filepath))
        except cloudfiles.errors.NoSuchObject:
            obj = self.container.create_object(
                self._resolve_filepath(filepath))

        # guess_type() returns a (type, encoding) tuple, which is
        # always truthy; test the type itself so an unknown extension
        # does not set content_type to None.
        mimetype, _encoding = mimetypes.guess_type(filepath[-1])

        if mimetype:
            obj.content_type = mimetype

        return StorageObjectWrapper(obj, *args, **kwargs)

    def delete_file(self, filepath):
        """
        Delete the object at filepath from the container.
        """
        # TODO: Also delete unused directories if empty (safely, with
        # checks to avoid race conditions).
        self.container.delete_object(
            self._resolve_filepath(filepath))

    def file_url(self, filepath):
        """
        Return the URL of filepath below the container's public URI.
        """
        return '/'.join([
            self.container_uri,
            self._resolve_filepath(filepath)])
306
307
class StorageObjectWrapper(object):
    """
    Wrapper for python-cloudfiles's cloudfiles.storage_object.Object
    used to circumvent the mystic `medium.jpg` corruption issue, where
    we had both python-cloudfiles and PIL doing buffering on both
    ends and that breaking things.

    This wrapper currently meets mediagoblin's needs for a public_store
    file-like object.  It can be used in a `with` statement, which
    StorageInterface.copy_locally does with whatever get_file()
    returns.
    """
    def __init__(self, storage_object, *args, **kwargs):
        """
        Wrap storage_object; extra arguments (such as a file mode) are
        accepted and ignored.
        """
        self.storage_object = storage_object

    def read(self, *args, **kwargs):
        """
        Read from the wrapped object, passing all arguments through.
        """
        return self.storage_object.read(*args, **kwargs)

    def write(self, data, *args, **kwargs):
        """
        Write data to the wrapped object.

        When the object already has content and data is a str, the
        existing content is read back and data is appended to it, so
        that successive write() calls accumulate.
        """
        if self.storage_object.size and isinstance(data, str):
            data = self.read() + data

        self.storage_object.write(data, *args, **kwargs)

    def close(self):
        """
        Nothing to release; present so the wrapper can be closed like
        a file.
        """
        pass

    def __enter__(self):
        """
        Enter a `with` block, yielding the wrapper itself.
        """
        return self

    def __exit__(self, *exc_info):
        """
        Leave a `with` block.  Exceptions are not suppressed.
        """
        self.close()
329
330
331 # ------------
332 # MountStorage
333 # ------------
334
class MountStorage(StorageInterface):
    """
    Experimental "Mount" virtual Storage Interface

    This isn't an interface to some real storage, instead it's a
    redirecting interface, that redirects requests to other
    "StorageInterface"s.

    For example, say you have the paths:

     1. ['user_data', 'cwebber', 'avatar.jpg']
     2. ['user_data', 'elrond', 'avatar.jpg']
     3. ['media_entries', '34352f304c3f4d0ad8ad0f043522b6f2', 'thumb.jpg']

    You could mount media_entries under CloudFileStorage and user_data
    under BasicFileStorage.  Then 1 would be passed to
    BasicFileStorage under the path ['cwebber', 'avatar.jpg'] and 3
    would be passed to CloudFileStorage under
    ['34352f304c3f4d0ad8ad0f043522b6f2', 'thumb.jpg'].

    In other words, this is kind of like mounting /home/ and /etc/
    under different filesystems on your operating system... but with
    mediagoblin filestorages :)

    To set this up, you currently need to call the mount() method with
    the target path and a backend, that shall be available under that
    target path.  You have to mount things in a sensible order,
    especially you can't mount ["a", "b"] before ["a"].
    """
    def __init__(self, **kwargs):
        # Tree of nested dicts keyed by path component.  Inside a node
        # the key None holds the backend mounted at exactly that node.
        self.mounttab = {}

    def mount(self, dirpath, backend):
        """
        Mount a new backend under dirpath

        Raises AssertionError if dirpath is already mounted, or if a
        longer path below dirpath has been mounted before it.
        """
        # Imported here so this method does not depend on a change to
        # the module-level import block.
        import logging
        log = logging.getLogger(__name__)

        new_ent = clean_listy_filepath(dirpath)

        log.debug("Mounting: %r", new_ent)
        already, rem_1, table, rem_2 = self._resolve_to_backend(new_ent, True)
        log.debug("=== %r %r %r %d", already, rem_1, rem_2, len(table))

        # An empty rem_2 means the node for dirpath already exists in
        # the tree.  These are raised explicitly (keeping the
        # AssertionError type) so the checks survive running under -O.
        if not rem_2:
            if None in table:
                raise AssertionError("That path is already mounted")
            if table:
                raise AssertionError("A longer path is already mounted here")

        # Create the missing nodes, then hang the backend on the last.
        for part in rem_2:
            table[part] = {}
            table = table[part]
        table[None] = backend

    def _resolve_to_backend(self, filepath, extra_info=False):
        """
        extra_info = True is for internal use!

        Normally, returns the backend and the filepath inside that backend.

        With extra_info = True it returns the last directory node and the
        remaining filepath from there in addition.
        """
        table = self.mounttab
        # Work on a copy; components are popped off as we descend.
        filepath = filepath[:]
        res_fp = None
        while True:
            new_be = table.get(None)
            # Remember the deepest backend seen so far.  The first pass
            # always records a result, even if nothing is mounted.
            if (new_be is not None) or res_fp is None:
                res_be = new_be
                res_fp = filepath[:]
                res_extra = (table, filepath[:])
            if not filepath:
                break
            query = filepath.pop(0)
            entry = table.get(query)
            if entry is None:
                break
            table = entry
            res_extra = (table, filepath[:])
        if extra_info:
            return (res_be, res_fp) + res_extra
        else:
            return (res_be, res_fp)

    def resolve_to_backend(self, filepath):
        """
        Return the backend responsible for filepath and the filepath
        inside that backend.

        Raises Error if no backend is mounted for filepath.
        """
        backend, filepath = self._resolve_to_backend(filepath)
        if backend is None:
            raise Error("Path not mounted")
        return backend, filepath

    def __repr__(self, table=None, indent=None):
        """
        Return a multi-line dump of the mount table.

        table and indent are used by the recursion only; for a
        sub-table the list of lines is returned instead of a string.
        """
        if indent is None:
            indent = []
        res = []
        if table is None:
            res.append("MountStorage<")
            table = self.mounttab
        v = table.get(None)
        if v:
            res.append(" " * len(indent) + repr(indent) + ": " + repr(v))
        for k, v in table.items():
            if k is None:
                # The backend of this node was handled above.
                continue
            res.append(" " * len(indent) + repr(k) + ":")
            res += self.__repr__(v, indent + [k])
        if table is self.mounttab:
            res.append(">")
            return "\n".join(res)
        else:
            return res

    def file_exists(self, filepath):
        """
        Ask the backend mounted for filepath whether the file exists.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.file_exists(filepath)

    def get_file(self, filepath, mode='r'):
        """
        Get the file from the backend mounted for filepath.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.get_file(filepath, mode)

    def delete_file(self, filepath):
        """
        Delete the file through the backend mounted for filepath.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.delete_file(filepath)

    def file_url(self, filepath):
        """
        Get the file's URL from the backend mounted for filepath.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.file_url(filepath)

    def get_local_path(self, filepath):
        """
        Get the local path from the backend mounted for filepath.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        return backend.get_local_path(filepath)

    def copy_locally(self, filepath, dest_path):
        """
        Need to override copy_locally, because the local_storage
        attribute is not correct.
        """
        backend, filepath = self.resolve_to_backend(filepath)
        backend.copy_locally(filepath, dest_path)
472
473
474 ###########
475 # Utilities
476 ###########
477
def clean_listy_filepath(listy_filepath):
    """
    Sanitize every component of a listy filepath (something like
    ['dir1', 'dir2', 'filename.jpg']).

    >>> clean_listy_filepath([u'/dir1/', u'foo/../nasty', u'linooks.jpg'])
    [u'dir1', u'foo_.._nasty', u'linooks.jpg']

    Args:
    - listy_filepath: a list of filepath components, mediagoblin
      storage API style.

    Returns:
      A new list holding the cleaned components as unicode objects.

    Raises:
      InvalidFilepath if any component is empty after cleaning.
    """
    sanitized = []
    for component in listy_filepath:
        sanitized.append(unicode(secure_filename(component)))

    # Every component is cleaned before any is rejected.
    for part in sanitized:
        if part == u'':
            raise InvalidFilepath(
                "A filename component could not be resolved into a usable name.")

    return sanitized
503
504
def storage_system_from_config(config_section):
    """
    Build a storage system out of a config section.

    The section may carry the special key "storage_class", an import
    path to the storage class to use.  Without it,
    "mediagoblin.storage:BasicFileStorage" is used.  Every other item
    of the section is handed to the class as a keyword argument.

    Arguments:
     - config_section: dictionary of config parameters

    Returns:
      An instantiated storage system.

    Example:
      storage_system_from_config(
        {'base_url': '/media/',
         'base_dir': '/var/whatever/media/'})

       Will return:
         BasicFileStorage(
           base_url='/media/',
           base_dir='/var/whatever/media/')
    """
    # Walk the items explicitly: dict(config_section) does not
    # replace the variables in the config items.
    params = {}
    for key, value in config_section.iteritems():
        params[key] = value

    class_path = params.pop(
        'storage_class', "mediagoblin.storage:BasicFileStorage")

    storage_cls = util.import_component(class_path)
    return storage_cls(**params)
540 storage_class = util.import_component(storage_class)
541 return storage_class(**config_params)