# mediagoblin/gmg_commands/batchaddmedia.py
8aa01597 1# GNU MediaGoblin -- federated, autonomous media hosting
2# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3#
4# This program is free software: you can redistribute it and/or modify
5# it under the terms of the GNU Affero General Public License as published by
6# the Free Software Foundation, either version 3 of the License, or
7# (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU Affero General Public License for more details.
13#
14# You should have received a copy of the GNU Affero General Public License
15# along with this program. If not, see <http://www.gnu.org/licenses/>.
16
import os
import shutil
import subprocess
import tarfile
import tempfile
import zipfile
from csv import reader as csv_reader
from urlparse import urlparse

import requests
from jsonschema.exceptions import ValidationError
from pyld import jsonld

from mediagoblin.gmg_commands import util as commands_util
from mediagoblin.submit.lib import (
    submit_media, get_upload_file_limits,
    FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
from mediagoblin.tools.metadata import compact_and_validate
8aa01597 31
def parser_setup(subparser):
    """Register the command-line arguments for the ``batchaddmedia`` command.

    Args:
        subparser: the argparse subparser for this gmg command; arguments
            are added to it in place.
    """
    subparser.description = """\
This command allows the administrator to upload many media files at once."""
    subparser.add_argument(
        'username',
        help="Name of user these media entries belong to")
    subparser.add_argument(
        'target_path',
        # Fixed typo: "comma seperated value" -> "comma separated value"
        help=("""\
Path to a local archive or directory containing a "location.csv" and a
"metadata.csv" file. These are csv (comma separated value) files with the
locations and metadata of the files to be uploaded. The location must be listed
with either the URL of the remote media file or the filesystem path of a local
file. The metadata should be provided with one column for each of the 15 Dublin
Core properties (http://dublincore.org/documents/dces/). Both "location.csv" and
"metadata.csv" must begin with a row demonstrating the order of the columns. We
have provided an example of these files at <url to be added>
"""))
    subparser.add_argument(
        '--celery',
        action='store_true',
        help="Don't process eagerly, pass off to celery")
55
def batchaddmedia(args):
    """Upload many media files at once on behalf of one user.

    Reads a "location.csv" and a "metadata.csv" out of ``args.target_path``
    (a directory, tarball, or zip archive), validates each row's metadata
    as JSON-LD, then submits every listed file (local path or http(s) URL)
    through the normal submission pipeline. Prints per-file progress and a
    final uploaded/attempted summary.

    Bug fixes vs. the previous revision:
      * unrecognized target types now return instead of falling through
        with ``dir_path`` unbound (NameError);
      * filenames are derived by splitting the URL path on '/' rather
        than on whitespace;
      * the per-file FAIL/success messages had literal "(unknown)" where
        the ``{filename}`` placeholder belonged;
      * https and other unhandled URL schemes are skipped explicitly
        instead of crashing on an unbound ``media_file``;
      * removed a duplicated ``get_upload_file_limits`` call.
    """
    # Run eagerly unless explicitly set not to
    if not args.celery:
        os.environ['CELERY_ALWAYS_EAGER'] = 'true'

    app = commands_util.setup_app(args)

    files_uploaded, files_attempted = 0, 0

    # get the user
    user = app.db.User.query.filter_by(username=args.username.lower()).first()
    if user is None:
        print("Sorry, no user by username '%s' exists" % args.username)
        return

    upload_limit, max_file_size = get_upload_file_limits(user)
    temp_files = []

    if os.path.isdir(args.target_path):
        dir_path = args.target_path

    elif tarfile.is_tarfile(args.target_path):
        dir_path = tempfile.mkdtemp()
        temp_files.append(dir_path)
        tar = tarfile.open(args.target_path)
        tar.extractall(path=dir_path)

    elif zipfile.is_zipfile(args.target_path):
        dir_path = tempfile.mkdtemp()
        temp_files.append(dir_path)
        zipped_file = zipfile.ZipFile(args.target_path)
        zipped_file.extractall(path=dir_path)

    else:
        # Must bail out here: without a dir_path nothing below can work.
        print("Couldn't recognize the file. This script only accepts tar "
              "files, zip files and directories")
        return

    if dir_path.endswith('/'):
        dir_path = dir_path[:-1]

    location_file_path = os.path.join(dir_path, "location.csv")
    metadata_file_path = os.path.join(dir_path, "metadata.csv")

    # check for the location file, if it exists...
    abs_location_filename = os.path.abspath(location_file_path)
    if not os.path.exists(abs_location_filename):
        print("Can't find a file with filename '%s'" % location_file_path)
        return

    # check for the metadata file, if it exists...
    abs_metadata_filename = os.path.abspath(metadata_file_path)
    if not os.path.exists(abs_metadata_filename):
        print("Can't find a file with filename '%s'" % metadata_file_path)
        return

    def maybe_unicodeify(some_string):
        # csv hands back byte strings; submit_media wants unicode (or None).
        if some_string is None:
            return None
        return unicode(some_string)

    with file(abs_location_filename, 'r') as all_locations:
        media_locations = parse_csv_file(all_locations.read())

    with file(abs_metadata_filename, 'r') as all_metadata:
        media_metadata = parse_csv_file(all_metadata.read())

    for media_id in media_locations.keys():
        files_attempted += 1

        file_metadata = media_metadata[media_id]
        try:
            json_ld_metadata = compact_and_validate(file_metadata)
        except ValidationError as exc:
            print("Error with '%s' value '%s': %s" % (
                media_id, exc.path[0], exc.message))
            continue

        original_location = media_locations[media_id]['media:original']
        url = urlparse(original_location)

        title = json_ld_metadata.get('dcterms:title')
        description = json_ld_metadata.get('dcterms:description')

        license = json_ld_metadata.get('license')
        # Split on '/' to take the last URL path segment; a whitespace
        # split here previously mangled most filenames.
        filename = url.path.split('/')[-1]

        if url.scheme in ('http', 'https'):
            res = requests.get(url.geturl(), stream=True)
            media_file = res.raw

        elif url.scheme == '':
            path = url.path
            if os.path.isabs(path):
                file_abs_path = os.path.abspath(path)
            else:
                file_path = os.path.join(dir_path, path)
                file_abs_path = os.path.abspath(file_path)
            try:
                media_file = file(file_abs_path, 'r')
            except IOError:
                print("\
FAIL: Local file {filename} could not be accessed.".format(filename=filename))
                print("Skipping it.")
                continue

        else:
            # Previously fell through with media_file unbound -> NameError.
            print("FAIL: Unsupported url scheme '%s' for %s." % (
                url.scheme, original_location))
            print("Skipping it.")
            continue

        try:
            submit_media(
                mg_app=app,
                user=user,
                submitted_file=media_file,
                filename=filename,
                title=maybe_unicodeify(title),
                description=maybe_unicodeify(description),
                license=maybe_unicodeify(license),
                metadata=json_ld_metadata,
                tags_string=u"",
                upload_limit=upload_limit, max_file_size=max_file_size)
            print("Successfully uploaded {filename}!".format(filename=filename))
            print("")
            files_uploaded += 1
        except FileUploadLimit:
            print("FAIL: This file is larger than the upload limits for this site.")
        except UserUploadLimit:
            print("FAIL: This file will put this user past their upload limits.")
        except UserPastUploadLimit:
            print("FAIL: This user is already past their upload limits.")
    print("\
{files_uploaded} out of {files_attempted} files successfully uploaded".format(
        files_uploaded=files_uploaded,
        files_attempted=files_attempted))
    teardown(temp_files)
191
28ecc53a 192
def parse_csv_file(file_contents):
    """Parse a batch-upload csv file into a dict keyed by "media:id".

    The first row names the columns; every subsequent non-blank row becomes
    a ``{column_name: value}`` dict stored under that row's "media:id" value.

    Args:
        file_contents: the entire csv file as one string.

    Returns:
        dict mapping each row's media:id to its {column: value} dict;
        empty dict for empty/blank input.
    """
    if not file_contents.strip():
        return {}

    rows = file_contents.split('\n')
    # Parse the header through the csv reader too, so quoted column names
    # containing commas survive. next() builtin replaces the py2-only
    # reader.next() method.
    columns = next(csv_reader([rows[0]]))
    objects_dict = {}

    for row in rows[1:]:
        if row.isspace() or row == '':
            continue
        values = next(csv_reader([row]))
        line_dict = dict(zip(columns, values))
        objects_dict[line_dict['media:id']] = line_dict

    return objects_dict
8aa01597 209
77d51d4f 210
268f2430 211def teardown(temp_files):
212 for temp_file in temp_files:
213 subprocess.call(['rm','-r',temp_file])