Fix audio thumbnailing once and for all.
[mediagoblin.git] / extlib / freesound / audioprocessing.py
1 #!/usr/bin/env python
2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #
19 # Authors:
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
22
23 from PIL import Image, ImageDraw, ImageColor #@UnresolvedImport
24 from functools import partial
25 import math
26 import numpy
27 import os
28 import re
29 import signal
30
31
def get_sound_type(input_filename):
    """
    Return the audio type of a file, derived from its extension.

    The extension is lowercased and stripped of its leading dot; the
    shorthand extensions "fla" and "aif" are normalized to "flac" and
    "aiff" respectively.
    """
    extension = os.path.splitext(input_filename.lower())[1].strip(".")

    # normalize the common shorthand extensions
    aliases = {"fla": "flac", "aif": "aiff"}

    return aliases.get(extension, extension)
41
42
try:
    import scikits.audiolab as audiolab
except ImportError:
    # BUGFIX: the original used a Python 2 print *statement*, which is a
    # SyntaxError on Python 3; the parenthesized call prints identically
    # on Python 2 (single argument) and Python 3.
    print("WARNING: audiolab is not installed so wav2png will not work")
import subprocess
48
class AudioProcessingException(Exception):
    """Raised when decoding, converting or analyzing an audio file fails."""
51
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples."""

    def __init__(self, num_frames, has_broken_header=False):
        # current read position, in frames
        self.seekpoint = 0
        # total number of frames (attribute name matches audiolab's Sndfile)
        self.nframes = num_frames
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        """Move the read position to an absolute frame offset."""
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        """
        Return up to frames_to_read random samples in [-1, 1).

        Raises RuntimeError past the half-way point when the simulated
        header is broken, mirroring how audiolab fails on corrupt files.
        """
        # BUGFIX: the original read self.num_frames, which is never set
        # (__init__ stores the count as self.nframes), so every call
        # raised AttributeError.
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        return numpy.random.random(will_read) * 2 - 1
74
75
def get_max_level(filename):
    """
    Return the maximum absolute sample value found in the file.

    The file is read in chunks; stereo files are reduced to their left
    channel. A broken header simply stops the scan early.
    """
    chunk_size = 4096
    peak = 0
    audio_file = audiolab.Sndfile(filename, 'r')
    remaining = audio_file.nframes

    while remaining:
        to_read = min(chunk_size, remaining)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        peak = max(peak, numpy.abs(samples).max())
        remaining -= to_read

    audio_file.close()

    return peak
102
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio an calculates the spectral centroid and the peak
    samples in that chunk of audio.
    """

    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        """
        Open input_filename and prepare the FFT window and normalization
        scale so that a full-scale DC signal maps to a spectrum of 1.0.
        """
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        self.lower = 100            # Hz, lower bound for centroid clipping
        self.higher = 22050         # Hz, upper bound for centroid clipping
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # BUGFIX: the original passed axis=1, which modern numpy
                # rejects for 1-D arrays; the default axis 0 is correct here.
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples))

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # // keeps the offset an int on Python 3 (same value as Python 2's /)
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if start_seek < 0:
            start_seek = 0

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
252
253
def interpolate_colors(colors, flat=False, num_colors=256):
    """
    Expand a short list of colors into num_colors colors by linear
    interpolation between neighbours.

    Only the first three components (r, g, b) of each input color are
    used, even if the inputs carry an alpha channel. When flat is True a
    single flat list of ints is returned; otherwise a list of (r, g, b)
    tuples.
    """
    palette = []
    span = len(colors) - 1

    for i in range(num_colors):
        # fractional position of this output color along the input list
        position = (i * span) / (num_colors - 1.0)
        lower_index = int(position)
        alpha = position - float(lower_index)
        lower = colors[lower_index]

        if alpha > 0:
            upper = colors[lower_index + 1]
            channels = [(1.0 - alpha) * lower[c] + alpha * upper[c] for c in (0, 1, 2)]
        else:
            # exactly on an input color; no upper neighbour needed
            channels = [(1.0 - alpha) * lower[c] for c in (0, 1, 2)]

        if flat:
            palette.extend(int(value) for value in channels)
        else:
            palette.append(tuple(int(value) for value in channels))

    return palette
282
283
def desaturate(rgb, amount):
    """
    Desaturate an (r, g, b) color toward its average luminosity.

    amount == 0 leaves the color unchanged; amount == 1 yields grey.
    """
    luminosity = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - luminosity)) for channel in rgb)
294
295
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """

    def __init__(self, image_width, image_height, palette=1):
        """
        Create the image canvas and build a 256-entry color lookup table
        for the chosen palette (1-4). Raises AudioProcessingException for
        even heights and unknown palettes.
        """
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        if palette == 1:
            background_color = (0, 0, 0)
            colors = [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]
        elif palette == 2:
            background_color = (0, 0, 0)
            colors = [self.color_from_value(value / 29.0) for value in range(0, 30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            # BUGFIX: map() returns an iterator on Python 3, but
            # interpolate_colors() calls len() on its argument -- wrap in list()
            colors = list(map(partial(desaturate, amount=0.7), [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
            ]))
        elif palette == 4:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.8),
                              [self.color_from_value(value / 29.0) for value in range(0, 30)]))
        else:
            # previously an unknown palette crashed later with a NameError
            # on background_color; fail early with a clear message instead
            raise AudioProcessingException("unknown palette: %s" % palette)

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """

        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int((1.0 - value) * 360), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """

        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid * 255.0)]

        # connect to the previous column so the waveform is continuous
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """

        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_max_int + 1] = (r, g, b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_min_int - 1] = (r, g, b)

    def save(self, filename):
        """Brighten the center line and write the image to filename."""
        # draw a zero "zero" line
        a = 25
        # // keeps the pixel row index an int on Python 3
        for x in range(self.image_width):
            self.pix[x, self.image_height // 2] = tuple(map(lambda p: p + a, self.pix[x, self.image_height // 2]))

        self.image.save(filename)
396
397
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as PNG.
    """

    def __init__(self, image_width, image_height, fft_size):
        """
        Prepare the (rotated) canvas, the color palette and the
        y-coordinate -> fft-bin lookup used while drawing.
        """
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        # the image is built rotated 90 degrees (width and height swapped)
        # and rotated back on save -- see the note about putdata below
        self.image = Image.new("RGBA", (image_height, image_width))

        colors = [
            (0, 0, 0, 0),
            (58 / 4, 68 / 4, 65 / 4, 255),
            (80 / 2, 100 / 2, 153 / 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255)
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        half_fft = self.fft_size // 2  # // keeps this an int on Python 3
        for y in range(self.image_height):
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) * (y_max - y_min))
            # fractional fft bin this pixel row samples from
            # (renamed from "bin", which shadowed the builtin)
            fft_bin = freq / 22050.0 * (half_fft + 1)

            if fft_bin < half_fft:
                alpha = fft_bin - int(fft_bin)

                self.y_to_bin.append((int(fft_bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        """Append one column of palette pixels for the given spectrum."""
        # for all frequencies, draw the pixels
        for (index, alpha) in self.y_to_bin:
            self.pixels.append(self.palette[int((255.0 - alpha) * spectrum[index] + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):  # @UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        """Rotate the pixel buffer back upright and save as JPEG."""
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
454
455
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.

    progress_callback, when given, is called with an int percentage
    roughly every 10% of the columns and once with 100 at the end.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    samples_per_pixel = processor.audio_file.nframes / float(image_width)

    waveform = WaveformImage(image_width, image_height)
    spectrogram = SpectrogramImage(image_width, image_height, fft_size)

    # BUGFIX: image_width / 10 was 0 for widths below 10, making the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1
    progress_step = max(1, image_width // 10)

    for x in range(image_width):

        if progress_callback and x % progress_step == 0:
            # integer division keeps the reported percentage an int on Python 3
            progress_callback((x * 100) // image_width)

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)
        peaks = processor.peaks(seek_point, next_seek_point)

        waveform.draw_peaks(x, peaks, spectral_centroid)
        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    waveform.save(output_filename_w)
    spectrogram.save(output_filename_s)
485
486
class NoSpaceLeftException(Exception):
    """Raised when an external converter reports a full disk."""
489
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns True on success and False when the input type is not one of
    the handled formats (mp3, ogg, flac). Raises NoSpaceLeftException on
    a full disk and AudioProcessingException on any other decoder failure.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    # one external decoder command line per supported type
    decoders = {
        "mp3": ["lame", "--decode", input_filename, output_filename],
        "ogg": ["oggdec", input_filename, "-o", output_filename],
        "flac": ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename],
    }

    try:
        cmd = decoders[sound_type]
    except KeyError:
        return False

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
518
519
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Returns a dict with duration, channels, samplerate, bitrate and
    bitdepth parsed from the #-prefixed fields stereofy prints.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy may print on either stream; flatten everything to one line
    stdout = (stdout + " " + stderr).replace("\n", " ")

    def parse_field(pattern, default):
        # pull one "#name value" field out of the flattened output
        m = re.match(pattern, stdout)
        return float(m.group(1)) if m is not None else default

    duration = parse_field(r".*#duration (?P<duration>[\d\.]+).*", 0)
    channels = parse_field(r".*#channels (?P<channels>\d+).*", 0)
    samplerate = parse_field(r".*#samplerate (?P<samplerate>\d+).*", 0)
    bitdepth = parse_field(r".*#bitdepth (?P<bitdepth>\d+).*", None)

    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
563
564
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is lame's average bitrate (--abr) in kbit/s.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    encoder_cmd = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    encoder = subprocess.Popen(encoder_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = encoder.communicate()

    if encoder.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
580
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to n ogg file

    quality is oggenc's -q setting.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    encoder_cmd = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    encoder = subprocess.Popen(encoder_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = encoder.communicate()

    if encoder.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
596
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming wave file to stereo pcm using fffmpeg

    The conversion is aborted with an AudioProcessingException if ffmpeg
    takes longer than three minutes (enforced via SIGALRM).
    """
    TIMEOUT = 3 * 60

    def alarm_handler(signum, frame):
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["ffmpeg", "-y", "-i", input_filename, "-ac", "1", "-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    previous_handler = signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(TIMEOUT)
    try:
        (stdout, stderr) = process.communicate()
    finally:
        # BUGFIX: always cancel the alarm and restore the previous SIGALRM
        # handler -- the original leaked both when the timeout fired, leaving
        # the process-wide handler pointing at this function
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)