Work around lack of scikits.audiolab support on Python 3.
[mediagoblin.git] / extlib / freesound / audioprocessing.py
1 #!/usr/bin/env python
2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #
19 # Authors:
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
22
23 from PIL import Image, ImageDraw, ImageColor #@UnresolvedImport
24 from functools import partial
25 import math
26 import numpy
27 import os
28 import re
29 import signal
30
31
def get_sound_type(input_filename):
    """Return a normalized audio type string derived from the file extension.

    The extension is lower-cased and stripped of its leading dot; the
    abbreviated forms "fla" and "aif" are mapped to their canonical names.
    """
    extension = os.path.splitext(input_filename.lower())[1].strip(".")

    # Canonicalize the two abbreviated extension spellings.
    aliases = {"fla": "flac", "aif": "aiff"}

    return aliases.get(extension, extension)
41
42
# scikits.audiolab provides the Sndfile reader used throughout this module.
# It is only importable on Python 2; on Python 3 a mock stands in below.
try:
    import scikits.audiolab as audiolab
except ImportError:
    print("WARNING: audiolab is not installed so wav2png will not work")

    # Hack to prevent errors when uploading audio files. The issue is that
    # scikits.audiolab does not support Python 3. By replacing it with a mock
    # implementation here, we can accept audio files, but we won't get the nice
    # waveform image.
    import six
    if six.PY3:
        class MockSndfile(object):
            # Mimics the subset of audiolab.Sndfile's interface used in this
            # module; reports an empty mono 44.1 kHz file so processing
            # degrades gracefully instead of crashing.
            def __init__(self, *args, **kwargs):
                self.nframes = 0        # no frames: callers' read loops exit immediately
                self.channels = 1
                self.samplerate = 44100

            def read_frames(self, *args):
                # No audio to return for the empty mock file.
                return []

            def seek(self, *args):
                return

            def close(self):
                return
        import unittest.mock as mock
        # Replace the whole module object so attribute access keeps working.
        audiolab = mock.Mock()
        audiolab.Sndfile = MockSndfile

import subprocess
73
class AudioProcessingException(Exception):
    """Raised when decoding or analysing an audio file fails."""
76
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples.

    Exposes the same attributes this module reads from audiolab.Sndfile:
    nframes, samplerate, channels, plus seek() and read_frames().
    """
    def __init__(self, num_frames, has_broken_header=False):
        self.seekpoint = 0              # current read position, in frames
        self.nframes = num_frames       # total length of the fake file
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        """Move the read position to the given frame offset."""
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        """Return up to frames_to_read random samples in [-1, 1).

        Raises RuntimeError once the read passes the middle of the file when
        constructed with has_broken_header=True, imitating a corrupt header.
        """
        # BUG FIX: the original referenced self.num_frames, which is never
        # set (__init__ stores the count as self.nframes), so every call
        # raised AttributeError.  Use self.nframes consistently.
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        # Clamp the read to the remaining frames, as a real sound file would.
        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        return numpy.random.random(will_read)*2 - 1
99
100
def get_max_level(filename):
    """Scan an audio file and return the largest absolute sample value seen.

    The file is read in 4096-frame chunks; multi-channel audio is reduced to
    its left channel.  If the file's header turns out to be broken mid-read,
    scanning stops early and the maximum found so far is returned.
    """
    chunk_size = 4096
    peak = 0

    sound = audiolab.Sndfile(filename, 'r')
    remaining = sound.nframes

    while remaining:
        frames = min(chunk_size, remaining)

        try:
            block = sound.read_frames(frames)
        except RuntimeError:
            # A broken header can make reading fail partway; keep what we have.
            break

        # Mono-ize by keeping only the left channel.
        if sound.channels > 1:
            block = block[:, 0]

        peak = max(peak, numpy.abs(block).max())
        remaining -= frames

    sound.close()

    return peak
127
class AudioProcessor(object):
    """
    Processes chunks of audio and calculates the spectral centroid and the
    peak samples in each chunk.
    """
    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        """Open input_filename and precompute the FFT window and scale factor.

        fft_size: number of samples per analysis window.
        window_function: callable producing a window array of a given length.
        """
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None      # lazily-built bin index array, see spectral_centroid()
        self.lower = 100                # lowest frequency of interest (Hz)
        self.higher = 22050             # highest frequency of interest (Hz)
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0/max_level/max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # BUG FIX: samples is 1-D after the mono conversion above, so
                # the zero-padding must be concatenated along axis 0; the
                # original's axis=1 is out of bounds for 1-D arrays in NumPy.
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # BUG FIX: floor division keeps the seek offset an integer on
        # Python 3 (fft_size / 2 would be a float frame index).
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20*(numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range)/spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        # Clamp the requested range to the file's actual extent.
        if start_seek < 0:
            start_seek = 0

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        # Preserve discovery order: (min, max) when min was found first.
        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
277
278
def interpolate_colors(colors, flat=False, num_colors=256):
    """Expand a short color list to num_colors entries by linear interpolation.

    colors is a sequence of (r, g, b) tuples.  When flat is False the result
    is a list of (r, g, b) int tuples; when flat is True the channel values
    are flattened into a single list [r0, g0, b0, r1, g1, b1, ...].
    """
    palette = []
    span = len(colors) - 1

    for step in range(num_colors):
        # Fractional position of this output entry within the input list.
        position = (step * span) / (num_colors - 1.0)
        base = int(position)
        blend = position - float(base)

        if blend > 0:
            # Mix the two neighbouring input colors channel by channel.
            channels = [
                (1.0 - blend) * colors[base][c] + blend * colors[base + 1][c]
                for c in range(3)
            ]
        else:
            channels = [(1.0 - blend) * colors[base][c] for c in range(3)]

        rgb = tuple(int(v) for v in channels)
        if flat:
            palette.extend(rgb)
        else:
            palette.append(rgb)

    return palette
307
308
def desaturate(rgb, amount):
    """
    Pull the color rgb towards grey by the given amount.

    amount == 0 leaves the color unchanged; amount == 1 yields pure grey
    (every channel equal to the mean of the input channels).
    """
    grey = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - grey)) for channel in rgb)
319
320
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """
    def __init__(self, image_width, image_height, palette=1):
        """Create the drawing surface.

        image_height must be odd so the waveform has a true center row.
        palette selects one of four built-in color schemes (1-4).
        """
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        if palette == 1:
            background_color = (0,0,0)
            colors = [
                (50,0,200),
                (0,220,80),
                (255,224,0),
                (255,70,0),
            ]
        elif palette == 2:
            background_color = (0,0,0)
            colors = [self.color_from_value(value/29.0) for value in range(0,30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            # BUG FIX: on Python 3 map() returns an iterator, but
            # interpolate_colors() needs len() and indexing -- materialize it.
            colors = list(map( partial(desaturate, amount=0.7), [
                (50,0,200),
                (0,220,80),
                (255,224,0),
            ]))
        elif palette == 4:
            background_color = (213, 217, 221)
            # BUG FIX: same map() -> list materialization as palette 3.
            colors = list(map( partial(desaturate, amount=0.8), [self.color_from_value(value/29.0) for value in range(0,30)]))

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        # Previous column's endpoint; None until the first column is drawn.
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """

        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int( (1.0 - value) * 360 ), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """

        # Map peak values in [-1, 1] to pixel rows, leaving a 2px margin.
        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid*255.0)]

        # Connect to the previous column so the waveform is continuous.
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """

        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            # Blend the line color into the pixel just below the line's end.
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1-alpha)*current_pix[0] + alpha*color[0])
            g = int((1-alpha)*current_pix[1] + alpha*color[1])
            b = int((1-alpha)*current_pix[2] + alpha*color[2])

            self.pix[x, y_max_int + 1] = (r,g,b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            # Blend the line color into the pixel just above the line's start.
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1-alpha)*current_pix[0] + alpha*color[0])
            g = int((1-alpha)*current_pix[1] + alpha*color[1])
            b = int((1-alpha)*current_pix[2] + alpha*color[2])

            self.pix[x, y_min_int - 1] = (r,g,b)

    def save(self, filename):
        # draw a zero "zero" line
        a = 25
        # BUG FIX: image_height is odd, so / 2 would yield a float pixel
        # index on Python 3; floor division gives the integer center row.
        center = self.image_height // 2
        for x in range(self.image_width):
            self.pix[x, center] = tuple(map(lambda p: p+a, self.pix[x, center]))

        self.image.save(filename)
421
422
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as PNG.
    """
    def __init__(self, image_width, image_height, fft_size):
        # The image is allocated transposed (height x width) and rotated when
        # saving, because putdata() + transpose is much faster than per-pixel
        # access (see the note near the end of this method).
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        self.image = Image.new("RGBA", (image_height, image_width))

        # Gradient from transparent black through blue/green/yellow/red to
        # white, interpolated to a 256-entry palette below.
        # NOTE(review): under Python 3 these divisions yield floats, which
        # interpolate_colors() truncates with int() -- same end result as the
        # original Python 2 integer division.  Also, interpolate_colors()
        # returns 3-tuples, silently dropping the alpha channel here; verify
        # putdata() on this RGBA image accepts RGB tuples.
        colors = [
            (0, 0, 0, 0),
            (58/4, 68/4, 65/4, 255),
            (80/2, 100/2, 153/2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255)
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin,
        # using a logarithmic frequency axis from f_min to f_max
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        for y in range(self.image_height):
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) *(y_max - y_min))
            bin = freq / 22050.0 * (self.fft_size/2 + 1)

            # Only keep rows whose frequency maps inside the FFT's bin range;
            # alpha is the fractional part used to blend adjacent bins.
            if bin < self.fft_size/2:
                alpha = bin - int(bin)

                self.y_to_bin.append((int(bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        # for all frequencies, draw the pixels
        # (x is unused: pixels are appended in column order and laid out
        # by putdata()/transpose() at save time)
        for (index, alpha) in self.y_to_bin:
            self.pixels.append( self.palette[int((255.0-alpha) * spectrum[index] + alpha * spectrum[index + 1])] )

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height): #@UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        # Pixels were collected column-major, so rotate into place on save.
        # quality is passed through to the JPEG encoder.
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
479
480
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.

    One image column is rendered per pixel of width, each covering an equal
    share of the audio's frames.  progress_callback, when given, receives a
    percentage roughly every tenth of the way through and a final 100.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    frames_per_column = processor.audio_file.nframes / float(image_width)

    wave_img = WaveformImage(image_width, image_height)
    spec_img = SpectrogramImage(image_width, image_height, fft_size)

    for column in range(image_width):

        if progress_callback and column % (image_width/10) == 0:
            progress_callback((column*100)/image_width)

        begin = int(column * frames_per_column)
        end = int((column + 1) * frames_per_column)

        centroid, db_spectrum = processor.spectral_centroid(begin)
        peak_pair = processor.peaks(begin, end)

        wave_img.draw_peaks(column, peak_pair, centroid)
        spec_img.draw_spectrum(column, db_spectrum)

    if progress_callback:
        progress_callback(100)

    wave_img.save(output_filename_w)
    spec_img.save(output_filename_s)
510
511
class NoSpaceLeftException(Exception):
    """Raised when a conversion tool reports that the disk is full."""
514
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns True on success and False when the input extension is not one of
    mp3/ogg/flac.  Raises AudioProcessingException when the input file does
    not exist or the decoder fails, and NoSpaceLeftException when the
    decoder reports a full disk.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    # Pick the external decoder matching the detected type.
    if sound_type == "mp3":
        cmd = ["lame", "--decode", input_filename, output_filename]
    elif sound_type == "ogg":
        cmd = ["oggdec", input_filename, "-o", output_filename]
    elif sound_type == "flac":
        cmd = ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename]
    else:
        return False

    # BUG FIX: request text-mode pipes; on Python 3 communicate() otherwise
    # returns bytes, and the str concatenation / "in" checks below would
    # raise TypeError.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
543
544
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Returns a dict with duration (seconds), channels, samplerate, bitrate
    (kbit/s, derived from the input file size) and bitdepth, parsed from the
    stereofy tool's "#key value" output.  Fields missing from the output
    default to 0 (bitdepth defaults to None).
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    # BUG FIX: request text-mode pipes; on Python 3 communicate() otherwise
    # returns bytes, breaking the string concatenation and regex matching
    # below with TypeError.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy writes its "#key value" report on either stream; flatten both
    # into one line so the .* regexes below can scan it.
    stdout = (stdout + " " + stderr).replace("\n", " ")

    duration = 0
    m = re.match(r".*#duration (?P<duration>[\d\.]+).*", stdout)
    if m is not None:
        duration = float(m.group("duration"))

    channels = 0
    m = re.match(r".*#channels (?P<channels>\d+).*", stdout)
    if m is not None:
        channels = float(m.group("channels"))

    samplerate = 0
    m = re.match(r".*#samplerate (?P<samplerate>\d+).*", stdout)
    if m is not None:
        samplerate = float(m.group("samplerate"))

    bitdepth = None
    m = re.match(r".*#bitdepth (?P<bitdepth>\d+).*", stdout)
    if m is not None:
        bitdepth = float(m.group("bitdepth"))

    # Average bitrate in kbit/s from the source file's size and duration.
    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
588
589
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is handed to lame's --abr switch (average bitrate, kbit/s).
    Raises AudioProcessingException if the input is missing or lame fails.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _stderr = process.communicate()

    # Treat a missing output file as failure even on a zero exit code.
    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
605
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to n ogg file

    quality is handed to oggenc's -q switch.  Raises
    AudioProcessingException if the input is missing or oggenc fails.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _stderr = process.communicate()

    # Treat a missing output file as failure even on a zero exit code.
    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
621
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming wave file to pcm using ffmpeg

    Raises AudioProcessingException if the input is missing, ffmpeg fails,
    or ffmpeg runs longer than three minutes.
    """
    # Abort via SIGALRM if ffmpeg takes longer than three minutes.
    TIMEOUT = 3 * 60
    def alarm_handler(signum, frame):
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    # -ac 1: mono; -acodec pcm_s16le: 16-bit little-endian PCM; -ar 44100:
    # 44.1 kHz sample rate; -y: overwrite the output without asking.
    # NOTE(review): the original docstring said "stereo" but -ac 1 produces
    # mono -- confirm which is intended.
    command = ["ffmpeg", "-y", "-i", input_filename, "-ac","1","-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Arm the alarm only after the process has started, so the handler is in
    # place for the blocking communicate() call below.
    signal.signal(signal.SIGALRM,alarm_handler)
    signal.alarm(TIMEOUT)
    (stdout, stderr) = process.communicate()
    # Cancel the pending alarm once ffmpeg has finished in time.
    signal.alarm(0)
    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)