2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
from functools import partial
import math
import os
import re
import signal
import subprocess

import numpy

from PIL import Image, ImageDraw, ImageColor  # @UnresolvedImport
def get_sound_type(input_filename):
    """Return the normalized audio type of *input_filename*.

    The file extension is lower-cased and the leading dot stripped; the
    short forms "fla" and "aif" are canonicalized to "flac" and "aiff".

    Returns an empty string when the filename has no extension.
    """
    sound_type = os.path.splitext(input_filename.lower())[1].strip(".")

    if sound_type == "fla":
        sound_type = "flac"
    elif sound_type == "aif":
        sound_type = "aiff"

    return sound_type
try:
    import scikits.audiolab as audiolab
except ImportError:
    print("WARNING: audiolab is not installed so wav2png will not work")

    # Hack to prevent errors when uploading audio files. The issue is that
    # scikits.audiolab does not support Python 3. By replacing it with a mock
    # implementation here, we can accept audio files, but we won't get the nice
    # waveform / spectrogram images for them.
    class MockSndfile(object):
        """Minimal stand-in for audiolab.Sndfile: reports an empty mono file."""

        def __init__(self, *args, **kwargs):
            # NOTE(review): nframes/channels defaults are assumed — the
            # original values were lost in this file; confirm against callers.
            self.nframes = 0
            self.channels = 1
            self.samplerate = 44100

        def read_frames(self, *args):
            # No audio to deliver for the mock.
            return []

        def seek(self, *args):
            return

        def close(self):
            return

    import unittest.mock as mock
    audiolab = mock.Mock()
    audiolab.Sndfile = MockSndfile
class AudioProcessingException(Exception):
    """Raised when an audio file cannot be processed or converted."""
    pass
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples."""

    def __init__(self, num_frames, has_broken_header=False):
        self.seekpoint = 0
        self.nframes = num_frames
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        """Return up to *frames_to_read* random samples in [-1, 1).

        Raises RuntimeError past the middle of the file when the header is
        marked broken, mimicking audiolab's behavior on corrupt wave files.
        """
        # BUGFIX: the original referenced self.num_frames, which is never
        # assigned (__init__ stores the count as self.nframes).
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        return numpy.random.random(will_read) * 2 - 1
def get_max_level(filename):
    """Scan *filename* block-by-block and return the maximum absolute sample value.

    Multi-channel files are reduced to mono by taking the left channel only.
    A RuntimeError while reading (broken header) stops the scan and returns
    the maximum seen so far.
    """
    max_value = 0
    buffer_size = 4096
    audio_file = audiolab.Sndfile(filename, 'r')
    n_samples_left = audio_file.nframes

    while n_samples_left:
        to_read = min(buffer_size, n_samples_left)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        max_value = max(max_value, numpy.abs(samples).max())

        n_samples_left -= to_read

    audio_file.close()

    return max_value
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio and calculates the spectral centroid and the peak
    samples in that chunk of audio.
    """

    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        # frequency band (Hz) onto which the spectral centroid is mapped
        self.lower = 100
        self.higher = 22050
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            # see how many samples we can actually read before EOF
            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # BUGFIX: samples is 1-D here, so concatenate along axis 0
                # (the original passed axis=1, which modern numpy rejects).
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # integer floor division: seek offsets must be whole frames on Python 3
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeoff!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        # NOTE(review): min_index/max_index are block-local indices, as in the
        # original; the ordering heuristic below inherits that behavior.
        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
def interpolate_colors(colors, flat=False, num_colors=256):
    """ given a list of colors, create a larger list of colors interpolating
    the first one. If flatten is True a list of numbers will be returned. If
    False, a list of (r,g,b) tuples. num_colors is the number of colors wanted
    in the final list """

    palette = []

    for i in range(num_colors):
        # fractional position of this palette entry within the input colors
        index = (i * (len(colors) - 1)) / (num_colors - 1.0)
        index_int = int(index)
        alpha = index - float(index_int)

        if alpha > 0:
            r = (1.0 - alpha) * colors[index_int][0] + alpha * colors[index_int + 1][0]
            g = (1.0 - alpha) * colors[index_int][1] + alpha * colors[index_int + 1][1]
            b = (1.0 - alpha) * colors[index_int][2] + alpha * colors[index_int + 1][2]
        else:
            # exactly on an input color: no neighbor needed (also avoids
            # reading past the end of the list for the last entry)
            r = (1.0 - alpha) * colors[index_int][0]
            g = (1.0 - alpha) * colors[index_int][1]
            b = (1.0 - alpha) * colors[index_int][2]

        if flat:
            palette.extend((int(r), int(g), int(b)))
        else:
            palette.append((int(r), int(g), int(b)))

    return palette
def desaturate(rgb, amount):
    """Blend each channel of *rgb* toward the grey of equal luminosity.

    amount == 0 leaves the color unchanged; amount == 1 yields pure grey.
    Returns a tuple of ints.
    """
    grey = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - grey)) for channel in rgb)
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """

    def __init__(self, image_width, image_height, palette=1):
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        # NOTE(review): the base color stops for palettes 1 and 3 were lost in
        # this copy of the file; the values below are assumed — confirm.
        if palette == 1:
            background_color = (0, 0, 0)
            colors = [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]
        elif palette == 2:
            background_color = (0, 0, 0)
            colors = [self.color_from_value(value / 29.0) for value in range(0, 30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            # list(...) so interpolate_colors can call len() on Python 3
            colors = list(map(partial(desaturate, amount=0.7), [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]))
        else:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.8),
                              [self.color_from_value(value / 29.0) for value in range(0, 30)]))

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """

        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int((1.0 - value) * 360), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """

        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid * 255.0)]

        # connect to the previous column so the waveform has no gaps
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """

        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            # blend the existing pixel below the line end with the line color
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_max_int + 1] = (r, g, b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            # blend the existing pixel above the line end with the line color
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_min_int - 1] = (r, g, b)

    def save(self, filename):
        # draw a zero "zero" line by brightening the middle row
        a = 25
        for x in range(self.image_width):
            # integer division: pixel indices must be ints on Python 3
            self.pix[x, self.image_height // 2] = tuple(
                map(lambda p: p + a, self.pix[x, self.image_height // 2]))

        self.image.save(filename)
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as PNG.
    """

    def __init__(self, image_width, image_height, fft_size):
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        # drawn rotated: width/height are swapped here and the image is
        # rotated back in save()
        self.image = Image.new("RGBA", (image_height, image_width))

        # NOTE(review): the first/last color stops were lost in this copy of
        # the file; the values below are assumed — confirm. Integer division
        # (//) preserves the original Python 2 integer color values.
        colors = [
            (0, 0, 0, 0),
            (58 // 4, 68 // 4, 65 // 4, 255),
            (80 // 2, 100 // 2, 153 // 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255),
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        for y in range(self.image_height):
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) * (y_max - y_min))
            bin = freq / 22050.0 * (self.fft_size / 2 + 1)

            if bin < self.fft_size / 2:
                alpha = bin - int(bin)

                self.y_to_bin.append((int(bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        # for all frequencies, draw the pixels
        for (index, alpha) in self.y_to_bin:
            self.pixels.append(self.palette[int((255.0 - alpha) * spectrum[index] + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):  # @UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.

    progress_callback, if given, is called with an integer percentage roughly
    every 10% of the columns and once more with 100 at the end.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    samples_per_pixel = processor.audio_file.nframes / float(image_width)

    waveform = WaveformImage(image_width, image_height)
    spectrogram = SpectrogramImage(image_width, image_height, fft_size)

    for x in range(image_width):
        # max(1, ...) guards against images narrower than 10 pixels;
        # // keeps the modulus operand an int on Python 3
        if progress_callback and x % max(1, image_width // 10) == 0:
            progress_callback((x * 100) // image_width)

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)
        peaks = processor.peaks(seek_point, next_seek_point)

        waveform.draw_peaks(x, peaks, spectral_centroid)
        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    waveform.save(output_filename_w)
    spectrogram.save(output_filename_s)
class NoSpaceLeftException(Exception):
    """Raised when an external conversion tool reports 'No space left on device'."""
    pass
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns False when no decoder is known for the file's type.
    Raises AudioProcessingException if the input is missing or the decoder
    fails, and NoSpaceLeftException when the disk is full.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    if sound_type == "mp3":
        cmd = ["lame", "--decode", input_filename, output_filename]
    elif sound_type == "ogg":
        cmd = ["oggdec", input_filename, "-o", output_filename]
    elif sound_type == "flac":
        cmd = ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename]
    else:
        return False  # no known decoder for this sound type

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()
    # BUGFIX: communicate() yields bytes on Python 3; decode before the
    # string concatenations below
    stdout = stdout.decode("utf-8", "replace")
    stderr = stderr.decode("utf-8", "replace")

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Returns a dict with duration (seconds), channels, samplerate, bitrate
    (kbit/s) and bitdepth parsed from stereofy's output.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()
    # BUGFIX: communicate() yields bytes on Python 3; decode before the
    # string operations below
    stdout = stdout.decode("utf-8", "replace")
    stderr = stderr.decode("utf-8", "replace")

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy prints its report on either stream; flatten to one line
    stdout = (stdout + " " + stderr).replace("\n", " ")

    duration = 0
    m = re.match(r".*#duration (?P<duration>[\d\.]+).*", stdout)
    if m is not None:
        duration = float(m.group("duration"))

    channels = 0
    m = re.match(r".*#channels (?P<channels>\d+).*", stdout)
    if m is not None:
        channels = float(m.group("channels"))

    samplerate = 0
    m = re.match(r".*#samplerate (?P<samplerate>\d+).*", stdout)
    if m is not None:
        samplerate = float(m.group("samplerate"))

    bitdepth = None
    m = re.match(r".*#bitdepth (?P<bitdepth>\d+).*", stdout)
    if m is not None:
        bitdepth = float(m.group("bitdepth"))

    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is the lame --abr target bitrate in kbit/s.
    Raises AudioProcessingException when the input is missing or lame fails.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to an ogg file

    quality is passed to oggenc -q (-1..10).
    Raises AudioProcessingException when the input is missing or oggenc fails.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming wave file to mono 16-bit 44.1 kHz pcm using ffmpeg

    A SIGALRM-based timeout aborts the conversion if ffmpeg hangs.
    Raises AudioProcessingException on missing input, ffmpeg failure or timeout.
    """
    # NOTE(review): TIMEOUT was referenced but not defined in this copy of the
    # file; 3 minutes is assumed — confirm against the original module constant.
    TIMEOUT = 3 * 60

    def alarm_handler(signum, frame):
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["ffmpeg", "-y", "-i", input_filename, "-ac", "1", "-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(TIMEOUT)
    (stdout, stderr) = process.communicate()
    signal.alarm(0)  # cancel the pending alarm once ffmpeg has finished

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)