2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
import math
import os
import re
import signal
import subprocess
from functools import partial

import numpy

import Image, ImageDraw, ImageColor  #@UnresolvedImport
def get_sound_type(input_filename):
    """Return a normalized, lowercase extension for *input_filename*.

    The leading dot is stripped and the common short forms "fla" and
    "aif" are expanded to their canonical names "flac" and "aiff".
    """
    sound_type = os.path.splitext(input_filename.lower())[1].strip(".")

    # expand abbreviated extensions to the canonical names used by the
    # converters below
    if sound_type == "fla":
        sound_type = "flac"
    elif sound_type == "aif":
        sound_type = "aiff"

    return sound_type
# audiolab is an optional dependency: without it wav2png cannot decode
# audio, but the rest of the module remains importable.
try:
    import scikits.audiolab as audiolab
except ImportError:
    # parenthesized print works under both Python 2 and 3
    print("WARNING: audiolab is not installed so wav2png will not work")
class AudioProcessingException(Exception):
    """Raised when decoding or processing an audio file fails."""
    pass
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples."""
    def __init__(self, num_frames, has_broken_header=False):
        self.seekpoint = 0
        # NOTE: read_frames must use the same attribute name set here;
        # the original mixed self.nframes and self.num_frames, which
        # raised AttributeError on the first read.
        self.nframes = num_frames
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        # simulate a corrupt header by blowing up once we read past the
        # middle of the file (mirrors what audiolab does)
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        # uniform noise in [-1, 1), like normalized audio samples
        return numpy.random.random(will_read) * 2 - 1
def get_max_level(filename):
    """Return the maximum absolute sample value found in *filename*.

    The file is read in fixed-size blocks to keep memory use constant;
    for multi-channel files only the left channel is inspected.
    """
    max_value = 0
    buffer_size = 4096
    audio_file = audiolab.Sndfile(filename, 'r')
    n_samples_left = audio_file.nframes

    while n_samples_left:
        to_read = min(buffer_size, n_samples_left)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        max_value = max(max_value, numpy.abs(samples).max())

        n_samples_left -= to_read

    audio_file.close()

    return max_value
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio and calculates the spectral centroid and the peak
    samples in that chunk of audio.
    """
    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        # frequency band (Hz) onto which the spectral centroid is mapped
        self.lower = 100
        self.higher = 22050
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # samples is 1-D here, so concatenate along axis 0
                # (the original axis=1 is rejected by modern numpy)
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # // keeps the offset integral on both Python 2 and 3
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid
            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
def interpolate_colors(colors, flat=False, num_colors=256):
    """ given a list of colors, create a larger list of colors interpolating
    the first one. If flatten is True a list of numbers will be returned. If
    False, a list of (r,g,b) tuples. num_colors is the number of colors wanted
    in the final list """

    palette = []

    for i in range(num_colors):
        # fractional position of this output color within the input list
        index = (i * (len(colors) - 1)) / (num_colors - 1.0)
        index_int = int(index)
        alpha = index - float(index_int)

        if alpha > 0:
            # blend between two adjacent input colors
            r = (1.0 - alpha) * colors[index_int][0] + alpha * colors[index_int + 1][0]
            g = (1.0 - alpha) * colors[index_int][1] + alpha * colors[index_int + 1][1]
            b = (1.0 - alpha) * colors[index_int][2] + alpha * colors[index_int + 1][2]
        else:
            # exactly on an input color; no neighbor needed (also avoids
            # indexing past the end for the last entry)
            r = (1.0 - alpha) * colors[index_int][0]
            g = (1.0 - alpha) * colors[index_int][1]
            b = (1.0 - alpha) * colors[index_int][2]

        if flat:
            palette.extend((int(r), int(g), int(b)))
        else:
            palette.append((int(r), int(g), int(b)))

    return palette
def desaturate(rgb, amount):
    """
    desaturate colors by amount
    amount == 0, no change
    amount == 1, fully grey
    """
    # pull every channel toward the mean channel value by *amount*
    luminosity = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - luminosity)) for channel in rgb)
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """
    def __init__(self, image_width, image_height, palette=1):
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        # palettes 1/2 draw on black, 3/4 on light grey; 2/4 use the full
        # hue ramp from color_from_value, 3/4 are desaturated variants
        # NOTE(review): the explicit color triples below were reconstructed
        # from the upstream source of this file — confirm against the
        # original before relying on exact colors.
        if palette == 1:
            background_color = (0, 0, 0)
            colors = [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]
        elif palette == 2:
            background_color = (0, 0, 0)
            colors = [self.color_from_value(value / 29.0) for value in range(0, 30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.7), [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
            ]))
        elif palette == 4:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.8),
                              [self.color_from_value(value / 29.0) for value in range(0, 30)]))

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """
        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int((1.0 - value) * 360), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """
        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid * 255.0)]

        # connect to the previous column so the waveform is continuous
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """
        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            # blend the existing pixel below the line with the line color
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_max_int + 1] = (r, g, b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            # blend the existing pixel above the line with the line color
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_min_int - 1] = (r, g, b)

    def save(self, filename):
        # draw a zero "zero" line by brightening the middle row
        a = 25
        for x in range(self.image_width):
            self.pix[x, self.image_height // 2] = tuple(
                map(lambda p: p + a, self.pix[x, self.image_height // 2]))

        self.image.save(filename)
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as JPEG.
    """
    def __init__(self, image_width, image_height, fft_size):
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        # drawn rotated: built as (height x width) and rotated 90 degrees on save
        self.image = Image.new("RGBA", (image_height, image_width))

        # NOTE(review): the color ramp entries beyond the two visible in the
        # original were reconstructed from the upstream source — confirm.
        # interpolate_colors only reads the first three components, so the
        # alpha values here are effectively ignored.
        colors = [
            (0, 0, 0, 0),
            (58 // 4, 68 // 4, 65 // 4, 255),
            (80 // 2, 100 // 2, 153 // 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255),
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        for y in range(self.image_height):
            # logarithmic frequency axis from f_min to f_max
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) * (y_max - y_min))
            bin = freq / 22050.0 * (self.fft_size / 2 + 1)

            if bin < self.fft_size / 2:
                alpha = bin - int(bin)

                self.y_to_bin.append((int(bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        # for all frequencies, draw the pixels
        for (index, alpha) in self.y_to_bin:
            self.pixels.append(self.palette[int((255.0 - alpha) * spectrum[index] + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):  #@UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    samples_per_pixel = processor.audio_file.nframes / float(image_width)

    waveform = WaveformImage(image_width, image_height)
    spectrogram = SpectrogramImage(image_width, image_height, fft_size)

    # report progress roughly every 10% of columns; max(1, ...) guards
    # against a zero modulus for images narrower than 10 pixels
    progress_step = max(1, image_width // 10)

    for x in range(image_width):
        if progress_callback and x % progress_step == 0:
            progress_callback((x * 100) // image_width)

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)
        peaks = processor.peaks(seek_point, next_seek_point)

        waveform.draw_peaks(x, peaks, spectral_centroid)
        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    waveform.save(output_filename_w)
    spectrogram.save(output_filename_s)
class NoSpaceLeftException(Exception):
    """Raised when an external converter fails because the disk is full."""
    pass
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns False when the extension of input_filename is not one of the
    supported types (mp3/ogg/flac), True after a successful conversion.
    Raises AudioProcessingException when the decoder fails and
    NoSpaceLeftException when the disk fills up.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    if sound_type == "mp3":
        cmd = ["lame", "--decode", input_filename, output_filename]
    elif sound_type == "ogg":
        cmd = ["oggdec", input_filename, "-o", output_filename]
    elif sound_type == "flac":
        cmd = ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename]
    else:
        # not a type we can decode; the caller is expected to fall back
        return False

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Runs the external "stereofy" tool and parses its output for duration,
    channel count, sample rate and bit depth; the bitrate is derived from
    the input file size. Returns these five values as a dict.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy reports its findings on stdout/stderr; flatten to one line
    stdout = (stdout + " " + stderr).replace("\n", " ")

    duration = 0
    m = re.match(r".*#duration (?P<duration>[\d\.]+).*", stdout)
    if m is not None:
        duration = float(m.group("duration"))

    channels = 0
    m = re.match(r".*#channels (?P<channels>\d+).*", stdout)
    if m is not None:
        channels = float(m.group("channels"))

    samplerate = 0
    m = re.match(r".*#samplerate (?P<samplerate>\d+).*", stdout)
    if m is not None:
        samplerate = float(m.group("samplerate"))

    bitdepth = None
    m = re.match(r".*#bitdepth (?P<bitdepth>\d+).*", stdout)
    if m is not None:
        bitdepth = float(m.group("bitdepth"))

    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is passed to lame's --abr (average bitrate) option.
    Raises AudioProcessingException when lame fails.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to an ogg file

    quality is passed to oggenc's -q option.
    Raises AudioProcessingException when oggenc fails.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming audio file to mono 44.1 kHz 16-bit pcm using ffmpeg

    A SIGALRM-based timeout (module-level constant TIMEOUT, in seconds)
    aborts the conversion if ffmpeg hangs.
    """
    def alarm_handler(signum, frame):
        # fired by SIGALRM when ffmpeg takes longer than TIMEOUT seconds
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["ffmpeg", "-y", "-i", input_filename, "-ac", "1", "-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(TIMEOUT)
    (stdout, stderr) = process.communicate()
    # cancel the pending alarm so it cannot fire after we are done
    signal.alarm(0)

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)