1 # processing.py -- various audio processing functions
2 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
3 # UNIVERSITAT POMPEU FABRA
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Affero General Public License for more details.
15 # You should have received a copy of the GNU Affero General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
20 # 2012, Joar Wandborg <first name at last name dot se>
27 import scikits
.audiolab
as audiolab
29 print "WARNING: audiolab is not installed so wav2png will not work"
32 class AudioProcessingException(Exception):
36 class SpectrogramImage(object):
37 def __init__(self
, image_size
, fft_size
):
38 self
.image_width
, self
.image_height
= image_size
39 self
.fft_size
= fft_size
43 (58 / 4, 68 / 4, 65 / 4, 255),
44 (80 / 2, 100 / 2, 153 / 2, 255),
51 self
.palette
= interpolate_colors(colors
)
53 # Generate lookup table for y-coordinate from fft-bin
57 fft_max
= 22050.0 # kHz?
59 y_min
= math
.log10(fft_min
)
60 y_max
= math
.log10(fft_max
)
62 for y
in range(self
.image_height
):
65 y_min
+ y
/ (self
.image_height
- 1.0)
68 fft_bin
= freq
/ fft_max
* (self
.fft_size
/ 2 + 1)
70 if fft_bin
< self
.fft_size
/ 2:
71 alpha
= fft_bin
- int(fft_bin
)
73 self
.y_to_bin
.append((int(fft_bin
), alpha
* 255))
75 # this is a bit strange, but using image.load()[x,y] = ... is
76 # a lot slower than using image.putadata and then rotating the image
77 # so we store all the pixels in an array and then create the image when saving
80 def draw_spectrum(self
, x
, spectrum
):
81 # for all frequencies, draw the pixels
82 for index
, alpha
in self
.y_to_bin
:
84 self
.palette
[int((255.0 - alpha
) * spectrum
[index
]
85 + alpha
* spectrum
[index
+ 1])])
87 # if the FFT is too small to fill up the image, fill with black to the top
88 for y
in range(len(self
.y_to_bin
), self
.image_height
):
89 self
.pixels
.append(self
.palette
[0])
91 def save(self
, filename
, quality
=90):
92 self
.image
= Image
.new(
94 (self
.image_height
, self
.image_width
))
96 self
.image
.putdata(self
.pixels
)
97 self
.image
.transpose(Image
.ROTATE_90
).save(
102 class AudioProcessor(object):
104 The audio processor processes chunks of audio an calculates the spectrac centroid and the peak
105 samples in that chunk of audio.
107 def __init__(self
, input_filename
, fft_size
, window_function
=numpy
.hanning
):
108 max_level
= get_max_level(input_filename
)
110 self
.audio_file
= audiolab
.Sndfile(input_filename
, 'r')
111 self
.fft_size
= fft_size
112 self
.window
= window_function(self
.fft_size
)
113 self
.spectrum_range
= None
116 self
.lower_log
= math
.log10(self
.lower
)
117 self
.higher_log
= math
.log10(self
.higher
)
118 self
.clip
= lambda val
, low
, high
: min(high
, max(low
, val
))
120 # figure out what the maximum value is for an FFT doing the FFT of a DC signal
121 fft
= numpy
.fft
.rfft(numpy
.ones(fft_size
) * self
.window
)
122 max_fft
= (numpy
.abs(fft
)).max()
124 # set the scale to normalized audio and normalized FFT
125 self
.scale
= 1.0 / max_level
/ max_fft
if max_level
> 0 else 1
127 def read(self
, start
, size
, resize_if_less
=False):
128 """ read size samples starting at start, if resize_if_less is True and less than size
129 samples are read, resize the array to size and fill with zeros """
131 # number of zeros to add to start and end of the buffer
136 # the first FFT window starts centered around zero
137 if size
+ start
<= 0:
138 return numpy
.zeros(size
) if resize_if_less
else numpy
.array([])
140 self
.audio_file
.seek(0)
142 add_to_start
= - start
# remember: start is negative!
143 to_read
= size
+ start
145 if to_read
> self
.audio_file
.nframes
:
146 add_to_end
= to_read
- self
.audio_file
.nframes
147 to_read
= self
.audio_file
.nframes
149 self
.audio_file
.seek(start
)
152 if start
+ to_read
>= self
.audio_file
.nframes
:
153 to_read
= self
.audio_file
.nframes
- start
154 add_to_end
= size
- to_read
157 samples
= self
.audio_file
.read_frames(to_read
)
159 # this can happen for wave files with broken headers...
160 return numpy
.zeros(size
) if resize_if_less
else numpy
.zeros(2)
162 # convert to mono by selecting left channel only
163 if self
.audio_file
.channels
> 1:
164 samples
= samples
[:,0]
166 if resize_if_less
and (add_to_start
> 0 or add_to_end
> 0):
168 samples
= numpy
.concatenate((numpy
.zeros(add_to_start
), samples
), axis
=1)
171 samples
= numpy
.resize(samples
, size
)
172 samples
[size
- add_to_end
:] = 0
176 def spectral_centroid(self
, seek_point
, spec_range
=110.0):
177 """ starting at seek_point read fft_size samples, and calculate the spectral centroid """
179 samples
= self
.read(seek_point
- self
.fft_size
/2, self
.fft_size
, True)
181 samples
*= self
.window
182 fft
= numpy
.fft
.rfft(samples
)
183 spectrum
= self
.scale
* numpy
.abs(fft
) # normalized abs(FFT) between 0 and 1
185 length
= numpy
.float64(spectrum
.shape
[0])
187 # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
188 db_spectrum
= ((20*(numpy
.log10(spectrum
+ 1e-60))).clip(-spec_range
, 0.0) + spec_range
)/spec_range
190 energy
= spectrum
.sum()
191 spectral_centroid
= 0
194 # calculate the spectral centroid
196 if self
.spectrum_range
== None:
197 self
.spectrum_range
= numpy
.arange(length
)
199 spectral_centroid
= (spectrum
* self
.spectrum_range
).sum() / (energy
* (length
- 1)) * self
.audio_file
.samplerate
* 0.5
201 # clip > log10 > scale between 0 and 1
202 spectral_centroid
= (math
.log10(self
.clip(spectral_centroid
, self
.lower
, self
.higher
)) - self
.lower_log
) / (self
.higher_log
- self
.lower_log
)
204 return (spectral_centroid
, db_spectrum
)
207 def peaks(self
, start_seek
, end_seek
):
208 """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
209 in that range. Returns that pair in the order they were found. So if min was found first,
210 it returns (min, max) else the other way around. """
212 # larger blocksizes are faster but take more mem...
213 # Aha, Watson, a clue, a tradeof!
224 if end_seek
> self
.audio_file
.nframes
:
225 end_seek
= self
.audio_file
.nframes
227 if end_seek
<= start_seek
:
228 samples
= self
.read(start_seek
, 1)
229 return (samples
[0], samples
[0])
231 if block_size
> end_seek
- start_seek
:
232 block_size
= end_seek
- start_seek
234 for i
in range(start_seek
, end_seek
, block_size
):
235 samples
= self
.read(i
, block_size
)
237 local_max_index
= numpy
.argmax(samples
)
238 local_max_value
= samples
[local_max_index
]
240 if local_max_value
> max_value
:
241 max_value
= local_max_value
242 max_index
= local_max_index
244 local_min_index
= numpy
.argmin(samples
)
245 local_min_value
= samples
[local_min_index
]
247 if local_min_value
< min_value
:
248 min_value
= local_min_value
249 min_index
= local_min_index
251 return (min_value
, max_value
) if min_index
< max_index
else (max_value
, min_value
)
254 def create_spectrogram_image(source_filename
, output_filename
,
255 image_size
, fft_size
, progress_callback
=None):
257 processor
= AudioProcessor(source_filename
, fft_size
, numpy
.hamming
)
258 samples_per_pixel
= processor
.audio_file
.nframes
/ float(image_size
[0])
260 spectrogram
= SpectrogramImage(image_size
, fft_size
)
262 for x
in range(image_size
[0]):
263 if progress_callback
and x
% (image_size
[0] / 10) == 0:
264 progress_callback((x
* 100) / image_size
[0])
266 seek_point
= int(x
* samples_per_pixel
)
267 next_seek_point
= int((x
+ 1) * samples_per_pixel
)
269 (spectral_centroid
, db_spectrum
) = processor
.spectral_centroid(seek_point
)
271 spectrogram
.draw_spectrum(x
, db_spectrum
)
273 if progress_callback
:
274 progress_callback(100)
276 spectrogram
.save(output_filename
)
279 def interpolate_colors(colors
, flat
=False, num_colors
=256):
283 for i
in range(num_colors
):
284 # TODO: What does this do?
287 (len(colors
) - 1) # 7
288 ) # 0..7..14..21..28...
290 (num_colors
- 1.0) # 255.0
293 # TODO: What is the meaning of 'alpha' in this context?
294 alpha
= index
- round(index
)
296 channels
= list('rgb')
299 for k
, v
in zip(range(len(channels
)), channels
):
304 colors
[int(index
)][k
]
306 alpha
* colors
[int(index
) + 1][k
]
312 colors
[int(index
)][k
]
317 tuple(int(values
[i
]) for i
in channels
))
320 tuple(int(values
[i
]) for i
in channels
))
325 def get_max_level(filename
):
328 audio_file
= audiolab
.Sndfile(filename
, 'r')
329 n_samples_left
= audio_file
.nframes
331 while n_samples_left
:
332 to_read
= min(buffer_size
, n_samples_left
)
335 samples
= audio_file
.read_frames(to_read
)
337 # this can happen with a broken header
340 # convert to mono by selecting left channel only
341 if audio_file
.channels
> 1:
342 samples
= samples
[:,0]
344 max_value
= max(max_value
, numpy
.abs(samples
).max())
346 n_samples_left
-= to_read
352 if __name__
== '__main__':
354 sys
.argv
[4] = int(sys
.argv
[4])
355 sys
.argv
[3] = tuple([int(i
) for i
in sys
.argv
[3].split('x')])
357 create_spectrogram_image(*sys
.argv
[1:])