| 1 | # processing.py -- various audio processing functions |
| 2 | # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG) |
| 3 | # UNIVERSITAT POMPEU FABRA |
| 4 | # |
| 5 | # This program is free software: you can redistribute it and/or modify |
| 6 | # it under the terms of the GNU Affero General Public License as |
| 7 | # published by the Free Software Foundation, either version 3 of the |
| 8 | # License, or (at your option) any later version. |
| 9 | # |
| 10 | # This program is distributed in the hope that it will be useful, |
| 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | # GNU Affero General Public License for more details. |
| 14 | # |
| 15 | # You should have received a copy of the GNU Affero General Public License |
| 16 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 17 | # |
| 18 | # Authors: |
| 19 | # Bram de Jong <bram.dejong at domain.com where domain in gmail> |
| 20 | # 2012, Joar Wandborg <first name at last name dot se> |
| 21 | |
| 22 | from PIL import Image |
| 23 | import math |
| 24 | import numpy |
| 25 | |
try:
    import scikits.audiolab as audiolab
except ImportError:
    # audiolab is optional at import time: only wav2png-related functions
    # (AudioProcessor, get_max_level) need it, so just warn instead of failing.
    # Parenthesized print works identically under Python 2 and parses under
    # Python 3, unlike the bare py2 print statement used before.
    print("WARNING: audiolab is not installed so wav2png will not work")
| 30 | |
| 31 | |
class AudioProcessingException(Exception):
    """Module-specific exception type for audio-processing errors."""
    pass
| 34 | |
| 35 | |
class SpectrogramImage(object):
    """Accumulates spectrogram pixel columns and writes them out as an image.

    Rows are spaced logarithmically in frequency between 100 Hz and 22050 Hz,
    and each row is mapped to an FFT bin plus a blend weight so neighbouring
    bins can be interpolated.
    """
    def __init__(self, image_size, fft_size):
        self.image_width, self.image_height = image_size
        self.fft_size = fft_size

        # Color gradient stops, dark to bright. The dimmed entries are the
        # original stops pre-divided (58/4, 68/4, 65/4) and (80/2, 100/2, 153/2)
        # using integer division, as in the original Python 2 code.
        gradient = [
            (0, 0, 0, 0),          # fully transparent background
            (14, 17, 16, 255),
            (40, 50, 76, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255),
        ]
        self.palette = interpolate_colors(gradient)

        # Build a lookup table mapping each image row (y) to an FFT bin index
        # and a 0..255 blend weight toward the next bin.
        freq_min = 100.0
        freq_max = 22050.0  # Hz (Nyquist frequency of 44.1 kHz audio)
        log_min = math.log10(freq_min)
        log_max = math.log10(freq_max)

        self.y_to_bin = []
        for row in range(self.image_height):
            # Logarithmic frequency for this row, freq_min at the bottom.
            frac = row / (self.image_height - 1.0)
            freq = 10.0 ** (log_min + frac * (log_max - log_min))
            bin_pos = freq / freq_max * (self.fft_size // 2 + 1)

            # Rows that would fall on/after the last usable bin are skipped
            # here and painted as background in draw_spectrum().
            if bin_pos >= self.fft_size // 2:
                continue

            weight = bin_pos - int(bin_pos)
            self.y_to_bin.append((int(bin_pos), weight * 255))

        # Writing via image.load()[x, y] is much slower than putdata() plus a
        # rotation, so pixels are buffered here and the image is built on save.
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        """Append one column of pixels for the given normalized spectrum."""
        for bin_index, weight in self.y_to_bin:
            # Linear blend between the row's bin and the next one.
            level = ((255.0 - weight) * spectrum[bin_index]
                     + weight * spectrum[bin_index + 1])
            self.pixels.append(self.palette[int(level)])

        # Pad any rows beyond the usable FFT range with the background color.
        missing = self.image_height - len(self.y_to_bin)
        self.pixels.extend([self.palette[0]] * missing)

    def save(self, filename, quality=90):
        # Pixels were collected column by column, so build the image with
        # swapped dimensions and rotate it upright when saving.
        self.image = Image.new(
            'RGBA',
            (self.image_height, self.image_width))
        self.image.putdata(self.pixels)
        rotated = self.image.transpose(Image.ROTATE_90)
        rotated.save(filename, quality=quality)
| 100 | |
| 101 | |
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio and calculates the spectral
    centroid and the peak samples in that chunk of audio.
    """
    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None  # lazily-built bin index array, see spectral_centroid()
        self.lower = 100            # lower frequency bound for the centroid scale (Hz)
        self.higher = 22050         # upper frequency bound for the centroid scale (Hz)
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()

        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """Read `size` samples starting at `start` (may be negative).

        If resize_if_less is True and fewer than `size` samples are
        available, the result is zero-padded (at the start for negative
        `start`, at the end when reading past EOF) to exactly `size`.
        """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # BUG FIX: these are 1-D arrays, so they must be joined along
                # axis 0 — axis=1 raises an AxisError on modern numpy.
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """Starting at seek_point read fft_size samples and calculate the
        spectral centroid.

        Returns (spectral_centroid, db_spectrum) where the centroid is scaled
        to [0, 1] on a log-frequency axis and db_spectrum is the spectrum
        scaled from [-spec_range dB .. 0 dB] to [0 .. 1].
        """

        # window is centered on seek_point, hence the half-size offset
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1

        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        # (1e-60 avoids log10(0) for silent bins)
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """Read all samples between start_seek and end_seek, then find the
        minimum and maximum peak in that range. Returns that pair in the order
        they were found. So if min was found first, it returns (min, max) else
        the other way around.
        """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        # clamp the requested range to the file's actual extent
        if start_seek < 0:
            start_seek = 0

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
| 252 | |
| 253 | |
def create_spectrogram_image(source_filename, output_filename,
                             image_size, fft_size, progress_callback=None):
    """Render a spectrogram of `source_filename` to `output_filename`.

    image_size: (width, height) of the output image in pixels.
    fft_size: FFT window length in samples.
    progress_callback: optional callable invoked with an int percentage
        (0-100) roughly every 10% of the columns, and once with 100 at the end.
    """
    processor = AudioProcessor(source_filename, fft_size, numpy.hamming)
    samples_per_pixel = processor.audio_file.nframes / float(image_size[0])

    spectrogram = SpectrogramImage(image_size, fft_size)

    # BUG FIX: for widths below 10 pixels, image_size[0] // 10 is zero and
    # the modulo below would raise ZeroDivisionError; clamp the step to 1.
    progress_step = max(1, image_size[0] // 10)

    for x in range(image_size[0]):
        if progress_callback and x % progress_step == 0:
            # integer percentage (floor division, as in the original py2 code)
            progress_callback((x * 100) // image_size[0])

        seek_point = int(x * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)

        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    spectrogram.save(output_filename)
| 277 | |
| 278 | |
def interpolate_colors(colors, flat=False, num_colors=256):
    """Build a palette of `num_colors` entries by linearly interpolating
    between the given color stops.

    colors: sequence of (r, g, b) or (r, g, b, a) tuples; only the first
        three channels are used (alpha is dropped, as before).
    flat: if True, return a flat [r, g, b, r, g, b, ...] int list (the
        format PIL's Image.putpalette expects) instead of 3-tuples.
    num_colors: number of palette entries to generate.
    """
    palette = []

    for i in range(num_colors):
        # Fractional position of this entry along the color-stop axis,
        # ranging from 0 to len(colors) - 1 inclusive.
        index = i * (len(colors) - 1) / (num_colors - 1.0)

        # BUG FIX: the previous code used `index - round(index)` as the blend
        # weight, which goes negative past the midpoint between two stops and
        # then multiplied a single stop by (1 - alpha) > 1, producing
        # out-of-range channel values. Use the floor-based fraction instead.
        base = int(index)
        alpha = index - base

        if alpha > 0:
            # blend between stop `base` and the next one
            rgb = tuple(
                int((1.0 - alpha) * colors[base][k] + alpha * colors[base + 1][k])
                for k in range(3))
        else:
            # exactly on a stop: no blend (also avoids reading past the
            # final stop when index == len(colors) - 1)
            rgb = tuple(int(colors[base][k]) for k in range(3))

        if flat:
            palette.extend(rgb)
        else:
            palette.append(rgb)

    return palette
| 323 | |
| 324 | |
def get_max_level(filename):
    """Scan an audio file and return the largest absolute sample value found."""
    chunk_size = 4096
    peak = 0

    sound = audiolab.Sndfile(filename, 'r')
    remaining = sound.nframes

    while remaining:
        chunk = min(chunk_size, remaining)

        try:
            frames = sound.read_frames(chunk)
        except RuntimeError:
            # a broken header can make reads fail mid-file; report what we have
            break

        # convert to mono by selecting left channel only
        if sound.channels > 1:
            frames = frames[:, 0]

        peak = max(peak, numpy.abs(frames).max())
        remaining -= chunk

    sound.close()

    return peak
| 351 | |
if __name__ == '__main__':
    import sys

    # Usage: processing.py <input file> <output file> <width>x<height> <fft size>
    # Previously this mutated sys.argv in place and crashed with an opaque
    # IndexError on a wrong argument count; parse into locals and print usage.
    if len(sys.argv) != 5:
        print("Usage: %s <input file> <output file> <width>x<height> <fft size>" % sys.argv[0])
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    image_size = tuple(int(v) for v in sys.argv[3].split('x'))
    fft_size = int(sys.argv[4])

    create_spectrogram_image(input_file, output_file, image_size, fft_size)