Fix audio thumbnailing once and for all.
[mediagoblin.git] / extlib / freesound / audioprocessing.py
1 #!/usr/bin/env python
2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #
19 # Authors:
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
22
23 from PIL import Image, ImageDraw, ImageColor #@UnresolvedImport
24 from functools import partial
25 import math
26 import numpy
27 import os
28 import re
29 import signal
30
31
def get_sound_type(input_filename):
    """
    Return the audio type of a file, derived from its extension.

    The extension is lowercased and stripped of its leading dot; the
    shorthand extensions "fla" and "aif" are normalized to "flac" and
    "aiff" respectively.
    """
    extension = os.path.splitext(input_filename.lower())[1].strip(".")

    # normalize the common shorthand extensions
    aliases = {"fla": "flac", "aif": "aiff"}

    return aliases.get(extension, extension)
41
42
try:
    import scikits.audiolab as audiolab
except ImportError:
    # BUGFIX: the original used a Python 2 print *statement*, which is a
    # SyntaxError on Python 3; the parenthesized call prints identically
    # on Python 2 (single argument) and Python 3.
    print("WARNING: audiolab is not installed so wav2png will not work")
import subprocess
48
class AudioProcessingException(Exception):
    """Raised when decoding, converting or analyzing an audio file fails."""
51
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples."""

    def __init__(self, num_frames, has_broken_header=False):
        # current read position, in frames
        self.seekpoint = 0
        # total number of frames (attribute name matches audiolab's Sndfile)
        self.nframes = num_frames
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        """Move the read position to an absolute frame offset."""
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        """
        Return up to frames_to_read random samples in [-1, 1).

        Raises RuntimeError past the half-way point when the simulated
        header is broken, mirroring how audiolab fails on corrupt files.
        """
        # BUGFIX: the original read self.num_frames, which is never set
        # (__init__ stores the count as self.nframes), so every call
        # raised AttributeError.
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        return numpy.random.random(will_read) * 2 - 1
74
75
def get_max_level(filename):
    """
    Return the maximum absolute sample value found in the file.

    The file is read in chunks; stereo files are reduced to their left
    channel. A broken header simply stops the scan early.
    """
    chunk_size = 4096
    peak = 0
    audio_file = audiolab.Sndfile(filename, 'r')
    remaining = audio_file.nframes

    while remaining:
        to_read = min(chunk_size, remaining)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        peak = max(peak, numpy.abs(samples).max())
        remaining -= to_read

    audio_file.close()

    return peak
102
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio an calculates the spectral centroid and the peak
    samples in that chunk of audio.
    """

    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        """
        Open input_filename and prepare the FFT window and normalization
        scale so that a full-scale DC signal maps to a spectrum of 1.0.
        """
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        self.lower = 100            # Hz, lower bound for centroid clipping
        self.higher = 22050         # Hz, upper bound for centroid clipping
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # BUGFIX: the original passed axis=1, which modern numpy
                # rejects for 1-D arrays; the default axis 0 is correct here.
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples))

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # // keeps the offset an int on Python 3 (same value as Python 2's /)
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid

            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if start_seek < 0:
            start_seek = 0

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
252
253
def interpolate_colors(colors, flat=False, num_colors=256):
    """
    Expand a short list of colors into num_colors colors by linear
    interpolation between neighbours.

    Only the first three components (r, g, b) of each input color are
    used, even if the inputs carry an alpha channel. When flat is True a
    single flat list of ints is returned; otherwise a list of (r, g, b)
    tuples.
    """
    palette = []
    span = len(colors) - 1

    for i in range(num_colors):
        # fractional position of this output color along the input list
        position = (i * span) / (num_colors - 1.0)
        lower_index = int(position)
        alpha = position - float(lower_index)
        lower = colors[lower_index]

        if alpha > 0:
            upper = colors[lower_index + 1]
            channels = [(1.0 - alpha) * lower[c] + alpha * upper[c] for c in (0, 1, 2)]
        else:
            # exactly on an input color; no upper neighbour needed
            channels = [(1.0 - alpha) * lower[c] for c in (0, 1, 2)]

        if flat:
            palette.extend(int(value) for value in channels)
        else:
            palette.append(tuple(int(value) for value in channels))

    return palette
282
283
def desaturate(rgb, amount):
    """
    Desaturate an (r, g, b) color toward its average luminosity.

    amount == 0 leaves the color unchanged; amount == 1 yields grey.
    """
    luminosity = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - luminosity)) for channel in rgb)
294
295
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """

    def __init__(self, image_width, image_height, palette=1):
        """
        Create the image canvas and build a 256-entry color lookup table
        for the chosen palette (1-4). Raises AudioProcessingException for
        even heights and unknown palettes.
        """
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        if palette == 1:
            background_color = (0, 0, 0)
            colors = [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]
        elif palette == 2:
            background_color = (0, 0, 0)
            colors = [self.color_from_value(value / 29.0) for value in range(0, 30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            # BUGFIX: map() returns an iterator on Python 3, but
            # interpolate_colors() calls len() on its argument -- wrap in list()
            colors = list(map(partial(desaturate, amount=0.7), [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
            ]))
        elif palette == 4:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.8),
                              [self.color_from_value(value / 29.0) for value in range(0, 30)]))
        else:
            # previously an unknown palette crashed later with a NameError
            # on background_color; fail early with a clear message instead
            raise AudioProcessingException("unknown palette: %s" % palette)

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """

        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int((1.0 - value) * 360), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """

        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid * 255.0)]

        # connect to the previous column so the waveform is continuous
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """

        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_max_int + 1] = (r, g, b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_min_int - 1] = (r, g, b)

    def save(self, filename):
        """Brighten the center line and write the image to filename."""
        # draw a zero "zero" line
        a = 25
        # // keeps the pixel row index an int on Python 3
        for x in range(self.image_width):
            self.pix[x, self.image_height // 2] = tuple(map(lambda p: p + a, self.pix[x, self.image_height // 2]))

        self.image.save(filename)
396
397
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as PNG.
    """

    def __init__(self, image_width, image_height, fft_size):
        """
        Prepare the (rotated) canvas, the color palette and the
        y-coordinate -> fft-bin lookup used while drawing.
        """
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        # the image is built rotated 90 degrees (width and height swapped)
        # and rotated back on save -- see the note about putdata below
        self.image = Image.new("RGBA", (image_height, image_width))

        colors = [
            (0, 0, 0, 0),
            (58 / 4, 68 / 4, 65 / 4, 255),
            (80 / 2, 100 / 2, 153 / 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255)
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        half_fft = self.fft_size // 2  # // keeps this an int on Python 3
        for y in range(self.image_height):
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) * (y_max - y_min))
            # fractional fft bin this pixel row samples from
            # (renamed from "bin", which shadowed the builtin)
            fft_bin = freq / 22050.0 * (half_fft + 1)

            if fft_bin < half_fft:
                alpha = fft_bin - int(fft_bin)

                self.y_to_bin.append((int(fft_bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        """Append one column of palette pixels for the given spectrum."""
        # for all frequencies, draw the pixels
        for (index, alpha) in self.y_to_bin:
            self.pixels.append(self.palette[int((255.0 - alpha) * spectrum[index] + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):  # @UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        """Rotate the pixel buffer back upright and save as JPEG."""
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
454
455
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.

    progress_callback, when given, is called with an int percentage
    roughly every 10% of the columns and once with 100 at the end.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    samples_per_pixel = processor.audio_file.nframes / float(image_width)

    waveform = WaveformImage(image_width, image_height)
    spectrogram = SpectrogramImage(image_width, image_height, fft_size)

    # BUGFIX: image_width / 10 was 0 for widths below 10, making the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1
    progress_step = max(1, image_width // 10)

    for x in range(image_width):

        if progress_callback and x % progress_step == 0:
            # integer division keeps the reported percentage an int on Python 3
            progress_callback((x * 100) // image_width)

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)
        peaks = processor.peaks(seek_point, next_seek_point)

        waveform.draw_peaks(x, peaks, spectral_centroid)
        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    waveform.save(output_filename_w)
    spectrogram.save(output_filename_s)
485
486
class NoSpaceLeftException(Exception):
    """Raised when an external converter reports a full disk."""
489
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns True on success and False when the input type is not one of
    the handled formats (mp3, ogg, flac). Raises NoSpaceLeftException on
    a full disk and AudioProcessingException on any other decoder failure.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    # one external decoder command line per supported type
    decoders = {
        "mp3": ["lame", "--decode", input_filename, output_filename],
        "ogg": ["oggdec", input_filename, "-o", output_filename],
        "flac": ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename],
    }

    try:
        cmd = decoders[sound_type]
    except KeyError:
        return False

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
518
519
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Returns a dict with duration, channels, samplerate, bitrate and
    bitdepth parsed from the #-prefixed fields stereofy prints.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n" + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy may print on either stream; flatten everything to one line
    stdout = (stdout + " " + stderr).replace("\n", " ")

    def parse_field(pattern, default):
        # pull one "#name value" field out of the flattened output
        m = re.match(pattern, stdout)
        return float(m.group(1)) if m is not None else default

    duration = parse_field(r".*#duration (?P<duration>[\d\.]+).*", 0)
    channels = parse_field(r".*#channels (?P<channels>\d+).*", 0)
    samplerate = parse_field(r".*#samplerate (?P<samplerate>\d+).*", 0)
    bitdepth = parse_field(r".*#bitdepth (?P<bitdepth>\d+).*", None)

    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
563
564
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is lame's average bitrate (--abr) in kbit/s.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    encoder_cmd = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    encoder = subprocess.Popen(encoder_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = encoder.communicate()

    if encoder.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
580
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to n ogg file

    quality is oggenc's -q setting.
    """

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    encoder_cmd = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    encoder = subprocess.Popen(encoder_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = encoder.communicate()

    if encoder.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
596
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming wave file to stereo pcm using fffmpeg

    The conversion is aborted with an AudioProcessingException if ffmpeg
    takes longer than three minutes (enforced via SIGALRM).
    """
    TIMEOUT = 3 * 60

    def alarm_handler(signum, frame):
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["ffmpeg", "-y", "-i", input_filename, "-ac", "1", "-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    previous_handler = signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(TIMEOUT)
    try:
        (stdout, stderr) = process.communicate()
    finally:
        # BUGFIX: always cancel the alarm and restore the previous SIGALRM
        # handler -- the original leaked both when the timeout fired, leaving
        # the process-wide handler pointing at this function
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)