2 # processing.py -- various audio processing functions
3 # Copyright (C) 2008 MUSIC TECHNOLOGY GROUP (MTG)
4 # UNIVERSITAT POMPEU FABRA
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 # Bram de Jong <bram.dejong at domain.com where domain in gmail>
21 # 2012, Joar Wandborg <first name at last name dot se>
import math
import os
import re
import signal
import subprocess
from functools import partial

import numpy

import Image, ImageDraw, ImageColor  #@UnresolvedImport
def get_sound_type(input_filename):
    """Return a normalized, lowercase extension for *input_filename*.

    The leading dot is stripped and the common short forms "fla" and
    "aif" are expanded to their canonical names "flac" and "aiff".
    """
    sound_type = os.path.splitext(input_filename.lower())[1].strip(".")

    # expand abbreviated extensions to the canonical names used by the
    # converters below
    if sound_type == "fla":
        sound_type = "flac"
    elif sound_type == "aif":
        sound_type = "aiff"

    return sound_type
# audiolab is an optional dependency: without it wav2png cannot decode
# audio, but the rest of the module remains importable.
try:
    import scikits.audiolab as audiolab
except ImportError:
    # parenthesized print works under both Python 2 and 3
    print("WARNING: audiolab is not installed so wav2png will not work")
class AudioProcessingException(Exception):
    """Raised when decoding or processing an audio file fails."""
    pass
class TestAudioFile(object):
    """A class that mimics audiolab.sndfile but generates noise instead of reading
    a wave file. Additionally it can be told to have a "broken" header and thus crashing
    in the middle of the file. Also useful for testing ultra-short files of 20 samples."""
    def __init__(self, num_frames, has_broken_header=False):
        self.seekpoint = 0
        # NOTE: read_frames must use the same attribute name set here;
        # the original mixed self.nframes and self.num_frames, which
        # raised AttributeError on the first read.
        self.nframes = num_frames
        self.samplerate = 44100
        self.channels = 1
        self.has_broken_header = has_broken_header

    def seek(self, seekpoint):
        self.seekpoint = seekpoint

    def read_frames(self, frames_to_read):
        # simulate a corrupt header by blowing up once we read past the
        # middle of the file (mirrors what audiolab does)
        if self.has_broken_header and self.seekpoint + frames_to_read > self.nframes / 2:
            raise RuntimeError()

        num_frames_left = self.nframes - self.seekpoint
        will_read = num_frames_left if num_frames_left < frames_to_read else frames_to_read
        self.seekpoint += will_read
        # uniform noise in [-1, 1), like normalized audio samples
        return numpy.random.random(will_read) * 2 - 1
def get_max_level(filename):
    """Return the maximum absolute sample value found in *filename*.

    The file is read in fixed-size blocks to keep memory use constant;
    for multi-channel files only the left channel is inspected.
    """
    max_value = 0
    buffer_size = 4096
    audio_file = audiolab.Sndfile(filename, 'r')
    n_samples_left = audio_file.nframes

    while n_samples_left:
        to_read = min(buffer_size, n_samples_left)

        try:
            samples = audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen with a broken header
            break

        # convert to mono by selecting left channel only
        if audio_file.channels > 1:
            samples = samples[:, 0]

        max_value = max(max_value, numpy.abs(samples).max())

        n_samples_left -= to_read

    audio_file.close()

    return max_value
class AudioProcessor(object):
    """
    The audio processor processes chunks of audio and calculates the spectral centroid and the peak
    samples in that chunk of audio.
    """
    def __init__(self, input_filename, fft_size, window_function=numpy.hanning):
        max_level = get_max_level(input_filename)

        self.audio_file = audiolab.Sndfile(input_filename, 'r')
        self.fft_size = fft_size
        self.window = window_function(self.fft_size)
        self.spectrum_range = None
        # frequency band (Hz) onto which the spectral centroid is mapped
        self.lower = 100
        self.higher = 22050
        self.lower_log = math.log10(self.lower)
        self.higher_log = math.log10(self.higher)
        self.clip = lambda val, low, high: min(high, max(low, val))

        # figure out what the maximum value is for an FFT doing the FFT of a DC signal
        fft = numpy.fft.rfft(numpy.ones(fft_size) * self.window)
        max_fft = (numpy.abs(fft)).max()
        # set the scale to normalized audio and normalized FFT
        self.scale = 1.0 / max_level / max_fft if max_level > 0 else 1

    def read(self, start, size, resize_if_less=False):
        """ read size samples starting at start, if resize_if_less is True and less than size
        samples are read, resize the array to size and fill with zeros """

        # number of zeros to add to start and end of the buffer
        add_to_start = 0
        add_to_end = 0

        if start < 0:
            # the first FFT window starts centered around zero
            if size + start <= 0:
                return numpy.zeros(size) if resize_if_less else numpy.array([])
            else:
                self.audio_file.seek(0)

                add_to_start = -start  # remember: start is negative!
                to_read = size + start

                if to_read > self.audio_file.nframes:
                    add_to_end = to_read - self.audio_file.nframes
                    to_read = self.audio_file.nframes
        else:
            self.audio_file.seek(start)

            to_read = size
            if start + to_read >= self.audio_file.nframes:
                to_read = self.audio_file.nframes - start
                add_to_end = size - to_read

        try:
            samples = self.audio_file.read_frames(to_read)
        except RuntimeError:
            # this can happen for wave files with broken headers...
            return numpy.zeros(size) if resize_if_less else numpy.zeros(2)

        # convert to mono by selecting left channel only
        if self.audio_file.channels > 1:
            samples = samples[:, 0]

        if resize_if_less and (add_to_start > 0 or add_to_end > 0):
            if add_to_start > 0:
                # samples is 1-D here, so concatenate along axis 0
                # (the original axis=1 is rejected by modern numpy)
                samples = numpy.concatenate((numpy.zeros(add_to_start), samples), axis=0)

            if add_to_end > 0:
                samples = numpy.resize(samples, size)
                samples[size - add_to_end:] = 0

        return samples

    def spectral_centroid(self, seek_point, spec_range=110.0):
        """ starting at seek_point read fft_size samples, and calculate the spectral centroid """

        # // keeps the offset integral on both Python 2 and 3
        samples = self.read(seek_point - self.fft_size // 2, self.fft_size, True)

        samples *= self.window
        fft = numpy.fft.rfft(samples)
        spectrum = self.scale * numpy.abs(fft)  # normalized abs(FFT) between 0 and 1
        length = numpy.float64(spectrum.shape[0])

        # scale the db spectrum from [- spec_range db ... 0 db] > [0..1]
        db_spectrum = ((20 * (numpy.log10(spectrum + 1e-60))).clip(-spec_range, 0.0) + spec_range) / spec_range

        energy = spectrum.sum()
        spectral_centroid = 0

        if energy > 1e-60:
            # calculate the spectral centroid
            if self.spectrum_range is None:
                self.spectrum_range = numpy.arange(length)

            spectral_centroid = (spectrum * self.spectrum_range).sum() / (energy * (length - 1)) * self.audio_file.samplerate * 0.5

            # clip > log10 > scale between 0 and 1
            spectral_centroid = (math.log10(self.clip(spectral_centroid, self.lower, self.higher)) - self.lower_log) / (self.higher_log - self.lower_log)

        return (spectral_centroid, db_spectrum)

    def peaks(self, start_seek, end_seek):
        """ read all samples between start_seek and end_seek, then find the minimum and maximum peak
        in that range. Returns that pair in the order they were found. So if min was found first,
        it returns (min, max) else the other way around. """

        # larger blocksizes are faster but take more mem...
        # Aha, Watson, a clue, a tradeof!
        block_size = 4096

        max_index = -1
        max_value = -1
        min_index = -1
        min_value = 1

        if end_seek > self.audio_file.nframes:
            end_seek = self.audio_file.nframes

        if end_seek <= start_seek:
            samples = self.read(start_seek, 1)
            return (samples[0], samples[0])

        if block_size > end_seek - start_seek:
            block_size = end_seek - start_seek

        for i in range(start_seek, end_seek, block_size):
            samples = self.read(i, block_size)

            local_max_index = numpy.argmax(samples)
            local_max_value = samples[local_max_index]

            if local_max_value > max_value:
                max_value = local_max_value
                max_index = local_max_index

            local_min_index = numpy.argmin(samples)
            local_min_value = samples[local_min_index]

            if local_min_value < min_value:
                min_value = local_min_value
                min_index = local_min_index

        return (min_value, max_value) if min_index < max_index else (max_value, min_value)
def interpolate_colors(colors, flat=False, num_colors=256):
    """ given a list of colors, create a larger list of colors interpolating
    the first one. If flatten is True a list of numbers will be returned. If
    False, a list of (r,g,b) tuples. num_colors is the number of colors wanted
    in the final list """

    palette = []

    for i in range(num_colors):
        # fractional position of this output color within the input list
        index = (i * (len(colors) - 1)) / (num_colors - 1.0)
        index_int = int(index)
        alpha = index - float(index_int)

        if alpha > 0:
            # blend between two adjacent input colors
            r = (1.0 - alpha) * colors[index_int][0] + alpha * colors[index_int + 1][0]
            g = (1.0 - alpha) * colors[index_int][1] + alpha * colors[index_int + 1][1]
            b = (1.0 - alpha) * colors[index_int][2] + alpha * colors[index_int + 1][2]
        else:
            # exactly on an input color; no neighbor needed (also avoids
            # indexing past the end for the last entry)
            r = (1.0 - alpha) * colors[index_int][0]
            g = (1.0 - alpha) * colors[index_int][1]
            b = (1.0 - alpha) * colors[index_int][2]

        if flat:
            palette.extend((int(r), int(g), int(b)))
        else:
            palette.append((int(r), int(g), int(b)))

    return palette
def desaturate(rgb, amount):
    """
    desaturate colors by amount
    amount == 0, no change
    amount == 1, fully grey
    """
    # pull every channel toward the mean channel value by *amount*
    luminosity = sum(rgb) / 3.0
    return tuple(int(channel - amount * (channel - luminosity)) for channel in rgb)
class WaveformImage(object):
    """
    Given peaks and spectral centroids from the AudioProcessor, this class will construct
    a wavefile image which can be saved as PNG.
    """
    def __init__(self, image_width, image_height, palette=1):
        if image_height % 2 == 0:
            raise AudioProcessingException("Height should be uneven: images look much better at uneven height")

        # palettes 1/2 draw on black, 3/4 on light grey; 2/4 use the full
        # hue ramp from color_from_value, 3/4 are desaturated variants
        # NOTE(review): the explicit color triples below were reconstructed
        # from the upstream source of this file — confirm against the
        # original before relying on exact colors.
        if palette == 1:
            background_color = (0, 0, 0)
            colors = [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
                (255, 70, 0),
            ]
        elif palette == 2:
            background_color = (0, 0, 0)
            colors = [self.color_from_value(value / 29.0) for value in range(0, 30)]
        elif palette == 3:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.7), [
                (50, 0, 200),
                (0, 220, 80),
                (255, 224, 0),
            ]))
        elif palette == 4:
            background_color = (213, 217, 221)
            colors = list(map(partial(desaturate, amount=0.8),
                              [self.color_from_value(value / 29.0) for value in range(0, 30)]))

        self.image = Image.new("RGB", (image_width, image_height), background_color)

        self.image_width = image_width
        self.image_height = image_height

        self.draw = ImageDraw.Draw(self.image)
        self.previous_x, self.previous_y = None, None

        self.color_lookup = interpolate_colors(colors)
        self.pix = self.image.load()

    def color_from_value(self, value):
        """ given a value between 0 and 1, return an (r,g,b) tuple """
        return ImageColor.getrgb("hsl(%d,%d%%,%d%%)" % (int((1.0 - value) * 360), 80, 50))

    def draw_peaks(self, x, peaks, spectral_centroid):
        """ draw 2 peaks at x using the spectral_centroid for color """
        y1 = self.image_height * 0.5 - peaks[0] * (self.image_height - 4) * 0.5
        y2 = self.image_height * 0.5 - peaks[1] * (self.image_height - 4) * 0.5

        line_color = self.color_lookup[int(spectral_centroid * 255.0)]

        # connect to the previous column so the waveform is continuous
        if self.previous_y is not None:
            self.draw.line([self.previous_x, self.previous_y, x, y1, x, y2], line_color)
        else:
            self.draw.line([x, y1, x, y2], line_color)

        self.previous_x, self.previous_y = x, y2

        self.draw_anti_aliased_pixels(x, y1, y2, line_color)

    def draw_anti_aliased_pixels(self, x, y1, y2, color):
        """ vertical anti-aliasing at y1 and y2 """
        y_max = max(y1, y2)
        y_max_int = int(y_max)
        alpha = y_max - y_max_int

        if alpha > 0.0 and alpha < 1.0 and y_max_int + 1 < self.image_height:
            # blend the existing pixel below the line with the line color
            current_pix = self.pix[x, y_max_int + 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_max_int + 1] = (r, g, b)

        y_min = min(y1, y2)
        y_min_int = int(y_min)
        alpha = 1.0 - (y_min - y_min_int)

        if alpha > 0.0 and alpha < 1.0 and y_min_int - 1 >= 0:
            # blend the existing pixel above the line with the line color
            current_pix = self.pix[x, y_min_int - 1]

            r = int((1 - alpha) * current_pix[0] + alpha * color[0])
            g = int((1 - alpha) * current_pix[1] + alpha * color[1])
            b = int((1 - alpha) * current_pix[2] + alpha * color[2])

            self.pix[x, y_min_int - 1] = (r, g, b)

    def save(self, filename):
        # draw a zero "zero" line by brightening the middle row
        a = 25
        for x in range(self.image_width):
            self.pix[x, self.image_height // 2] = tuple(
                map(lambda p: p + a, self.pix[x, self.image_height // 2]))

        self.image.save(filename)
class SpectrogramImage(object):
    """
    Given spectra from the AudioProcessor, this class will construct a wavefile image which
    can be saved as JPEG.
    """
    def __init__(self, image_width, image_height, fft_size):
        self.image_width = image_width
        self.image_height = image_height
        self.fft_size = fft_size

        # drawn rotated: built as (height x width) and rotated 90 degrees on save
        self.image = Image.new("RGBA", (image_height, image_width))

        # NOTE(review): the color ramp entries beyond the two visible in the
        # original were reconstructed from the upstream source — confirm.
        # interpolate_colors only reads the first three components, so the
        # alpha values here are effectively ignored.
        colors = [
            (0, 0, 0, 0),
            (58 // 4, 68 // 4, 65 // 4, 255),
            (80 // 2, 100 // 2, 153 // 2, 255),
            (90, 180, 100, 255),
            (224, 224, 44, 255),
            (255, 60, 30, 255),
            (255, 255, 255, 255),
        ]
        self.palette = interpolate_colors(colors)

        # generate the lookup which translates y-coordinate to fft-bin
        self.y_to_bin = []
        f_min = 100.0
        f_max = 22050.0
        y_min = math.log10(f_min)
        y_max = math.log10(f_max)
        for y in range(self.image_height):
            # logarithmic frequency axis from f_min to f_max
            freq = math.pow(10.0, y_min + y / (image_height - 1.0) * (y_max - y_min))
            bin = freq / 22050.0 * (self.fft_size / 2 + 1)

            if bin < self.fft_size / 2:
                alpha = bin - int(bin)

                self.y_to_bin.append((int(bin), alpha * 255))

        # this is a bit strange, but using image.load()[x,y] = ... is
        # a lot slower than using image.putadata and then rotating the image
        # so we store all the pixels in an array and then create the image when saving
        self.pixels = []

    def draw_spectrum(self, x, spectrum):
        # for all frequencies, draw the pixels
        for (index, alpha) in self.y_to_bin:
            self.pixels.append(self.palette[int((255.0 - alpha) * spectrum[index] + alpha * spectrum[index + 1])])

        # if the FFT is too small to fill up the image, fill with black to the top
        for y in range(len(self.y_to_bin), self.image_height):  #@UnusedVariable
            self.pixels.append(self.palette[0])

    def save(self, filename, quality=80):
        assert filename.lower().endswith(".jpg")
        self.image.putdata(self.pixels)
        self.image.transpose(Image.ROTATE_90).save(filename, quality=quality)
def create_wave_images(input_filename, output_filename_w, output_filename_s, image_width, image_height, fft_size, progress_callback=None):
    """
    Utility function for creating both wavefile and spectrum images from an audio input file.
    """
    processor = AudioProcessor(input_filename, fft_size, numpy.hanning)
    samples_per_pixel = processor.audio_file.nframes / float(image_width)

    waveform = WaveformImage(image_width, image_height)
    spectrogram = SpectrogramImage(image_width, image_height, fft_size)

    # report progress roughly every 10% of columns; max(1, ...) guards
    # against a zero modulus for images narrower than 10 pixels
    progress_step = max(1, image_width // 10)

    for x in range(image_width):
        if progress_callback and x % progress_step == 0:
            progress_callback((x * 100) // image_width)

        seek_point = int(x * samples_per_pixel)
        next_seek_point = int((x + 1) * samples_per_pixel)

        (spectral_centroid, db_spectrum) = processor.spectral_centroid(seek_point)
        peaks = processor.peaks(seek_point, next_seek_point)

        waveform.draw_peaks(x, peaks, spectral_centroid)
        spectrogram.draw_spectrum(x, db_spectrum)

    if progress_callback:
        progress_callback(100)

    waveform.save(output_filename_w)
    spectrogram.save(output_filename_s)
class NoSpaceLeftException(Exception):
    """Raised when an external converter fails because the disk is full."""
    pass
def convert_to_pcm(input_filename, output_filename):
    """
    converts any audio file type to pcm audio

    Returns False when the extension of input_filename is not one of the
    supported types (mp3/ogg/flac), True after a successful conversion.
    Raises AudioProcessingException when the decoder fails and
    NoSpaceLeftException when the disk fills up.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    sound_type = get_sound_type(input_filename)

    if sound_type == "mp3":
        cmd = ["lame", "--decode", input_filename, output_filename]
    elif sound_type == "ogg":
        cmd = ["oggdec", input_filename, "-o", output_filename]
    elif sound_type == "flac":
        cmd = ["flac", "-f", "-d", "-s", "-o", output_filename, input_filename]
    else:
        # not a type we can decode; the caller is expected to fall back
        return False

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed converting to pcm data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    return True
def stereofy_and_find_info(stereofy_executble_path, input_filename, output_filename):
    """
    converts a pcm wave file to two channel, 16 bit integer

    Runs the external "stereofy" tool and parses its output for duration,
    channel count, sample rate and bit depth; the bitrate is derived from
    the input file size. Returns these five values as a dict.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    cmd = [stereofy_executble_path, "--input", input_filename, "--output", output_filename]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        if "No space left on device" in stderr + " " + stdout:
            raise NoSpaceLeftException
        raise AudioProcessingException("failed calling stereofy data:\n"
                                       + " ".join(cmd) + "\n" + stderr + "\n" + stdout)

    # stereofy reports its findings on stdout/stderr; flatten to one line
    stdout = (stdout + " " + stderr).replace("\n", " ")

    duration = 0
    m = re.match(r".*#duration (?P<duration>[\d\.]+).*", stdout)
    if m is not None:
        duration = float(m.group("duration"))

    channels = 0
    m = re.match(r".*#channels (?P<channels>\d+).*", stdout)
    if m is not None:
        channels = float(m.group("channels"))

    samplerate = 0
    m = re.match(r".*#samplerate (?P<samplerate>\d+).*", stdout)
    if m is not None:
        samplerate = float(m.group("samplerate"))

    bitdepth = None
    m = re.match(r".*#bitdepth (?P<bitdepth>\d+).*", stdout)
    if m is not None:
        bitdepth = float(m.group("bitdepth"))

    bitrate = (os.path.getsize(input_filename) * 8.0) / 1024.0 / duration if duration > 0 else 0

    return dict(duration=duration, channels=channels, samplerate=samplerate, bitrate=bitrate, bitdepth=bitdepth)
def convert_to_mp3(input_filename, output_filename, quality=70):
    """
    converts the incoming wave file to a mp3 file

    quality is passed to lame's --abr (average bitrate) option.
    Raises AudioProcessingException when lame fails.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["lame", "--silent", "--abr", str(quality), input_filename, output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_to_ogg(input_filename, output_filename, quality=1):
    """
    converts the incoming wave file to an ogg file

    quality is passed to oggenc's -q option.
    Raises AudioProcessingException when oggenc fails.
    """
    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["oggenc", "-q", str(quality), input_filename, "-o", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = process.communicate()

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)
def convert_using_ffmpeg(input_filename, output_filename):
    """
    converts the incoming audio file to mono 44.1 kHz 16-bit pcm using ffmpeg

    A SIGALRM-based timeout (module-level constant TIMEOUT, in seconds)
    aborts the conversion if ffmpeg hangs.
    """
    def alarm_handler(signum, frame):
        # fired by SIGALRM when ffmpeg takes longer than TIMEOUT seconds
        raise AudioProcessingException("timeout while waiting for ffmpeg")

    if not os.path.exists(input_filename):
        raise AudioProcessingException("file %s does not exist" % input_filename)

    command = ["ffmpeg", "-y", "-i", input_filename, "-ac", "1", "-acodec", "pcm_s16le", "-ar", "44100", output_filename]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(TIMEOUT)
    (stdout, stderr) = process.communicate()
    # cancel the pending alarm so it cannot fire after we are done
    signal.alarm(0)

    if process.returncode != 0 or not os.path.exists(output_filename):
        raise AudioProcessingException(stdout)