#!/usr/bin/env python3
"""Extract per-frame MFCC features from an audio file and save them as JSON.

Usage:
    <script> input_file [output_file]

When ``output_file`` is omitted, the features are written to the input path
with its extension replaced by ``-features.json``.
"""
import os
import sys

import essentia
import essentia.standard
import essentia.streaming
from essentia.standard import *  # noqa: F401,F403 - provides the algorithm names used below

# Framing parameters passed to FrameGenerator.
FRAME_SIZE = 1024
HOP_SIZE = 512


def resolve_filenames(argv):
    """Return ``(input_audio_filename, output_features_filename)`` from *argv*.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout (program name
            first).

    Returns:
        A tuple of the input audio path and the output features path. With a
        single argument the output path is derived from the input path.

    Raises:
        SystemExit: With status 1, after printing a usage message to stderr,
            when no input file is given.
    """
    if len(argv) <= 1:
        print("Usage: " + argv[0] + " input_file [output_file]\n", file=sys.stderr)
        sys.exit(1)

    input_audio_filename = argv[1]
    if len(argv) == 2:
        output_features_filename = (
            os.path.splitext(input_audio_filename)[0] + '-features.json'
        )
    else:
        output_features_filename = argv[2]
    return input_audio_filename, output_features_filename


def extract_features(audio):
    """Compute MFCC features for every frame of *audio* and collect them in a Pool.

    Each frame is windowed (Hann), turned into a spectrum, and passed to MFCC.
    Three descriptors are appended to the pool per frame:

    * ``lowlevel.mfcc``           - the MFCC coefficients
    * ``lowlevel.mfcc_bands``     - the band values returned by MFCC
    * ``lowlevel.mfcc_bands_log`` - those band values passed through the
      ``log`` UnaryOperator

    Args:
        audio: The audio signal as returned by the loader.

    Returns:
        An ``essentia.Pool`` holding the descriptors listed above.
    """
    w = Windowing(type='hann')
    # FFT() would return the complex FFT, here we just want the magnitude spectrum
    spectrum = Spectrum()
    mfcc = MFCC()
    log_norm = UnaryOperator(type='log')

    pool = essentia.Pool()
    for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE,
                                startFromZero=True):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)
        pool.add('lowlevel.mfcc_bands_log', log_norm(mfcc_bands))
    return pool


def main():
    """Load the input audio, extract MFCC features and write them as JSON."""
    input_audio_filename, output_features_filename = resolve_filenames(sys.argv)

    # Prepare to process the file
    loader = essentia.standard.MonoLoader(filename=input_audio_filename)
    audio = loader()

    print("Processing Frames")
    # The features are computed in a single pass straight into the pool; the
    # earlier list-based pass produced arrays that nothing consumed.
    pool = extract_features(audio)

    # Optional visualisation (needs pylab/matplotlib), for example:
    #   imshow(pool['lowlevel.mfcc_bands'].T, aspect='auto', origin='lower', interpolation='none')
    #   imshow(pool['lowlevel.mfcc_bands_log'].T, aspect='auto', origin='lower', interpolation='none')
    #   imshow(pool['lowlevel.mfcc'].T[1:, :], aspect='auto', origin='lower', interpolation='none')

    # format='json': otherwise defaults to YAML (.sig)
    output = YamlOutput(filename=output_features_filename, format='json')
    output(pool)


if __name__ == "__main__":
    main()