1 | #!/usr/bin/env python3
|
---|
2 |
|
---|
3 | import sys
|
---|
4 | import os
|
---|
5 |
|
---|
6 | import essentia
|
---|
7 | import essentia.standard
|
---|
8 | import essentia.streaming
|
---|
9 |
|
---|
# Command line: input_file [output_file]. Arguments after the second are ignored.
argc = len(sys.argv)

# Guard clause: without an input file there is nothing to do.
if argc <= 1:
    print("Usage: {} input_file [output_file]\n".format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

input_audio_filename = sys.argv[1]

# When no output path is given, derive it from the input path by swapping the
# extension for "-features.json" (the directory part of the input is kept).
output_features_filename = (
    sys.argv[2]
    if argc > 2
    else os.path.splitext(input_audio_filename)[0] + '-features.json'
)
|
---|
24 |
|
---|
# Load the input file.
# Only the filename is passed to MonoLoader, so every other loader parameter
# keeps its Essentia default.
# NOTE(review): per the Essentia docs MonoLoader downmixes to mono and resamples
# to its default sample rate -- confirm that matches the expected input material.
loader = essentia.standard.MonoLoader(filename=input_audio_filename)

# Calling the configured algorithm performs the actual decoding.
audio = loader()
|
---|
29 |
|
---|
30 |
|
---|
31 | #from pylab import plot, show, figure, imshow
|
---|
32 | #%matplotlib inline
|
---|
33 | #import matplotlib.pyplot as plt
|
---|
34 | #plt.rcParams['figure.figsize'] = (15, 6) # set plot sizes to something larger than default
|
---|
35 | #
|
---|
36 | #plot(audio[1*44100:2*44100])
|
---|
37 | #plt.title("This is how the 2nd second of this audio looks like:")
|
---|
38 | #show() # unnecessary if you started "ipython --pylab"
|
---|
39 |
|
---|
40 |
|
---|
# The star import is kept on purpose: the rest of the script uses names such as
# YamlOutput unqualified.
from essentia.standard import *

# Framing parameters, defined once so they cannot drift apart.
FRAME_SIZE = 1024
HOP_SIZE = 512

w = Windowing(type='hann')
spectrum = Spectrum()  # FFT() would return the complex FFT, here we just want the magnitude spectrum
mfcc = MFCC()

logNorm = UnaryOperator(type='log')

# Per-frame results, kept both as plain lists (turned into arrays below for
# plotting / inspection) and in an essentia.Pool (used for serialization).
mfccs = []
melbands = []
melbands_log = []

pool = essentia.Pool()

print("Processing Frames")

# Single pass over the audio. Each frame is windowed, turned into a magnitude
# spectrum and fed to MFCC, which returns the mel band energies and the MFCC
# coefficients. Every value is computed once and stored in both containers;
# the previous version ran the same analysis twice, once per container.
for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE, startFromZero=True):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    mfcc_bands_log = logNorm(mfcc_bands)

    mfccs.append(mfcc_coeffs)
    melbands.append(mfcc_bands)
    melbands_log.append(mfcc_bands_log)

    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)
    pool.add('lowlevel.mfcc_bands_log', mfcc_bands_log)

# transpose to have it in a better shape
# we need to convert the list to an essentia.array first (== numpy.array of floats)
mfccs = essentia.array(mfccs).T
melbands = essentia.array(melbands).T
melbands_log = essentia.array(melbands_log).T


## and plot
#imshow(melbands[:,:], aspect = 'auto', origin='lower', interpolation='none')
#plt.title("Mel band spectral energies in frames")
#show()

#imshow(melbands_log[:,:], aspect = 'auto', origin='lower', interpolation='none')
#plt.title("Log-normalized mel band spectral energies in frames")
#show()

#imshow(mfccs[1:,:], aspect='auto', origin='lower', interpolation='none')
#plt.title("MFCCs in frames")
#show()
|
---|
88 |
|
---|
89 | #imshow(pool['lowlevel.mfcc_bands'].T, aspect = 'auto', origin='lower', interpolation='none')
|
---|
90 | #plt.title("Mel band spectral energies in frames")
|
---|
91 | #show()
|
---|
92 |
|
---|
93 | #imshow(pool['lowlevel.mfcc_bands_log'].T, aspect = 'auto', origin='lower', interpolation='none')
|
---|
94 | #plt.title("Log-normalized mel band spectral energies in frames")
|
---|
95 | #show()
|
---|
96 |
|
---|
97 | #imshow(pool['lowlevel.mfcc'].T[1:,:], aspect='auto', origin='lower', interpolation='none')
|
---|
98 | #plt.title("MFCCs in frames")
|
---|
99 | #show()
|
---|
100 |
|
---|
101 |
|
---|
# Serialize everything collected in the pool to the chosen output file.
# format='json' is requested explicitly; otherwise YamlOutput defaults to YAML (.sig).
output = YamlOutput(
    filename=output_features_filename,
    format='json',
)
output(pool)
|
---|
104 |
|
---|