From e398410f2d6673badfa365879298c0858ab5f152 Mon Sep 17 00:00:00 2001
From: Pierre Alexandre Tremblay
Date: Wed, 19 Sep 2018 20:49:39 +0100
Subject: [PATCH] BufNMF with all examples included, and FluidHPSS with the full text and examples.

---
 .../HelpSource/Classes/FluidBufNMF.schelp | 133 +++++++++++++++++-
 .../HelpSource/Classes/FluidHPSS.schelp   |  83 ++++++++---
 2 files changed, 189 insertions(+), 27 deletions(-)

diff --git a/release-packaging/HelpSource/Classes/FluidBufNMF.schelp b/release-packaging/HelpSource/Classes/FluidBufNMF.schelp
index f4e7b7e..9716c8c 100644
--- a/release-packaging/HelpSource/Classes/FluidBufNMF.schelp
+++ b/release-packaging/HelpSource/Classes/FluidBufNMF.schelp
@@ -8,9 +8,9 @@ DESCRIPTION::
 The FluidBufNMF object decomposes the spectrum of a sound into a number of components using Non-Negative Matrix Factorisation (NMF) footnote:: Lee, Daniel D., and H. Sebastian Seung. 1999. ‘Learning the Parts of Objects by Non-Negative Matrix Factorization’. Nature 401 (6755): 788–91. https://doi.org/10.1038/44565. ::. NMF has been a popular technique in signal processing research for things like source separation and transcription, although its creative potential is so far relatively unexplored.

-The algorithm takes a buffer in and divides it into a number of components, determined by the rank argument. It works iteratively, by trying to find a combination of spectral templates ('dictionaries') and envelopes ('activations') that yield the original magnitude spectrogram when added together. By and large, there is no unique answer to this question (i.e. there are different ways of accounting for an evolving spectrum in terms of some set of templates and envelopes). In its basic form, NMF is a form of unsupervised learning: it starts with some random data and then converges towards something that minimizes the distance between its generated data and the original:it tends to converge very quickly at first and then level out. Fewer iterations mean less processing, but also less predictable results.
+The algorithm takes an input buffer and divides it into a number of components, determined by the rank argument. It works iteratively, trying to find a combination of spectral templates ('dictionaries') and envelopes ('activations') that yield the original magnitude spectrogram when added together. By and large, there is no unique answer to this question (i.e. there are different ways of accounting for an evolving spectrum in terms of some set of templates and envelopes). In its basic form, NMF is a form of unsupervised learning: it starts with some random data and then converges towards something that minimizes the distance between its generated data and the original. It tends to converge very quickly at first and then level out. Fewer iterations mean less processing, but also less predictable results.

-The obect can provide back either or all of the following: LIST::
+The object can provide back any or all of the following: LIST::
 ## a spectral contour of each component in the form of a magnitude spectrogram (called a dictionary in NMF lingo);
 ## an amplitude envelope of each component in the form of gains for each consecutive frame of the underlying spectrogram (called an activation in NMF lingo);
 ## a reconstruction of each rank in the time domain.
 ::
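+
+To make the dictionary/activation model concrete, here is a toy sketch in plain sclang with made-up numbers (nothing below calls the FluidBufNMF interface): each frame of the magnitude spectrogram is modelled as a weighted sum of the dictionaries, with the weights given by that frame's activations.
+code::
+(
+// two hypothetical 3-bin spectral templates (dictionaries)
+var dicts = [ [1.0, 0.5, 0.0], [0.0, 0.3, 1.0] ];
+// the gain of each template in one analysis frame (activations)
+var acts = [ 0.8, 0.2 ];
+// the model's estimate of that frame's magnitudes: the sum of the scaled templates
+var frame = dicts.collect({ |dict, i| dict * acts[i] }).sum;
+frame.postln; // -> [ 0.8, 0.46, 0.2 ]
+)
+::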
@@ -22,7 +22,6 @@ Some additional options and flexibility can be found through combinations of the
 If supplying pre-formed data, it's up to the user to make sure that the supplied buffers are the right size: LIST::
 ## dictionaries must be STRONG::(fft size / 2) + 1:: frames and STRONG::(rank * input channels):: channels
 ## activations must be STRONG::(input frames / hopSize) + 1:: frames and STRONG::(rank * input channels):: channels
-
 ::

 In this implementation, the components are reconstructed by masking the original spectrum, such that they will sum to yield the original sound.
@@ -116,7 +115,131 @@ RETURNS::
 EXAMPLES::

 code::
-b = Buffer.read(s,"../../../AudioFiles/01-mix.wav".resolveRelative);
-b.play
+// set some buffers and parameters
+(
+b = Buffer.read(s,"../../AudioFiles/Tremblay-AaS-SynthTwoVoices-M.wav".resolveRelative);
+c = Buffer.new(s);
+x = Buffer.new(s);
+y = Buffer.new(s);
+~fft_size = 1024;
+~frame_size = 512;
+~hop_size = 256;
+~which_rank = 0;
+)
+
+// matrix factorisation, requesting everything
+(
+Routine{
+	t = Main.elapsedTime;
+	FluidBufNMF.process(s,b.bufnum, 0,-1,0,-1,c.bufnum,x.bufnum,0,y.bufnum,0,5,100,0,~frame_size,~hop_size,~fft_size);
+	s.sync;
+	(Main.elapsedTime - t).postln;
+	s.sync;
+	c.query;
+	s.sync;
+	x.query;
+	s.sync;
+	y.query;
+}.play
+)
+
+// look at the resynthesised ranks, the dictionaries and the activations
+c.plot; x.plot; y.plot;
+
+// null test of the sum of sources
+{PlayBuf.ar(5,c.bufnum,doneAction:2).sum - PlayBuf.ar(1,b.bufnum,doneAction:2)}.play
+
+// play the ranks spread in the stereo field
+{Splay.ar(PlayBuf.ar(5,c.bufnum,doneAction:2))}.play
+
+// play a single source
+{PlayBuf.ar(5,c.bufnum,doneAction:2)[~which_rank].dup}.play
+
+// play noise using one of the dictionaries as a filter
+(
+{
+	var chain;
+	chain = FFT(LocalBuf(~fft_size), WhiteNoise.ar());
+
+	chain = chain.pvcollect(~fft_size, {|mag, phase, index|
+		[mag * BufRd.kr(5,x.bufnum,DC.kr(index),0,1)[~which_rank]];
+	});
+
+	IFFT(chain);
+}.play
+)
+
+// play noise using one of the activations as an envelope
+{WhiteNoise.ar(BufRd.kr(5,y.bufnum,Phasor.ar(1,1/~hop_size,0,(b.numFrames / ~hop_size + 1)),0,1)[~which_rank])*0.5}.play
+
+// play noise through both the matching activation and filter
+(
+{
+	var chain;
+	chain = FFT(LocalBuf(~fft_size), WhiteNoise.ar(BufRd.kr(5,y.bufnum,Phasor.ar(1,1/~hop_size,0,(b.numFrames / ~hop_size + 1)),0,1)[~which_rank]*12),0.5,1);
+
+	chain = chain.pvcollect(~fft_size, {|mag, phase, index|
+		[mag * BufRd.kr(5,x.bufnum,DC.kr(index),0,1)[~which_rank]];
+	});
+
+	[0,IFFT(chain)];
+}.play
+)
+
+::
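+
+As a quick sanity check, the buffer sizes described in the description above can be verified against what the example just produced. This is a sketch, not part of the object's interface, and the expected activation frame count may differ by one depending on rounding.
+code::
+(
+// query the server for the actual sizes, then compare with the formulas above
+x.updateInfo({ |buf| [\dictionaryFrames, buf.numFrames, \expected, (~fft_size / 2) + 1].postln; });
+y.updateInfo({ |buf| [\activationFrames, buf.numFrames, \expected, (b.numFrames / ~hop_size) + 1].postln; });
+)
+::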
+
+STRONG::Fixed Dictionaries::
+
+The process can be trained, and the learnt dictionaries or activations can be used as templates.
+
+CODE::
+// set some buffers
+(
+b = Buffer.read(s,"../../AudioFiles/Tremblay-AaS-AcousticStrums-M.wav".resolveRelative);
+c = Buffer.new(s);
+x = Buffer.new(s);
+e = Buffer.alloc(s,1,1);
+y = Buffer.alloc(s,1,1);
+)
+
+// train on the first 2 seconds only
+(
+Routine {
+	FluidBufNMF.process(s,b.bufnum,0,88200,0,1, c.bufnum, x.bufnum, rank:10);
+	s.sync;
+	c.query;
+}.play;
+)
+
+// find the rank that has the picking sound by changing which channel to listen to
+(
+~element = 0;
+{PlayBuf.ar(10,c.bufnum)[~element]}.play
+)
+
+// sum all the other dictionaries into the first channel of a new buffer, then copy the picking dictionary on its own into the second channel
+(
+Routine{
+	(0..9).reject({|chan| chan == ~element}).do({|chan| FluidBufCompose.process(s, srcBufNumA: x.bufnum, startChanA: chan, nChansA: 1, srcBufNumB: e.bufnum, dstBufNum: e.bufnum)});
+	s.sync;
+	e.query;
+	s.sync;
+	FluidBufCompose.process(s, srcBufNumA: x.bufnum, startChanA: ~element, nChansA: 1, srcBufNumB: e.bufnum, dstStartChanB: 1, dstBufNum: e.bufnum);
+	s.sync;
+	e.query;
+}.play;
+)
+
+// process the whole file, splitting it with the 2 trained dictionaries
+(
+Routine{
+	FluidBufNMF.process(s, b.bufnum, dstBufNum: c.bufnum, dictBufNum: e.bufnum, dictFlag: 2, actBufNum:y.bufnum, rank:2);
+	s.sync;
+	c.query;
+}.play;
+)
+
+// play the result: the rest on the left, the pick on the right
+c.play
+
+// it even null-sums against the original
+{PlayBuf.ar(2,c.bufnum,doneAction:2).sum - PlayBuf.ar(1,b.bufnum,doneAction:2)}.play
+::
\ No newline at end of file
diff --git a/release-packaging/HelpSource/Classes/FluidHPSS.schelp b/release-packaging/HelpSource/Classes/FluidHPSS.schelp
index 1d7f0f1..24b70e1 100644
--- a/release-packaging/HelpSource/Classes/FluidHPSS.schelp
+++ b/release-packaging/HelpSource/Classes/FluidHPSS.schelp
@@ -1,75 +1,114 @@
 TITLE:: FluidHPSS
-SUMMARY:: (put short description here)
+SUMMARY:: Harmonic-Percussive Source Separation Using Median Filtering
 CATEGORIES:: Libraries>FluidDecomposition
 RELATED:: Guides/FluCoMa, Guides/FluidDecomposition

 DESCRIPTION::
+A FluidHPSS object performs Harmonic-Percussive Source Separation (HPSS) on an audio input. The class implements HPSS as described in its original form footnote::
+Fitzgerald, Derry. 2010. ‘Harmonic/Percussive Separation Using Median Filtering’. In Proc. DAFx-10. https://arrow.dit.ie/argcon/67.
+:: as well as a variation on the extension proposed by Driedger et al. footnote::
+Driedger, Jonathan, Meinard Müller, and Sascha Disch. 2014. ‘Extending Harmonic-Percussive Separation of Audio Signals’. In Proc. ISMIR. http://www.terasoft.com.tw/conf/ismir2014/proceedings/T110_127_Paper.pdf.
+::
+
+The algorithm takes an audio input and divides it into two or three outputs, depending on the mode:
+LIST::
+## a harmonic component;
+## a percussive component;
+## a residual of the previous two, if the flag is set to inter-dependent thresholds (see modeFlag below).
+::
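+
+As a minimal sketch of routing those outputs (assuming a running server; the keyword follows the modeFlag argument documented below), the returned array can be unpacked directly:
+CODE::
+(
+{
+	var harm, perc, resid;
+	// unpack the three streams; in mode 2 the third one carries the residual
+	#harm, perc, resid = FluidHPSS.ar(PinkNoise.ar(0.1), modeFlag: 2);
+	[harm, perc] // harmonic to the left, percussive to the right
+}.play
+)
+::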
+
+It is part of the Fluid Decomposition Toolkit of the FluCoMa project. footnote::
+This was made possible thanks to the FluCoMa project ( http://www.flucoma.org/ ) funded by the European Research Council ( https://erc.europa.eu/ ) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 725899).
+::
+
+More information on median filtering, and on HPSS for musicianly usage, is available in the LINK::Guides/FluCoMa:: overview file.

 CLASSMETHODS::

 METHOD:: ar
 	The audio rate version of the object.

 ARGUMENT:: in
-(describe argument here)
+	The input to be processed.

 ARGUMENT:: harmFiltSize
-(describe argument here)
+	The size, in spectral frames, of the median filter for the harmonic component. Must be an odd number, >= 3.

 ARGUMENT:: percFiltSize
-(describe argument here)
+	The size, in spectral bins, of the median filter for the percussive component. Must be an odd number, >= 3.

 ARGUMENT:: modeFlag
-	The way the masking is happening on the spectrogram.
+	The way the masking is applied to the original spectrogram (0, 1 or 2).
 	table::
-	## 0 || Original paper - the loudest winds.
-	## 1 || Relative mode - the thresholds set next on the harmonic counterpart will decide of a binary masking, and the percussive mask is its complement.
-	## 2 || Inter-dependant mode - the thresholds are independant on the harmonic and percussive component, but are then normalised to make a null sum and their difference is sent to the residual buffer.
+	## 0 || Fitzgerald's original method of 'Wiener-inspired' filtering. Complementary soft masks are made for the harmonic and percussive parts by allocating some fraction of each time-frequency point to each. This provides the fewest artefacts, but the weakest separation. The two resulting outputs will sum to exactly the original material.
+	## 1 || Relative mode - better separation, with more artefacts. The harmonic mask is constructed using a binary decision, based on whether a threshold is exceeded at a given time-frequency point (set using htf1, hta1, htf2 and hta2 below). The percussive mask is then formed as the inverse of the harmonic one, meaning that, as above, the two components sum to the original sound.
+	## 2 || Inter-dependent mode - the thresholds can be varied independently for the harmonic and percussive components, but are coupled in effect. Binary masks are made for each component, but these aren't guaranteed to cover the whole sound: the 'leftovers' are placed into a third, residual output. This is the most tuneable method, but the hardest to control.
 	::

 ARGUMENT:: htf1
-	In modes 1 and 2, the frequency of the low part of the threshold for the harmonic filter.
+	In modes 1 and 2, the frequency of the low part of the threshold for the harmonic filter (0-1).

 ARGUMENT:: hta1
-	In modes 1 and 2, the threshold of the low part for the harmonic filter. That threshold applies to all frequencies up to htf1.
+	In modes 1 and 2, the threshold of the low part for the harmonic filter. That threshold applies to all frequencies up to htf1: how much more powerful (in dB) the harmonic median filter needs to be than the percussive median filter for a bin to be counted as harmonic.

 ARGUMENT:: htf2
-	In modes 1 and 2, the frequency of the hight part of the threshold for the harmonic filter.
+	In modes 1 and 2, the frequency of the high part of the threshold for the harmonic filter (0-1).

 ARGUMENT:: hta2
-	In modes 1 and 2, the threshold of the high part for the harmonic filter. That threshold applies to all frequencies above htf2. The threshold between htf1 and htf2 is interpolated between hta1 and hta2.
+	In modes 1 and 2, the threshold of the high part for the harmonic filter. That threshold applies to all frequencies above htf2, and the threshold between htf1 and htf2 is interpolated between hta1 and hta2. As with hta1, this is how much more powerful (in dB) the harmonic median filter needs to be than the percussive median filter for a bin to be counted as harmonic.

 ARGUMENT:: ptf1
-	In mode 2, the frequency of the low part of the threshold for the percussive filter.
+	In mode 2, the frequency of the low part of the threshold for the percussive filter (0-1).

 ARGUMENT:: pta1
-	In mode 2, the threshold of the low part for the percussive filter. That threshold applies to all frequencies up to ptf1.
+	In mode 2, the threshold of the low part for the percussive filter. That threshold applies to all frequencies up to ptf1: how much more powerful (in dB) the percussive median filter needs to be than the harmonic median filter for a bin to be counted as percussive.

 ARGUMENT:: ptf2
-	In mode 2, the frequency of the hight part of the threshold for the percussive filter.
+	In mode 2, the frequency of the high part of the threshold for the percussive filter (0-1).

 ARGUMENT:: pta2
-	In mode 2, the threshold of the high part for the percussive filter. That threshold applies to all frequencies above ptf2. The threshold between ptf1 and ptf2 is interpolated between pta1 and pta2.
+	In mode 2, the threshold of the high part for the percussive filter. That threshold applies to all frequencies above ptf2, and the threshold between ptf1 and ptf2 is interpolated between pta1 and pta2. As with pta1, this is how much more powerful (in dB) the percussive median filter needs to be than the harmonic median filter for a bin to be counted as percussive.

 ARGUMENT:: winSize
-	The window size. As HPSS relies on spectral frames, we need to decide what precision we give it spectrally and temporally, in line with Gabor Uncertainty principles. http://www.subsurfwiki.org/wiki/Gabor_uncertainty
+	The window size in samples. As HPSS relies on spectral frames, we need to decide what precision we give it spectrally and temporally, in line with the Gabor uncertainty principle. http://www.subsurfwiki.org/wiki/Gabor_uncertainty

 ARGUMENT:: hopSize
-	The window hope size. As HPSS relies on spectral frames, we need to move the window forward. It can be any size but low overlap will create audible artefacts.
+	The window hop size in samples. As HPSS relies on spectral frames, we need to move the window forward. It can be any size, but low overlap may create audible artefacts.

 ARGUMENT:: fftSize
-	The inner FFT/IFFT size. It should be at least 4 samples long, at least the size of the window, and a power of 2. Making it larger allows an oversampling of the spectral precision.
+	The inner FFT/IFFT size. It should be at least 4 samples long, at least the size of the window, and a power of 2. Making it larger than the window size provides interpolation in frequency.

 RETURNS::
+	An array of three audio streams: [0] is the harmonic part extracted, [1] is the percussive part extracted, [2] is the rest. The latency between the input and the output is ((harmFiltSize + (winSize / hopSize) - 1) * hopSize) samples.
+
+DISCUSSION::
+	HPSS works by using median filters on the spectral magnitudes of a sound. It hinges on a simple modelling assumption: tonal components will tend to yield concentrations of energy across time, spread out in frequency, while percussive components will manifest as concentrations of energy across frequency, spread out in time. By using median filters across time and across frequency respectively, we get initial estimates of the tonal-ness / transient-ness of each point in time and frequency. These are then combined into 'masks' that are applied to the original spectral data in order to produce a separation.
+
+	The modeFlag parameter provides different approaches to combining the estimates and producing masks. Some settings (especially in modes 1 and 2) will provide better separation, but with more artefacts. These can, in principle, be ameliorated by applying smoothing filters to the masks before transforming back to the time domain (not yet implemented).
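+
+	A deliberately simplified sketch of that assumption, in plain sclang with made-up numbers (the real algorithm applies sliding median windows around every time-frequency point, rather than one median per row or column):
+CODE::
+(
+// a toy magnitude 'spectrogram': rows are frequency bins, columns are time frames
+var mags = [
+	[0.9, 0.9, 0.9, 0.9, 0.9],	// a steady partial
+	[0.0, 0.0, 1.0, 0.0, 0.0],	// part of a click
+	[0.1, 0.1, 1.0, 0.1, 0.1]	// quiet noise, plus the click
+];
+// harmonic estimate: the median across time keeps the steady partial
+var harm = mags.collect({ |row| row.median });
+// percussive estimate: the median across frequency highlights the click's frame
+var perc = mags.flop.collect({ |col| col.median });
+[\harmonicPerBin, harm, \percussivePerFrame, perc].postln;
+)
+::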
+
 EXAMPLES::

 CODE::
-safdsad
+// load a soundfile to play
+b = Buffer.read(s,"../../AudioFiles/Tremblay-AaS-SynthTwoVoices-M.wav".resolveRelative);
+
+// run with basic parameters (left is harmonic, right is percussive)
+{FluidHPSS.ar(PlayBuf.ar(1,b.bufnum,loop:1))}.play
+
+// run in mode 1
+{FluidHPSS.ar(PlayBuf.ar(1,b.bufnum,loop:1),17,31,1,0.05,40,0.1,-40)}.play
+
+// run in mode 2, listening to
+// the harmonic stream
+{FluidHPSS.ar(PlayBuf.ar(1,b.bufnum,loop:1),17,31,2,0.05,40,0.1,-40, 0.1, -10, 0.2, 10)[0].dup}.play
+// the percussive stream
+{FluidHPSS.ar(PlayBuf.ar(1,b.bufnum,loop:1),17,31,2,0.05,40,0.1,-40, 0.1, -10, 0.2, 10)[1].dup}.play
+// the residual stream
+{FluidHPSS.ar(PlayBuf.ar(1,b.bufnum,loop:1),17,31,2,0.05,40,0.1,-40, 0.1, -10, 0.2, 10)[2].dup}.play
+
+// null test (the process adds a latency of ((harmFiltSize + (winSize / hopSize) - 1) * hopSize) samples, here (17 + 2 - 1) * 512)
+{var sig = PlayBuf.ar(1,b.bufnum,loop:1); [FluidHPSS.ar(sig,17,31, winSize:1024,hopSize:512,fftSize:2048).sum - DelayN.ar(sig, 1, ((17 + (1024/512) - 1) * 512)/s.sampleRate)]}.play
+::
\ No newline at end of file