TITLE:: FluidKMeans
summary:: Cluster data points with K-Means
categories:: FluidManipulation
related:: Classes/FluidDataSet, Classes/FluidLabelSet, Classes/FluidKNN

DESCRIPTION::
Uses the K-Means algorithm to learn clusters from a link::Classes/FluidDataSet::

https://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#clustering-grouping-observations-together

CLASSMETHODS::

METHOD:: new
Construct a new K Means model on the passed server
ARGUMENT:: server
If nil will use Server.default

INSTANCEMETHODS::

PRIVATE::k

METHOD:: fit
Identify code::k:: clusters in a link::Classes/FluidDataSet::
ARGUMENT:: dataset
A link::Classes/FluidDataSet:: of data points
ARGUMENT:: k
The number of clusters to identify in the data set
ARGUMENT:: maxIter
Maximum number of iterations to use partitioning the data
ARGUMENT:: buffer
Seed centroids for clusters WARNING:: Not yet implemented ::
ARGUMENT:: action
A function to run when fitting is complete, taking as its argument an array with the number of data points for each cluster

METHOD:: predict
Given a trained object, return the cluster ID for each data point in a dataset to a label set.
ARGUMENT:: dataset
a link::Classes/FluidDataSet:: containing the data to predict
ARGUMENT:: labelset
a link::Classes/FluidLabelSet:: to receive the predicted clusters
ARGUMENT:: action
A function to run when the server responds

METHOD:: fitPredict
Run link::Classes/FluidKMeans#-fit:: and link::Classes/FluidKMeans#-predict:: in a single pass: i.e. train the model on the incoming link::Classes/FluidDataSet:: and then return the learned clustering to the passed link::Classes/FluidLabelSet::
ARGUMENT:: dataset
a link::Classes/FluidDataSet:: containing the data to fit and predict
ARGUMENT:: labelset
a link::Classes/FluidLabelSet:: to receive the predicted clusters
ARGUMENT:: k
The number of clusters to identify in the data set
ARGUMENT:: maxIter
Maximum number of iterations to use partitioning the data
ARGUMENT:: action
A function to run when the server responds

METHOD:: predictPoint
Given a trained object, return the cluster ID for a data point in a link::Classes/Buffer::
ARGUMENT:: buffer
a link::Classes/Buffer:: containing a data point
ARGUMENT:: action
A function to run when the server responds, taking the ID of the cluster as its argument


METHOD:: cols
Retrieve the dimensionality of the dataset this instance is trained on
ARGUMENT:: action
A function to run when the server responds, taking the dimensionality as its argument

METHOD:: predict
Report cluster assignments for previously unseen data
ARGUMENT:: dataset
A link::Classes/FluidDataSet:: of data points
ARGUMENT:: labelset
A link::Classes/FluidLabelSet:: to contain assignments
ARGUMENT:: action
A function to run when complete, taking an array of the counts for each category as its argument


METHOD:: write
Write learned clusters to disk as a JSON file. Will not overwrite existing files
ARGUMENT:: filename
Absolute path for file
ARGUMENT:: action
A function to run when the file is written

METHOD:: read
Read a learned clustering of a data set from a JSON file
ARGUMENT:: filename
Absolute path of the JSON file
ARGUMENT:: action
Function to run when the file has been read


EXAMPLES::
Server.default.options.outDevice = "Built-in Output"
code::

//A dataset for our points, a labelset for cluster labels
(
// Both server-side names carry the kmeans_help prefix so they are identifiable as belonging to this example
~dataset = FluidDataSet(s,\kmeans_help_rand2d);

~clusters = FluidLabelSet(s,\kmeans_help_clusters);
)

//Make some clumped 2D points and place into a dataset
(
// 4 groups of 64 random values, paired up with clump(2) and flattened by one level: 128 [x, y] points.
// Each value is a random number offset by +1 or -1; the whole set is then scaled by 0.5.
~points = (4.collect{64.collect{(1.sum3rand) + [1,-1].choose}.clump(2)}).flatten(1) * 0.5;
~dataset.clear;
// Two-frame scratch buffer, reused to pass each point to the dataset
~tmpbuf = Buffer.alloc(s,2);
fork{
    s.sync; // wait for the buffer allocation before writing to it
    ~points.do{|x,i|
        // progress counter; the total is read from the data rather than hard-coded
        ("" ++ (i+1) ++ "/" ++ ~points.size).postln;
        ~tmpbuf.setn(0,x);
        ~dataset.addPoint(i,~tmpbuf);
        s.sync // let the server finish with this point before the buffer is overwritten
    }
}
)

//Make a new k means model, fit it to the dataset and return the discovered clusters to a labelset
(
fork {
	// Called with one entry per cluster once fitting has finished
	var report = { |counts|
		"Fitted.\n # Points in each cluster:".postln;
		counts.do { |count, index|
			("Cluster" + index + "->" + count.asInteger + "points").postln;
		}
	};
	~clusters.clear;
	~kmeans = FluidKMeans(s);
	s.sync; // make sure the model exists on the server before using it
	~kmeans.fitPredict(~dataset, ~clusters, 4, action: report);
}
)

//Dims of kmeans should match dataset
// The value is delivered to the action function when the server responds, so post it from there
~kmeans.cols({|dims| ("KMeans dimensions:" + dims).postln});

//Return labels of clustered points
(
// One slot per point; sized from the data rather than a hard-coded count
~assignments = Array.new(~points.size);
fork{
	~points.size.do{ |i|
		~clusters.getLabel(i,{|clusterID|
			(i.asString+clusterID).postln;
			// add may hand back a new Array, so always keep its result
			~assignments = ~assignments.add(clusterID)
		});
		s.sync; // wait for this reply so labels are stored in point order
	}
}
)

//Visualise: we're hoping to see colours neatly mapped to quadrants...
(
// Shift and halve the coordinates, then split them into [xs, ys]
d = ((~points + 1) * 0.5).flatten(1).unlace;
// d = [20.collect{1.0.rand}, 20.collect{1.0.rand}];
w = Window("scatter", Rect(128, 64, 200, 200));
// One colour per cluster ID
~colours = [Color.blue,Color.red,Color.green,Color.magenta];
w.drawFunc = {
	Pen.use {
		var xs = d[0], ys = d[1];
		xs.size.do { |idx|
			// scale each coordinate by the 200-pixel window size
			var dot = Rect(xs[idx] * 200, ys[idx] * 200, 5, 5);
			Pen.fillColor = ~colours[~assignments[idx].asInteger];
			Pen.fillOval(dot);
		}
	}
};
w.refresh;
w.front;
)

::