! k-Centroid Clustering;
! Given:
a set of points or observations,
in a d-dimensional feature space,
Classify the points into
a specified number (k) of groups or clusters,
so that points in the same cluster are
close to each other in that d-dimensional space.
We minimize the sum, over all points, of each point's distance
from the centroid point of its assigned cluster.
Each centroid is itself required to be one of the given points.
Various distance metrics (L1, L2, and higher-order Lp norms) are allowed.
! Keywords: Classification, Clustering, Data Mining, Grouping;
SETS:
! Observations, i.e. the points to be clustered.
  erngrp(j) = distance to point j, summed over the points assigned to j
  (defined in submodel CLUSTER);
  OBS: erngrp;
! Features, i.e. the coordinate axes of the space.
  SCALEF(k) = multiplier applied to coordinate differences along feature k,
  mean(k) = average of VAL(i,k) over all observations (set in the CALC section);
  FEATURE: SCALEF, mean;
! All ordered pairs of observations.
  y(i,j) = 1 if point i is assigned to point j and j is a centroid,
  y(j,j) = 1 if point j is a centroid;
  OXO( OBS, OBS): y;
! All (observation, feature) pairs.
  VAL(i,k) = coordinate of observation i along feature k.
  NOTE(review): err is not referenced anywhere in the visible model - confirm
  whether it is still needed;
  OXF( OBS, FEATURE): err, VAL;
ENDSETS

DATA:
! Exponent p of the distance metric:
  1 ==> absolute (L1) distance, 2 ==> Euclidean (L2) distance.
  The model takes the p-th root of the summed p-th powers, so NPOW = 2 gives
  the ordinary Euclidean distance, not the squared one;
  NPOW = 2;
! Upper limit on the number of clusters (centroids) that may be used;
  NCLUSTR = 3;
! The dimensions of the space in which points are located;
  FEATURE = Horiz Vert;
! Scaling to be applied to each feature/dimension, listed in FEATURE order;
  SCALEF = 1 1;
! The points. This data set is a variation of the Mouse data;
  OBS =
    L01 F01 F02 R01 R02 L02 F03
    F04 R03 L03 F05 F06 L04 R04
    F07 R05 F08 L05 F09 F10 R06;
! Coordinates of the points: one row per observation, in the order the
  observations are listed in OBS, one column per feature;
  VAL =
    19 47
    30 21
    21 12
    41 45
    38 41
    16 44
    36 11
    25 26
    35 38
    15 36
    33 25
    31  9
    21 39
    34 46
    18 22
    39 36
    39 25
    12 41
    26 34
    31 16
    39 48
  ;
ENDDATA
SUBMODEL CLUSTER:
! Decision variables:
    y(ipt,jctr)  = 1 if point ipt is assigned to point jctr, and jctr is a centroid,
    y(jctr,jctr) = 1 if point jctr is chosen as a centroid.
  Only the diagonal entries y(jctr,jctr) are declared binary below. The other
  entries are tied to them through the constraint y(ipt,jctr) <= y(jctr,jctr);

  @FOR( OBS(jctr):
! The choice of point jctr as a centroid is a 0/1 decision;
    @BIN( y(jctr,jctr));
! Within-cluster distance for jctr: for every point assigned to jctr, add the
  NPOW-norm of the scaled coordinate differences between that point and jctr;
    erngrp(jctr) =
      @SUM( OBS(ipt):
        y(ipt,jctr) *
          @SUM( FEATURE(kdim):
            @ABS( SCALEF(kdim)*(VAL(ipt,kdim) - VAL(jctr,kdim)))^NPOW
          )^(1/NPOW)
      );
  );

! Each observation must be assigned to exactly one cluster or group;
  @FOR( OBS(ipt):
    @SUM( OBS(jctr): y(ipt,jctr)) = 1;
  );

! A point may be assigned to jctr only if jctr is a centroid;
  @FOR( OXO(ipt,jctr):
    y(ipt,jctr) <= y(jctr,jctr);
  );

! At most NCLUSTR points may be chosen as centroids;
  @SUM( OBS(jctr): y(jctr,jctr)) <= NCLUSTR;

! Objective: minimize the within-cluster distances from the cluster centroid
  point, summed over all clusters;
  MIN = @SUM( OBS(jctr): erngrp(jctr));
ENDSUBMODEL
CALC:
! Turn off default output, so that only the report written below appears;
  @SET('TERSEO',2);

! Solve the clustering submodel;
  @SOLVE(CLUSTER);

  @WRITE(' Distance metric is L',NPOW,@NEWLINE(1));

! Total within group distance, summed over the points chosen as centroids;
  totgd = @SUM( OBS(jctr) | y(jctr,jctr) #GT# .5: erngrp(jctr));

! Report each chosen centroid, followed by the points assigned to it;
  @FOR( OBS(jctr) | y(jctr,jctr) #GT# .5:
    @WRITE( ' Cluster centered on: ',OBS(jctr),
            ', Within cluster distances= ',erngrp(jctr),@NEWLINE(1),
            ' Members are:', @NEWLINE(1));
    @FOR( OBS(ipt) | y(ipt,jctr) #GT# .5:
      @WRITE(' ', OBS(ipt),@NEWLINE(1));
    );
  );

! For reference, compute the distance with no grouping: every point is
  measured against the per-feature mean of all observations, using the
  same scaled NPOW-norm as the submodel;
  @FOR( FEATURE(kdim):
    mean(kdim) = @SUM( OBS(ipt): VAL(ipt,kdim))/@SIZE(OBS);
  );
  nogrpd =
    @SUM( OBS(ipt):
      @SUM( FEATURE(kdim):
        @ABS( SCALEF(kdim)*(VAL(ipt,kdim) - mean(kdim)))^NPOW
      )^(1/NPOW)
    );

! Summary lines: clustered total first, then the ungrouped reference value;
  @WRITE(' Total distance from centers, within clusters= ',totgd,@NEWLINE(1),
         ' Total distance from center, with no grouping= ',nogrpd,@NEWLINE(1));
ENDCALC
|