! Clustering. (Clustering.lng)
Partition items into clusters so that items in the
same cluster are close together according to some distance metric;
! We compute a distance, D(i,j) for every pair of ITEMS i, j.
The cost of a cluster is the sum of all the D(i,j)'s in the cluster.
We assign items to clusters so as to minimize these within cluster
distances.;
! Keywords: Classification, Clustering, Data Mining, Grouping, Statistics;
SETS:
! The items to be partitioned into clusters;
ITEM;
! Survey attributes. TYPE = response type code, SCALE = normalization divisor;
ATTRIBUTE: TYPE, SCALE;
! RESPONSE(i,j) = item i's answer for attribute j;
IXA( ITEM, ATTRIBUTE): RESPONSE;
! The available clusters;
CLUSTER;
! Ordered item pairs i < j. D(i,j) = distance, Y(i,j) = 1 if i and j end up in the same cluster;
IXI( ITEM, ITEM) | &1 #LT# &2: D, Y;
! Z(i,k) = 1 if item i is assigned to cluster k;
IXC( ITEM, CLUSTER) : Z;
ENDSETS ! Clustering problem.
Assign items to clusters so that items that are close
are assigned to the same clusters;
DATA:
! Number of clusters to form;
CLUSTER = C1..C2;
! Surveyed some people who use our exercise equipment, and got their
! Height in cm, Satisfaction level (1 to 5), Favorite equipment color(3 choices);
ATTRIBUTE = HGT SATLVL FCOLOR;
! Type of response: Cardinal(1), Ordinal(2), Categorical(3);
TYPE = 1 2 3;
! 11-item instance. For the 20-item instance, use the commented ITEM line
below and un-comment the extra RESPONSE rows at the bottom;
ITEM = I1..I11;
! ITEM = I1..I20;
! One row per item: HGT SATLVL FCOLOR. The trailing comment is the item number;
RESPONSE =
186 4 3 ! 1;
177 3 1 ! 2;
180 5 2 ! 3;
183 4 1 ! 4;
171 2 2 ! 5;
158 1 3 ! 6;
204 5 3 ! 7;
180 4 1 ! 8;
189 3 1 ! 9;
189 1 3 !10;
168 2 2 !11;
! 174 5 3 ! 195 4 1 !13;
! 162 4 2 ! 192 1 3 !15;
! 207 5 2 ! 204 1 1 !17;
! 155 1 2 ! 162 4 3 !19;
! 180 5 1; ;
ENDDATA
! Variables:
Z(i,k) = 1 if item i is assigned to cluster k,
Y(i,j) = 1 if items i and j are in the same cluster
;
SUBMODEL DOCLUSTER:
! Objective: minimize the total distance among items placed in the same cluster;
MIN = REDDIF;
REDDIF = @SUM( IXI(i,j): D(i,j)*Y(i,j));
! Each item i must be in exactly one cluster k;
@FOR( ITEM(i):
@SUM( CLUSTER(k): Z(i,k)) = 1;
);
! Measure interaction. Force Y(i,j) = 1 if i and j in same cluster.
Minimization drives Y(i,j) to 0 otherwise, so simple bounds suffice
and Y need not be declared binary;
@FOR( IXI(i,j):
! Bound set once per pair, not redundantly once per pair per cluster;
@BND(0, Y(i,j), 1);
@FOR( CLUSTER(k):
Y(i,j) >= Z(i,k) + Z(j,k) - 1;
);
);
! Z(i,k) can be only 0 or 1;
@FOR( IXC(i,k):
@BIN( Z(i,k));
);
! Optionally, some cuts;
! On average, each cluster has NPC*(NPC-1)/2 interaction terms,
so at least this many Y variables must be turned on;
@SUM( IXI(i,j): Y(i,j)) >= NCLUST*NPC*(NPC-1)/2;
! Optionally, Kill some symmetry;
! We may arbitrarily order the clusters so
If item i is in cluster k, then some lower numbered item
must be in cluster k-1;
! @FOR( IXC(i,k) | k #GT# 1:
Z(i,k) <= @SUM( ITEM( j) | j #LT# i: Z(j,k-1))
);
! Hand-coded symmetry cuts. Item 2 cannot be assigned to cluster higher than 2;
@SUM( CLUSTER(k) | k #GT# 2: Z(2,k)) = 0;
! If item 2 assigned to cluster 1, then item 3
cannot be assigned to cluster 3;
@SUM( CLUSTER(k) | k #GT# 2: Z(3,k)) + Z(2,1) <= 1;
ENDSUBMODEL
CALC:
NITEM = @SIZE( ITEM);
NCLUST = @SIZE( CLUSTER);
! Average items per cluster (rounded down), used by the cut in DOCLUSTER;
NPC = @FLOOR( NITEM/ NCLUST);
! Compute the scale factor for each attribute;
! Type 1 is a cardinal measure,
so scale by the standard deviation, so most values are < 1;
@FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 1:
MEAN = @SUM( ITEM(i): RESPONSE(i,j))/NITEM;
SD = (@SUM( ITEM(i): ( RESPONSE(i,j)- MEAN)^2)/ NITEM)^0.5;
! Guard against a constant attribute (SD = 0) to avoid divide by zero later;
@IFC( SD #GT# 0:
SCALE(j) = SD*2^(0.5);
@ELSE
SCALE(j) = 1;
);
);
! Ordinal attributes scaling. Type 2 is an ordinal measure,
so scale by the range so largest value is 1;
@FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 2:
ENDLO = RESPONSE(1,j);
ENDHI = RESPONSE(1,j);
@FOR( ITEM(i):
ENDLO = @SMIN( ENDLO, RESPONSE(i,j));
ENDHI = @SMAX( ENDHI, RESPONSE(i,j));
);
SCALE(j) = ENDHI - ENDLO;
! Guard against a constant attribute (range = 0);
@IFC( SCALE(j) #EQ# 0: SCALE(j) = 1);
);
! Compute distances between all item pairs.
Type 3 is a categorical measure, so score 1 if different, 0 if the same;
TOTDIF = 0;
@FOR( IXI(i,j):
! Scaled differences for cardinal and ordinal, plus mismatch count for categorical;
D(i,j) = @SUM( ATTRIBUTE(k) | TYPE( k) #EQ# 1 #OR# TYPE(k) #EQ# 2:
@ABS( RESPONSE(i,k)- RESPONSE(j,k))/SCALE(k))
+ @SUM( ATTRIBUTE(k) | TYPE(k) #EQ# 3 #AND# RESPONSE(i,k) #NE# RESPONSE(j,k): 1);
TOTDIF = TOTDIF + D(i,j);
);
@SOLVE( DOCLUSTER);
! Fraction of total pairwise difference removed by clustering (an R-squared-like measure);
RSQD = 1 - REDDIF/TOTDIF;
@WRITE( 'Fraction of differences',@NEWLINE(1),
' explained by clustering= ', @FORMAT(RSQD,"6.4F"), @NEWLINE(1));
@WRITE( 'Cluster Contents', @NEWLINE(1));
@FOR( CLUSTER(k):
@FOR( IXC(i,k) | Z(i,k) #GT# 0.5:
@WRITE( ' ',CLUSTER(k),' ', ITEM(i), @NEWLINE(1));
);
);
ENDCALC