! Clustering.                          (Clustering.lng)
Partition items into clusters so that items in the
same cluster are close together according to some distance metric;
! We compute a distance, D(i,j), for every pair of ITEMS i, j.
The cost of a cluster is the sum of all the D(i,j)'s of item pairs in the cluster.
We assign items to clusters so as to minimize these within-cluster
distances.;

! Keywords: Classification, Clustering, Data Mining, Grouping, Statistics;
SETS:
 ! The items to be clustered;
  ITEM;
 ! Each attribute has a response TYPE (1=cardinal, 2=ordinal, 3=categorical,
   per the legend in the DATA section) and a SCALE factor computed in CALC
   to normalize attribute differences;
  ATTRIBUTE: TYPE, SCALE;
 ! RESPONSE(i,j) = observed value of attribute j for item i;
  IXA( ITEM, ATTRIBUTE): RESPONSE;
 ! The available clusters;
  CLUSTER;
 ! For each item pair with i < j:
   D(i,j) = distance between items i and j,
   Y(i,j) = 1 if items i and j end up in the same cluster;
  IXI( ITEM, ITEM) | &1 #LT# &2: D, Y;
 ! Z(i,k) = 1 if item i is assigned to cluster k;
  IXC( ITEM, CLUSTER) : Z;
ENDSETS
! Clustering problem. Assign items to clusters so that items
  that are close are assigned to the same clusters;
DATA:
  CLUSTER = C1..C2;
 ! Surveyed some people who use our exercise equipment, and got their
   Height in cm, Satisfaction level (1 to 5), Favorite equipment color (3 choices);
  ATTRIBUTE = HGT SATLVL FCOLOR;
 ! Type of response: Cardinal(1), Ordinal(2), Categorical(3);
  TYPE = 1 2 3;
  ITEM = I1..I11;
 ! ITEM = I1..I20;
  RESPONSE =
    186 4 3 ! 1;
    177 3 1 ! 2;
    180 5 2 ! 3;
    183 4 1 ! 4;
    171 2 2 ! 5;
    158 1 3 ! 6;
    204 5 3 ! 7;
    180 4 1 ! 8;
    189 3 1 ! 9;
    189 1 3 !10;
    168 2 2 !11;
  ! 174 5 3 ! 195 4 1 !13;
  ! 162 4 2 ! 192 1 3 !15;
  ! 207 5 2 ! 204 1 1 !17;
  ! 155 1 2 ! 162 4 3 !19;
  ! 180 5 1;
  ;
ENDDATA

! Variables:
   Z(i,k) = 1 if item i is assigned to cluster k,
   Y(i,j) = 1 if items i and j are in the same cluster;
SUBMODEL DOCLUSTER:
 ! Minimize the total distance among items in the same cluster;
  MIN = REDDIF;
  REDDIF = @SUM( IXI(i,j): D(i,j)*Y(i,j));

 ! Each item i must be in some cluster k;
  @FOR( ITEM(i):
    @SUM( CLUSTER(k): Z(i,k)) = 1;
  );

 ! Measure interaction. Force Y(i,j) = 1 if i and j in same cluster;
  @FOR( IXI(i,j):
   ! Bound Y(i,j) to [0,1] once per pair. Y need not be declared binary:
     the constraints below push it up to 1 when i and j share a cluster,
     and the minimization pushes it down to 0 otherwise;
    @BND( 0, Y(i,j), 1);
    @FOR( CLUSTER(k):
      Y(i,j) >= Z(i,k) + Z(j,k) - 1;
    );
  );

 ! Z(i,k) can be only 0 or 1;
  @FOR( IXC(i,k): @BIN( Z(i,k)); );

 ! Optionally, some cuts;
 ! On average, each cluster has NPC*(NPC-1)/2 interaction terms
   (NPC and NCLUST are computed in the CALC section before @SOLVE);
  @SUM( IXI(i,j): Y(i,j)) >= NCLUST*NPC*(NPC-1)/2;

 ! Optionally, kill some symmetry;
 ! We may arbitrarily order the clusters, so if item i is in cluster k,
   then some lower numbered item must be in cluster k-1;
 ! @FOR( IXC(i,k) | k #GT# 1:
      Z(i,k) <= @SUM( ITEM( j) | j #LT# i: Z(j,k-1))
   );
 ! Item 2 cannot be assigned to a cluster higher than 2;
  @SUM( CLUSTER(k) | k #GT# 2: Z(2,k)) = 0;
 ! If item 2 assigned to cluster 1, then item 3 cannot be assigned to cluster 3;
  @SUM( CLUSTER(k) | k #GT# 2: Z(3,k)) + Z(2,1) <= 1;
ENDSUBMODEL
CALC:
  NITEM  = @SIZE( ITEM);
  NCLUST = @SIZE( CLUSTER);
 ! Average items per cluster;
  NPC = @FLOOR( NITEM/ NCLUST);

 ! Compute the scale factor for each attribute;
 ! Type 1 is a cardinal measure, so scale by the standard deviation,
   so most values are < 1;
  @FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 1:
    MEAN = @SUM( ITEM(i): RESPONSE(i,j))/NITEM;
    SD = (@SUM( ITEM(i): ( RESPONSE(i,j)- MEAN)^2)/ NITEM)^0.5;
   ! Guard against a constant attribute (SD = 0 would divide by zero);
    @IFC( SD #GT# 0:
      SCALE(j) = SD*2^(0.5);
    @ELSE
      SCALE(j) = 1;
    );
  );

 ! Ordinal attributes scaling. Type 2 is an ordinal measure, so scale
   by the range so the largest difference is 1;
  @FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 2:
    ENDLO = RESPONSE(1,j);
    ENDHI = RESPONSE(1,j);
    @FOR( ITEM(i):
      ENDLO = @SMIN( ENDLO, RESPONSE(i,j));
      ENDHI = @SMAX( ENDHI, RESPONSE(i,j));
    );
    SCALE(j) = ENDHI - ENDLO;
   ! Guard against a constant attribute (range = 0 would divide by zero);
    @IFC( SCALE(j) #EQ# 0: SCALE(j) = 1);
  );

 ! Compute distances between all item pairs;
  TOTDIF = 0;
  @FOR( IXI(i,j):
   ! Cardinal and ordinal attributes contribute a scaled absolute difference.
     Type 3 is a categorical measure, so score 1 if different, 0 if the same;
    D(i,j) = @SUM( ATTRIBUTE(k) | TYPE( k) #EQ# 1 #OR# TYPE(k) #EQ# 2:
                     @ABS( RESPONSE(i,k)- RESPONSE(j,k))/SCALE(k))
           + @SUM( ATTRIBUTE(k) | TYPE(k) #EQ# 3 #AND# RESPONSE(i,k) #NE# RESPONSE(j,k): 1);
    TOTDIF = TOTDIF + D(i,j);
  );

  @SOLVE( DOCLUSTER);

 ! Report the fraction of total pairwise difference removed by clustering
   (an R-squared-like statistic) and the cluster membership;
  RSQD = 1 - REDDIF/TOTDIF;
  @WRITE( 'Fraction of differences',@NEWLINE(1),
          ' explained by clustering= ', @FORMAT(RSQD,"6.4F"), @NEWLINE(1));
  @WRITE( 'Cluster Contents', @NEWLINE(1));
  @FOR( CLUSTER(k):
   ! k is fixed by the outer loop, so this iterates over items i only;
    @FOR( IXC(i,k) | Z(i,k) #GT# 0.5:
      @WRITE( ' ',CLUSTER(k),' ', ITEM(i), @NEWLINE(1));
    );
  );
ENDCALC