Lindo Systems, Inc.

Lindo Systems

! Clustering.                          (Clustering.lng)
Partition items into clusters so that items in the
same cluster are close together according to some distance metric;
! We compute a distance, D(i,j) for every pair of ITEMS i, j.
The cost of a cluster is the sum of all the D(i,j)'s in the cluster.
We assign items to clusters so as to minimize these within cluster
distances.;

! Keywords: Classification, Clustering, Data Mining, Grouping, Statistics;
SETS:
  ITEM;
  ATTRIBUTE: TYPE, SCALE;
  IXA( ITEM, ATTRIBUTE): RESPONSE;
  CLUSTER;
  IXI( ITEM, ITEM) | &1 #LT# &2: D, Y;
  IXC( ITEM, CLUSTER) : Z;
ENDSETS
! Clustering problem.
Assign items to clusters so that items that are close
are assigned to the same clusters;
DATA:
 CLUSTER = C1..C2;
! Surveyed some people who use our exercise equipment, and got their
! Height in cm, Satisfaction level (1 to 5), Favorite equipment color(3 choices);
 ATTRIBUTE = HGT SATLVL FCOLOR;
! Type of response: Cardinal(1), Ordinal(2), Categorical(3);
 TYPE =       1    2      3; 

 ITEM = I1..I11;
! ITEM = I1..I20;

 RESPONSE =
       186  4   3  ! 1;
       177  3   1  ! 2;
       180  5   2  ! 3;
       183  4   1  ! 4;
       171  2   2  ! 5;
       158  1   3  ! 6;
       204  5   3  ! 7;
       180  4   1  ! 8;
       189  3   1  ! 9;
       189  1   3  !10;
       168  2   2  !11;
  !     174  5   3  !12;
  !     195  4   1  !13;
  !     162  4   2  !14;
  !     192  1   3  !15;
  !     207  5   2  !16;
  !     204  1   1  !17;
  !     155  1   2  !18;
  !     162  4   3  !19;
  !     180  5   1; !20;
  ;
ENDDATA

! Variables:
   X(i,k) = 1 if item i is assigned to cluster k,
   Y(i,j) = 1 if items i and j are in the same cluster
;

SUBMODEL DOCLUSTER:
! Minimize the distance among items in same cluster;
 MIN = REDDIF;
   REDDIF = @SUM( IXI(i,j): D(i,j)*Y(i,j));

! Each item i must be in some cluster k;
  @FOR( ITEM(i):
    @SUM( CLUSTER(k): Z(i,k)) = 1;
      );

! Measure interaction. Force Y(i,j) = 1 if i and j in same cluster;
  @FOR( IXI(i,j):
    @FOR( CLUSTER(k):
      Y(i,j) >= Z(i,k) + Z(j,k) - 1;
      @BND(0, Y(i,j), 1);
        );
      );

 ! Z(i,k) can be only 0 or 1;
 @FOR( IXC(i,k):
   @BIN( Z(i,k));
     );

! Optionally, some cuts;
! On average, each cluster has NPC*(NPC-1)/2 interaction terms;
 @SUM( IXI(i,j): Y(i,j)) >= NCLUST*NPC*(NPC-1)/2;
 
! Optionally, Kill some symmetry;

! We may arbitrarily order the clusters so
  If item i is in cluster k, then some lower numbered item
   must be in cluster k-1;
! @FOR( IXC(i,k) | k #GT# 1:
   Z(i,k) <= @SUM( ITEM( j) | j #LT# i: Z(j,k-1))
      );

! Item 2 cannot be assigned to cluster higher than 2;
  @SUM( CLUSTER(k) | k #GT# 2: Z(2,k)) = 0;
! If item 2 assigned to cluster 1, then item 3
  cannot be assigned to cluster 3;
  @SUM( CLUSTER(k) | k #GT# 2: Z(3,k)) + Z(2,1) <= 1;
ENDSUBMODEL

CALC:
  NITEM = @SIZE( ITEM);
  NCLUST = @SIZE( CLUSTER);

! Average items per cluster;
  NPC = @FLOOR( NITEM/ NCLUST);

! Compute the scale factor for each attribute;
 ! Type 1 is a cardinal measure,
   so scale by the standard deviation, so most values are < 1;
  @FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 1:
    MEAN = @SUM( ITEM(i): RESPONSE(i,j))/NITEM;
    SD = (@SUM( ITEM(i): ( RESPONSE(i,j)- MEAN)^2)/ NITEM)^0.5;
    @IFC( SD #GT# 0:
      SCALE(j) = SD*2^(0.5);
      @ELSE
       SCALE(j) = 1;
        );
      );

 ! Ordinal attributes scaling. Type 2 is an ordinal measure, 
  so scale by the range so largest value is 1;
  @FOR( ATTRIBUTE(j) | TYPE(j) #EQ# 2:
    ENDLO = RESPONSE(1,j);
    ENDHI = RESPONSE(1,j);
    @FOR( ITEM(i):
      ENDLO = @SMIN( ENDLO, RESPONSE(i,j));
      ENDHI = @SMAX( ENDHI, RESPONSE(i,j));
        );
      SCALE(j) = ENDHI - ENDLO;
      @IFC( SCALE(j) #EQ# 0: SCALE(j) = 1);
      );

! Compute distances between all item pairs;
  TOTDIF = 0;
 @FOR( IXI(i,j):
  ! For cardinal and ordinal;
   D(i,j) = @SUM( ATTRIBUTE(k) | TYPE( k) #EQ# 1 #OR# TYPE(k) #EQ# 2: 
                   @ABS( RESPONSE(i,k)- RESPONSE(j,k))/SCALE(k))
          + @SUM( ATTRIBUTE(k) | TYPE(k) #EQ# 3 #AND# RESPONSE(i,k) #NE# RESPONSE(j,k): 1);
   TOTDIF = TOTDIF + D(i,j);
     );

 ! Type 3 is a categorical measure, so score 1 if different, 0 if the same;

 @SOLVE( DOCLUSTER);

 RSQD = 1 - REDDIF/TOTDIF;
 
 @WRITE( 'Fraction of differences',@NEWLINE(1),
  ' explained by clustering= ', @FORMAT(RSQD,"6.4F"), @NEWLINE(1));
 @WRITE( 'Cluster   Contents', @NEWLINE(1));
 @FOR( CLUSTER(k):
    @FOR( IXC(i,k) | Z(i,k) #GT# 0.5:
    @WRITE( '   ',CLUSTER(k),'       ', ITEM(i), @NEWLINE(1));
        );
     );

ENDCALC