PERFECT
REPLICATION
The following program is a SAS macro that can be used to
determine the number of clusters in a hierarchical cluster analysis using
perfect replication as the criterion. Below you will find a description of the
macro, the SAS
code, and an example Call Statement. Although this
program will run on a single data set, we recommend you execute this program on
multiple data sets that have been created by sampling with replacement from the
primary data set (see "Sampling with Replacement" program). The true number
of clusters can then be determined by examining the distribution of stopping
points produced by the execution of this macro. For more information on
performing perfect replication on resampled data sets see: Tonidandel, S. & Overall, J. E.
(2004). Determining the number of clusters in a hierarchical
cluster analysis using sampling with replacement.
Psychological
Methods, 9, 238-249.
/************************************************************************************************/
/*
*/
/* Macro name:
%cluster
*/
/*
*/
/* Purpose: Macro for determining the number of
underlying clusters in
a
*/
/* hierarchical cluster analysis using perfect
replication.
*/
/*
*/
/* PLEASE NOTE: This Macro is designed to run on
multiple data sets that have been resampled */
/* from a single primary data set.
As a result, the program requires a set
identifier */
/* variable called '_set_' and a case identifier called '_id_'. These variables are */
/* automatically created using the "sampling with
replacement" program found
on this website.*/
/* If you wish to run this program on
a
primary data set prior to resampling you
must */
/* still create the set and case identifier variables or else
modify the macro. */
/* Additionally, this program expects that the variables you wish to
*/
/* cluster analyze are labeled
'scr1' to 'scr&NUMELEM' (&NUMELEM is a macro variable
*/
/* indicating the number of
elements to be cluster analyzed). */
/*
*/
/* Call
Statement:
*/
/*
*/
/* %cluster (ncl,method=,inp=,ss=,numsplts=,numelem=); */
/*
*/
/* REQUIRED parameters: NCL
*/
/*
*/
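/* NCL - Blank-separated list of hierarchical levels (M; e.g., 2 3 4 ... m) you want to */
/* test the perfect replication criterion on. Do not separate the levels with commas: */
/* a comma ends the positional argument in the macro call. */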
/*
*/
/* OPTIONAL
parameters:
*/
/*
*/
/* METHOD= Method of clustering. Default is Ward's
method
*/
/*
*/
/* INP= input SAS dataset containing the sample data.
If a SAS
*/
/* dataset is not specified, the most recently
created SAS dataset
(_LAST_)
*/
/* is used as input to the
macro.
*/
/*
*/
/* SS= Number of subjects in each resampled data set.
Default is
240.
*/
/*
*/
/* NUMSPLTS= Number of partitions (K) you want to split each data set into. Default is 4. */
/*
*/
/* NUMELEM= Number of elements to be cluster analyzed -- labeled scr1-scr(numelem). */
/* Default is 10. */
/*
*/
/* Printed output:
none
*/
/*
*/
/* Output SAS dataset: work.final
*/
/*
*/
/************************************************************************************************/
%macro cluster(ncl,method=ward,INP=_LAST_,ss=240,numsplts=4,numelem=10);
  /*--------------------------------------------------------------------*/
  /* %cluster: perfect-replication stopping rule for hierarchical       */
  /* cluster analysis.                                                  */
  /*                                                                    */
  /* Parameters                                                         */
  /*   ncl       (positional) blank-separated list of cluster levels    */
  /*             to test, e.g. 2 3 4 5.                                 */
  /*   method=   clustering method passed to PROC CLUSTER (ward).       */
  /*   INP=      input data set; must contain _set_, _id_ and the       */
  /*             variables scr1-scr(numelem). Default _LAST_.           */
  /*   ss=       number of cases in each resampled data set (240).      */
  /*   numsplts= number of partitions each data set is split into (4).  */
  /*   numelem=  number of variables scr1-scr(numelem) clustered (10).  */
  /*                                                                    */
  /* Output: work.final, one row per _set_, where perfrepl holds the    */
  /* highest tested level at which perfect replication occurred         */
  /* (0 when it occurred at none of the tested levels).                 */
  /*                                                                    */
  /* Work data sets overwritten: datasplt tree dataout1 means tree2     */
  /* dataout2 datanew1 means2 datanew2 final.                           */
  /*--------------------------------------------------------------------*/

  /* Keep the loop counters out of the scope of the caller. */
  %local k n;

  /*****************************************/
  /* STEP 1                                */
  /* Splits each data set into the         */
  /* requested number of partitions        */
  /*****************************************/
  data datasplt;
    set &inp;
    /* Consecutive blocks of ss/numsplts case ids form one partition. */
    _splt_=ceil(_id_/(&ss/&numsplts));
  run;

  /*****************************************/
  /* STEP 2                                */
  /* Performs an initial cluster analysis  */
  /* on each partition of every data set   */
  /*****************************************/
  proc cluster data=datasplt out=tree method=&method noprint;
    var scr1-scr&numelem;
    by _set_ _splt_;
  run;

  /* Walk through the requested levels one at a time. */
  %let k=1;
  %let n=%scan(&ncl,&k);
  %do %while(&n^=);

    /* Cut every partition tree at the current number of clusters. */
    proc tree data=work.tree noprint out=dataout1 ncl=&n;
      copy scr1-scr&numelem;
      by _set_ _splt_;
    run;

    /*****************************************/
    /* STEP 3a                               */
    /* Computes the cluster mean profiles    */
    /* for each partition of every data set  */
    /*****************************************/
    data dataout1;
      set work.dataout1;
      /* Free the name CLUSTER: the second PROC TREE creates it again. */
      _clust_=cluster;
      drop cluster;
    run;

    proc sort data=dataout1;
      by _set_ _splt_ _clust_;
    run;

    proc means data=dataout1 noprint;
      var scr1-scr&numelem;
      by _set_ _splt_ _clust_;
      output out=work.means mean=mscr1-mscr&numelem;
    run;

    /*****************************************/
    /* STEP 3b                               */
    /* Performs a higher order cluster       */
    /* analysis on the cluster mean profiles */
    /* from each partition of every data set */
    /*****************************************/
    proc cluster data=work.means out=tree2 method=&method noprint;
      var mscr1-mscr&numelem;
      by _set_;
      copy _clust_ _splt_;
    run;

    proc tree data=tree2 noprint out=dataout2 ncl=&n;
      copy mscr1-mscr&numelem _clust_ _splt_;
      by _set_;
    run;

    proc sort data=dataout2;
      by _set_ cluster _splt_;
    run;

    /*****************************************/
    /* STEP 4                                */
    /* Checks to see if perfect replication  */
    /* has occurred at the current level,    */
    /* such that exactly one mean profile    */
    /* comes from each subsample             */
    /*****************************************/
    data datanew1;
      set work.dataout2;
      /* Under perfect replication the sorted rows cycle through      */
      /* partitions 1..numsplts, so the expected partition number is  */
      /* the row position modulo numsplts (with 0 mapped to numsplts).*/
      remain=mod(_n_,&numsplts);
      if remain=0 then remain=&numsplts;
      miss=abs(_splt_-remain);
    run;

    /* A mean of zero for miss means every row matched its expected partition. */
    proc means data=datanew1 noprint;
      var miss;
      by _set_;
      output out=work.means2 mean=miss;
    run;

    /*****************************************/
    /* STEP 5                                */
    /* Identifies the highest level at which */
    /* perfect replication occurs            */
    /*****************************************/
    /* Seed the result table on the first pass, whatever the first    */
    /* level is, so that lists not starting at 2 work and a stale     */
    /* work.final from an earlier run is never merged.                */
    %if &k=1 %then %do;
      data final;
        set work.means2;
        perfrepl=0;
      run;
    %end;

    /* Keep only the sets that replicated perfectly at this level. */
    data datanew2;
      set work.means2;
      if miss=0;
      perfrepl=&n;
    run;

    /* Later levels overwrite earlier ones for the sets that replicated. */
    data final;
      merge final datanew2;
      by _set_;
    run;

    /*****************************************/
    /* Goes on to the next requested level   */
    /*****************************************/
    %let k=%eval(&k+1);
    %let n=%scan(&ncl,&k);
  %end;
%mend cluster;
***************************************************************************************************************;
/* Example invocation: test levels 2 through 8 on the resampled data in  */
/* work.rsmpdata (150 cases per set, 4 partitions, 10 cluster variables). */
/* The listing destination is closed while the macro runs and reopened    */
/* afterwards.                                                            */
ods listing close;
%cluster(2 3 4 5 6 7 8,method=ward,INP=work.rsmpdata,SS=150,NUMSPLTS=4,numelem=10);
ods listing;

/* Tabulate the distribution of stopping points across the resampled data sets. */
proc freq data=work.final;
  tables perfrepl;
run;