setwd("C:/Users/rvosy/Desktop/Sequence analysis seminar/") library(TraMineR) #main package library(cluster) #needed for building a typology library(RColorBrewer) #this one is needed for building nice graphs library(foreign) library(WeightedCluster) #this one may also be used for cluster analysis #lets read the data. The data should be saved in tab-delimited .dat format column names should be kept it #the first command reads the data. maindata <- read.delim("SAwebinar.dat") #this command lets you view the data View(maindata) #this prints the names of variables in the console names(maindata) #we will start by analyzing residential status #we need to introduce some labels to code quantitative values #we need two types of labes: short ones and long ones. RS.labels <- c("Lives with parents", "Lives in temporary accomodation", "Lives in self-owned accmodation") RS.shortlab <- c("LWP", "LTA", "LSOA") #to build some of the charts we will need labels representing different columns #values of x in the histogram xvalues25 <- c("finished school", "0.5 y.", "1 y.", "1.5 y.","2 y.", "2.5", "3 y.", "3.5 y.", "4 y.", "4.5 y.","5 y.", "5.5 y.", "6 y.", "6.5 y.") #we also need to define values that we are interested in the sequences. This is called alphabet #since there are three values in residential status variables, we need to specify what they are #creating alphabet alphabet.RS.seq <- c("1", "2", "3") #and now... one the most important commands in sequence analysis. #we need to say which string or variables represent sequences that we will analyze #defining sequences: #seqdef is the command we use to define sequence. First argument is file name, second-column numbers that represent sequences #third and fourth argument are label objects that we created, fifth is the alhpabet #This command will create a new object, which will later be used in subsequent analysis RS.seq <- seqdef(maindata, var= 2:15, label=RS.labels, states=RS.shortlab, alphabet=alphabet.RS.seq) View(RS.seq) names(RS.seq) #lets attribute some colors to the three states to build our cool charts cpal(RS.seq) <- c("#fdb462", "#ffffb3", "#b3de69") #if you want some different colors, go to https://colorbrewer2.org/#type=qualitative&scheme=Pastel1&n=6 to pick up the codes for the ones you like #vizualizing the full sample #we will now focus on some of the commands that help vizualizing the sequence data for the full sample ##overall sequence distribution plot seqdplot(RS.seq, withlegend = T, border = T, xtlab=xvalues25) #all sequences colored and vizualized in one plot seqIplot(RS.seq, withlegend = T, xtlab=xvalues25, border = NA) seqfplot(RS.seq, withlegend = T, xtlab=xvalues25) #mean time spent in each state: on average how much time participants spent while being in specific state seqmtplot(RS.seq, title = "Mean time", withlegend = F) #entropy for each column and its change across time seqHtplot(RS.seq) seqtab(RS.seq) #displaying transition probabilities seqtrate(RS.seq) #printing number of transitions for each individual sequence seqtransn(RS.seq) #frequency of each state for each sequence seqistatd(RS.seq) #within column entropy seqstatd(RS.seq) #printing entropy index for individual seqs seqient(RS.seq) #printing Turbulence index for each case seqST(RS.seq) #COMPLEXITY seqici(RS.seq) #and now... LETS BUILD A TYPOLOGY FOR RESIDENTIAL STATUS HISTORIES #first we need to calculate the transitions rates and create an object that stores these transition rates RS.trate <- seqtrate(RS.seq) #then we need to create the transitions costs RS.seq.scost <- seqsubm(RS.seq, method = "TRATE") View(RS.seq.scost) seqsubm(RS.seq, method = "TRATE") #THEN we need to create a distance matrix based on OPTIMAL MATCHING algorithm RS.sec.full.distOM <- seqdist(RS.seq, method="OM", norm=TRUE, indel=1, sm=RS.seq.scost, with.missing=TRUE, full.matrix=TRUE) View(RS.sec.full.distOM) #building a typology is best done by applying a two-stage approach to cluster analysis #this means that we first need to perform hierarchical cluster analysis and then "correct it" by applying k-medoid (very similar to k-means) clustering #let's start with the hierarchical cluster analysis RS.wardCluster <- hclust(as.dist(RS.sec.full.distOM), method = "ward") RS.wC.clust8 <- cutree(RS.wardCluster, k = 8) RS.wC.clust7 <- cutree(RS.wardCluster, k = 7) RS.wC.clust6 <- cutree(RS.wardCluster, k = 6) RS.wC.clust5 <- cutree(RS.wardCluster, k = 5) RS.wC.clust4 <- cutree(RS.wardCluster, k = 4) RS.wC.clust3 <- cutree(RS.wardCluster, k = 3) RS.wC.clust2 <- cutree(RS.wardCluster, k = 2) #step 2: PAM clustering RS.pamwardclust2 <- wcKMedoids(RS.sec.full.distOM, k = 2, initialclust = RS.wardCluster) RS.pamwardclust3 <- wcKMedoids(RS.sec.full.distOM, k = 3, initialclust = RS.wardCluster) RS.pamwardclust4 <- wcKMedoids(RS.sec.full.distOM, k = 4, initialclust = RS.wardCluster) RS.pamwardclust5 <- wcKMedoids(RS.sec.full.distOM, k = 5, initialclust = RS.wardCluster) RS.pamwardclust6 <- wcKMedoids(RS.sec.full.distOM, k = 6, initialclust = RS.wardCluster) RS.pamwardclust7 <- wcKMedoids(RS.sec.full.distOM, k = 7, initialclust = RS.wardCluster) RS.pamwardclust8 <- wcKMedoids(RS.sec.full.distOM, k = 8, initialclust = RS.wardCluster) #getting some idea how final clusters will look like seqIplot(RS.seq, group = RS.wC.clust2) seqIplot(RS.seq, group = RS.wC.clust3) seqIplot(RS.seq, group = RS.wC.clust4) seqIplot(RS.seq, group = RS.wC.clust5) seqIplot(RS.seq, group = RS.wC.clust6) seqIplot(RS.seq, group = RS.wC.clust7) seqIplot(RS.seq, group = RS.wC.clust8) ##PAM cluster quality RS.pamwardclust2$stats RS.pamwardclust3$stats RS.pamwardclust4$stats RS.pamwardclust5$stats RS.pamwardclust6$stats RS.pamwardclust7$stats RS.pamwardclust8$stats #vizualizing the final solution seqdplot(RS.seq, group=maindata$RSPATHS, xtlab=xvalues25, border = NA) seqIplot(RS.seq, group=maindata$RSPATHS, xtlab=xvalues25, border = NA) seqrplot(RS.seq, criterion = "density", withlegend = F, group=maindata$RSPATHS, dist.matrix = RS.sec.full.distOM, tsim = 0.10, trep = 0.5, xtlab=xvalues25, cex.plot = 0.1) seqfplot(RS.seq, group=maindata$RSPATHS, withlegend = T, xtlab=xvalues25)