setwd("C:/Users/rvosy/Desktop/Sequence analysis seminar/")

library(TraMineR) #main package
library(cluster) #needed for building a typology
library(RColorBrewer) #this one is needed for building nice graphs
library(foreign)
library(WeightedCluster) #this one may also be used for cluster analysis


#lets read the data. The data should be saved in tab-delimited .dat format column names should be kept it

#the first command reads the data.
maindata <- read.delim("SAwebinar.dat")
#this command lets you view the data
View(maindata)
#this prints the names of variables in the console
names(maindata)


#we will start by analyzing residential status
#we need to introduce some labels to code quantitative values
#we need two types of labes: short ones and long ones. 

RS.labels <- c("Lives with parents", "Lives in temporary accomodation", "Lives in self-owned accmodation")
RS.shortlab <- c("LWP", "LTA", "LSOA")

#to build some of the charts we will need labels representing different columns
#values of x in the histogram 
xvalues25 <- c("finished school", "0.5 y.", "1 y.", "1.5 y.","2 y.", "2.5", "3 y.", "3.5 y.", "4 y.", "4.5 y.","5 y.", "5.5 y.", "6 y.", "6.5 y.")

#we also need to define values that we are interested in the sequences. This is called alphabet
#since there are three values in residential status variables, we need to specify what they are
#creating alphabet
alphabet.RS.seq <- c("1", "2", "3")

#and now... one the most important commands in sequence analysis.
#we need to say which string or variables represent sequences that we will analyze
#defining sequences:
#seqdef is the command we use to define sequence. First argument is file name, second-column numbers that represent sequences
#third and fourth argument are label objects that we created, fifth is the alhpabet
#This command will create a new object, which will later be used in subsequent analysis
RS.seq <- seqdef(maindata, var= 2:15, label=RS.labels, states=RS.shortlab, alphabet=alphabet.RS.seq)
View(RS.seq)
names(RS.seq)

#lets attribute some colors to the three states to build our cool charts
cpal(RS.seq) <- c("#fdb462", "#ffffb3", "#b3de69")
#if you want some different colors, go to https://colorbrewer2.org/#type=qualitative&scheme=Pastel1&n=6 to pick up the codes for the ones you like

#vizualizing the full sample
#we will now focus on some of the commands that help vizualizing the sequence data for the full sample
##overall sequence distribution plot
seqdplot(RS.seq, withlegend = T, border = T, xtlab=xvalues25)
#all sequences colored and vizualized in one plot
seqIplot(RS.seq, withlegend = T, xtlab=xvalues25, border = NA)


seqfplot(RS.seq, withlegend = T, xtlab=xvalues25)


#mean time spent in each state: on average how much time participants spent while being in specific state
seqmtplot(RS.seq, title = "Mean time", withlegend = F)

#entropy for each column and its change across time
seqHtplot(RS.seq)

seqtab(RS.seq)

#displaying transition probabilities
seqtrate(RS.seq)


#printing number of transitions for each individual sequence
seqtransn(RS.seq)

#frequency of each state for each sequence
seqistatd(RS.seq)

#within column entropy
seqstatd(RS.seq)

#printing entropy index for individual seqs
seqient(RS.seq)

#printing Turbulence index for each case
seqST(RS.seq)

#COMPLEXITY
seqici(RS.seq)



#and now... LETS BUILD A TYPOLOGY FOR RESIDENTIAL STATUS HISTORIES

#first we need to calculate the transitions rates and create an object that stores these transition rates
RS.trate <- seqtrate(RS.seq)

#then we need to create the transitions costs 
RS.seq.scost <- seqsubm(RS.seq, method = "TRATE")
View(RS.seq.scost)
seqsubm(RS.seq, method = "TRATE")

#THEN we need to create a distance matrix based on OPTIMAL MATCHING algorithm
RS.sec.full.distOM <- seqdist(RS.seq, method="OM", norm=TRUE, indel=1, sm=RS.seq.scost, with.missing=TRUE, full.matrix=TRUE)
View(RS.sec.full.distOM)

#building a typology is best done by applying a two-stage approach to cluster analysis
#this means that we first need to perform hierarchical cluster analysis and then "correct it" by applying k-medoid (very similar to k-means) clustering

#let's start with the hierarchical cluster analysis
RS.wardCluster <- hclust(as.dist(RS.sec.full.distOM), method = "ward")

RS.wC.clust8 <- cutree(RS.wardCluster, k = 8)
RS.wC.clust7 <- cutree(RS.wardCluster, k = 7)
RS.wC.clust6 <- cutree(RS.wardCluster, k = 6)
RS.wC.clust5 <- cutree(RS.wardCluster, k = 5)
RS.wC.clust4 <- cutree(RS.wardCluster, k = 4)
RS.wC.clust3 <- cutree(RS.wardCluster, k = 3)
RS.wC.clust2 <- cutree(RS.wardCluster, k = 2)

#step 2: PAM clustering
RS.pamwardclust2 <- wcKMedoids(RS.sec.full.distOM, k = 2, initialclust = RS.wardCluster)
RS.pamwardclust3 <- wcKMedoids(RS.sec.full.distOM, k = 3, initialclust = RS.wardCluster)
RS.pamwardclust4 <- wcKMedoids(RS.sec.full.distOM, k = 4, initialclust = RS.wardCluster)
RS.pamwardclust5 <- wcKMedoids(RS.sec.full.distOM, k = 5, initialclust = RS.wardCluster)
RS.pamwardclust6 <- wcKMedoids(RS.sec.full.distOM, k = 6, initialclust = RS.wardCluster)
RS.pamwardclust7 <- wcKMedoids(RS.sec.full.distOM, k = 7, initialclust = RS.wardCluster)
RS.pamwardclust8 <- wcKMedoids(RS.sec.full.distOM, k = 8, initialclust = RS.wardCluster)

#getting some idea how final clusters will look like
seqIplot(RS.seq, group = RS.wC.clust2)
seqIplot(RS.seq, group = RS.wC.clust3)
seqIplot(RS.seq, group = RS.wC.clust4)
seqIplot(RS.seq, group = RS.wC.clust5)
seqIplot(RS.seq, group = RS.wC.clust6)
seqIplot(RS.seq, group = RS.wC.clust7)
seqIplot(RS.seq, group = RS.wC.clust8)

##PAM cluster quality
RS.pamwardclust2$stats
RS.pamwardclust3$stats
RS.pamwardclust4$stats
RS.pamwardclust5$stats
RS.pamwardclust6$stats
RS.pamwardclust7$stats
RS.pamwardclust8$stats

#vizualizing the final solution
seqdplot(RS.seq, group=maindata$RSPATHS, xtlab=xvalues25, border = NA)
seqIplot(RS.seq, group=maindata$RSPATHS, xtlab=xvalues25, border = NA)
seqrplot(RS.seq, criterion = "density", withlegend = F, group=maindata$RSPATHS,  dist.matrix = RS.sec.full.distOM, tsim = 0.10, trep = 0.5, xtlab=xvalues25, cex.plot = 0.1)
seqfplot(RS.seq, group=maindata$RSPATHS, withlegend = T, xtlab=xvalues25)