###################################################################
	# 		Métodos Multivariados em Saúde - 2015         		#
	# 		Script #7: Análise de Agrupamentos				#
	# 		Prof. Lupercio F. Bessegato					#
	###################################################################

rm(list=ls()) # remove TODOS os objetos 

		# Figura clusters (Everitt, Fig. 6.1, pág. 165)

library("mvtnorm")

dados <- rbind(rmvnorm(25, mean = c(3,3)),
               rmvnorm(20, mean = c(10, 8)),
               rmvnorm(10, mean = c(20, 1)))
plot(abs(dados), xlab = expression(x[1]), ylab = expression(x[2]))

		# Figura distâncias entre clusters
		
			# Objetos

set.seed(29)
x1 <- c(0.7, 0.8, 0.85, 0.9, 1.1, 1, 0.95)
x <- c(x1, x1 + 1.5)
y1 <- sample(x1)
y <- c(y1, y1 + 1)

			# Simple linkage
			# Máxima similaridade

plot(x, y, main = "Ligação Simples")
lines(c(1, 0.7 + 1.5), c(1.1, 0.7 + 1), col = "grey")

			# Complete linkage
			# Mínima similaridade

plot(x, y, main = "Ligação Completa")
lines(c(0.7, 2.5), c(0.7, 1.1 + 1), col = "grey")

			# Average linkage

plot(x, y, main = "Ligação Média")
for (i in 1:7) {
     	for (j in 8:14) lines(x[c(i, j)], y[c(i, j)], 
		col = rgb(0.1, 0.1, 0.1, 0.1))
 	}

			# Centróide

x1.m <- mean(x1)
y1.m <- mean(y1)
x2.m <- x1.m + 1.5
y2.m <- y1.m + 1

plot(x, y, main = "Centróide")
points(c(x1.m, x2.m), c(y1.m, y2.m), pch = 19)
text(x1.m, y1.m, expression(C[1]), cex = 0.8, adj = 1.5)
text(x2.m, y2.m, expression(C[2]), cex = 0.8, adj = -0.5)
lines(c(x1.m, x2.m), c(y1.m, y2.m), col = "grey")



		# +++++++++++++++++++  Dados de Vida  ++++++++++++++++++++++++
		# Medidas de corpo humano
		# Medidas de peito, cintura e quadris em 20 indivíduos,
		# homens e mulheres
		# Apresentado em Everitt e Hothorn (2011)
		# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
			
			#  Carregamento de Dados

medidas <- read.csv2("measure_data.csv", header = T)

			# Distâncias
		
	# distancia euclidiana, dados não padronizados

(dm <- dist(medidas[,c("chest", "waist", "hips")], method = "euclidean"))
	# opções: "maximum", "manhattan", "canberra", "binary" ou "minkowski".
m <- as.matrix(dm)
d <- as.dist(m)

			# Plots dendogramas

y.rot <- "Distância"

	# single linkage

cs <- hclust(dm, method = "single")
cs$merge
cs$height
cs$labels
cbind(cs$merge, cs$height)
plot(cs$height, type = "l")
abline( h = 3.8, col = "lightgrey")
plot(cs, main = "Ligação Simples", ylab = y.rot)
abline( h = 3.8, col = "lightgrey")

	# complete linkage

cc <- hclust(dm, method = "complete")
cbind(cc$merge, cc$height)
plot(cc$height, type = "l")
abline( h = 10, col = "lightgrey")
plot(cc, main = "Ligação Completa", ylab = y.rot)
abline(h = 10, col = "lightgrey")

	# average linkage

ca <- hclust(dm, method = "average")
cbind(ca$merge, ca$height)
plot(ca$height, type = "l")
abline(h = 7.8, col = "lightgrey")
plot(ca, main = "Ligação Média", ylab = y.rot)
abline(h = 7.8, col = "lightgrey")

	# centroid

cm <- hclust(dm, method = "centroid")
cbind(cm$merge, cm$height)
plot(cm$height, type = "l")
abline(h = 4.3, col = "lightgrey")
plot(cm, main = "Centróide", ylab = y.rot)
abline(h = 4.3, col = "lightgrey")

	# ward
cw <- hclust(dm, method = "ward")
cbind(cw$merge, cw$height)
plot(cw$height, type = "l")
abline( h = 15.3, col = "lightgrey")
plot(cw, main = "Ward", ylab = y.rot)
abline( h = 15.3, col = "lightgrey")

			# Componentes principais

medidas_cp <- princomp(dm, cor = TRUE)
xlim <- range(medidas_cp$scores[,1])
plot(medidas_cp$scores[,1:2], type = "n", xlim = xlim, ylim = xlim)
lab <- cutree(cs, h = 3.8)	# k: integer scalar or vector with 
					#    the desired number of groups
					# h: numeric scalar or vector with 
					#    heights where the tree should be cut.
text(medidas_cp$scores[,1:2], labels = lab, cex = 0.6)


		# Figura 6.4 (métodos comparados)

medidas_cp <- princomp(dm, cor = TRUE)
xlim <- range(medidas_cp$scores[,1])

layout(matrix(1:6, nr = 2), height = c(2, 1))

	# Simples

plot(cs <- hclust(dm, method = "single"), main = "Simples", 
	ylab = "Distância")
abline(h = 3.8, col = "lightgrey")


plot(medidas_cp$scores[,1:2], type = "n", xlim = xlim, ylim = xlim,
      xlab = "CP1", ylab = "CP2")
lab <- cutree(cs, h = 3.8)
text(body_pc$scores[,1:2], labels = lab, cex=0.6)

	# Completa

plot(cc <- hclust(dm, method = "complete"), main = "Completa",
	ylab = "Distância")
abline(h = 10, col = "lightgrey")
plot(medidas_cp$scores[,1:2], type = "n", xlim = xlim, ylim = xlim,
      xlab = "CP1", ylab = "CP2")
lab <- cutree(cc, h = 10)  
text(body_pc$scores[,1:2], labels = lab, cex=0.6)     

	# Média

plot(ca <- hclust(dm, method = "average"), main = "Média",
	ylab = "Distância")
abline(h = 7.8, col = "lightgrey")
plot(body_pc$scores[,1:2], type = "n", xlim = xlim, ylim = xlim,
      xlab = "CP1", ylab = "CP2")
lab <- cutree(ca, h = 7.8)                             
text(body_pc$scores[,1:2], labels = lab, cex=0.6)     

	# Alternativas para construção dos dendrogramas

par(mfrow=c(1,3))

plclust(hclust(dm, method = "single"), labels = row.names(medidas),
	   ylab="Distância")
title("(a) Single linkage")

plclust(hclust(dm, method = "complete"), 
	   labels = row.names(medidas), ylab = "Distância")
title("(b) Complete linkage")

plclust(hclust(dm, method = "average"), labels = row.names(medidas),
	   ylab="Distância")
title("(c) Average linkage")
#
grupos<-cutree(hclust(dm, method = "complete"), k = 2)

#
dados <- medidas[,c("chest", "waist", "hips")]
k = 2
medidas.clus<-lapply(1:k, function(nc) row.names(dados)[grupos == nc])
medidas.mean<-lapply(1:k, function(nc) apply(dados[grupos == nc,], 2, mean))
medidas.mean
medidas.clus


		# +++++++++++++++++++  Jet Data  ++++++++++++++++++++++++
		# Dados de caças norte-americanos
		# Variáveis:
		#	FFD: 	data 1o vôo, meses após Jan, 1940
		#	SPR: 	potência específica, proporcional à potência por
		#		unidade de peso
		#	RGF:	flight range factor
		#	PLF:	payload as a fraction of gross weight of aircraft
		#	SLF:	sustained load factor
		#	CAR:	indicadora de a aeronave pousar em porta-aviões
		#
		# Fonte: Stanley and Miller (1979) e Hand et al. (1994)
		# Apresentado em Everitt e Hothorn (2011)
		# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
			
			#  Carregamento de Dados

jato <- read.csv2("jet-data.csv", header = T)
jato$CAR <- factor(jato$CAR)
row.names(jato) <- jato$TYP

# Variáveis estão em diferentes escalas
# Padronização para a variância unitária

			# Distância

X <- scale(jato[, c("SPR", "RGF", "PLF", "SLF")],
		center = FALSE, scale = TRUE)
dj <- dist(X)

			# Clustering - Complete linkage (defaut)

plot( cc <- hclust(dj), main = "Agrupamento de Jatos",
	ylab = "Distância", xlab="Tipo de jato")
cc
cbind(cc$merge, cc$height)
plot(cc$height, type = "l")
abline(h = 7.8, col = "lightgrey")

			# Dendogram - Alternatives

	# Rótulos no mesmo nível

plot(cc, hang = -1, main = "Agrupamento de Jatos",
	ylab = "Distância", xlab="Tipo de jato")

	# Redução tamanho rótulos

par(cex=0.8, mar=c(5, 8, 4, 1))
plot(cc, xlab="", ylab="", main="", sub="", axes=FALSE)
par(cex=1)
title(xlab="Tipo de jato", ylab="Distância", main="Agrupamento de Jatos")
axis(2)

	# using dendrogram objects

ccd = as.dendrogram(cc)
plot(ccd)

	# triangular dendrogram

plot(ccd, type = "triangle", main = "Agrupamento de Jatos",
	ylab = "Distância", xlab="Tipo de jato")


	# Horizontal dendrogram

plot(ccd, main = "Agrupamento de Jatos",
	ylab = "Distância", xlab="Tipo de jato", horiz = TRUE)

	# Zooming-in on dendrograms
	#

#plot dendrogram with some cuts

op = par(mfrow = c(2, 1))
plot(cut(ccd, h = 1.5)$upper, main = "Upper tree of cut at h=1.5")

plot(cut(ccd, h = 1.5)$lower[[2]],
     main = "Second branch of lower tree with cut at h=1.5")

	# Phylogenetic trees

library(ape)


plot(as.phylo(cc), cex = 0.9, label.offset = 0.1) 	# plot basic tree

plot(as.phylo(cc), type="cladogram", cex = 0.9, 	# cladogram
	label.offset = 0.1) 

plot(as.phylo(cc), type = "unrooted")			# unrooted

plot(as.phylo(cc), type = "fan")			# fan

plot(as.phylo(cc), type = "radial")			# radial

	# tweek some parameters according to our needs

# vector of colors
mypal = c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")

# cutting dendrogram in 3 clusters
clus5 = cutree(cc, 3)

# plot
op = par(bg="#E8DDCB")

# Size reflects FFD
plot(as.phylo(cc), type = "fan", tip.color = mypal[clus5], label.offset = 1,
cex = log(jato$FFD,10), col = "red")


			# Componentes Principais

pr <- prcomp(dj)$x[,1:2]
plot(pr, pch = (1:2)[cutree(cc, k = 2)],
	col = c("black", "darkgrey")[jato$CAR],
	xlim = range(pr) * c(1, 1.5),
	xlab = "CP1", ylab = "CP2")
legend("topright", col = c("black", "black", 
				   "darkgrey", "darkgrey"),
	 legend = c("1 / Não", "2 / Não", "1 / Sim", "2 / Sim"),
	 pch = c(1:2, 1:2), title = "Grupo/Navio",
	 bty = "n")


		# +++++++++++++++++++  Dados de Vida  ++++++++++++++++++++++++
		# Dados de expectativa de vida, em anos, nos anos 60
		# por país, idade e sexo
		# Dados originais: Keyfitz e Flieger (1971)
		# Apresentado em Everitt e Hothorn (2011)
		# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
			
			#  Carregamento de Dados

life<-source("http://www.york.ac.uk/depts/maths/data/everitt/chap4lifeexp.dat")$value

			# Dendrogramas

par(mfrow=c(1,3))
plclust(hclust(dist(life), method = "single"), labels = row.names(life),
	  ylab="Distância")
title("(a) Single linkage")
plclust(hclust(dist(life), method = "complete"), labels = row.names(life),
	  ylab = "Distãncia")
title("(b) Complete linkage")
plclust(hclust(dist(life), method = "average"), labels = row.names(life),
	  ylab = "Distância")
title("(c) Average linkage")
#
four<-cutree(hclust(dist(life),method="complete"),h=21)
 
#
#
country.clus<-lapply(1:5,function(nc) row.names(life)[four==nc])
country.mean<-lapply(1:5,function(nc) apply(life[four==nc,],2,mean))
country.mean
country.clus
#
#
dev.off()
#
pairs(life,panel=function(x,y) text(x,y,four))
#


		# +++++++++++++++++++  Crime Data  +++++++++++++++++++++++++++++
		# Taxas de diferentes tipos de crimes por 100.000 habitantes,
		# dos 50 estados dos EUA mais o Distrito de Columbia, em 1986
		# Variáveis:
		#	Murder: assassinato
		#	Rape:	  estupro
		#	Robbery:
		#	Assault:
		#	Burglary:
		#	Theft:
		#	Vehicle:
		#
		# Fonte: Statistical Abstract of USA (1988)
		# Apresentado em Everitt e Hothorn (2011)
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
			
			#  Carregamento de Dados

crime <- read.csv2("crime.csv", header = T)
head(crime)
row.names(crime) <- crime$X
crime <- crime[, 2:8]
			# Exploração de dados

plot(crime, pch = ".", cex = 1.5)

subset(crime, Murder > 15)

plot(crime, pch = c(".", "+")[(rownames(crime) == "DC") + 1], cex = 1.5)

sapply(crime, var)

amp <- sapply(crime, function(x) diff(range(x)))
crime.p <- sweep(crime, 2, amp, FUN = "/")
sapply(crime.p, var)
		
			# Clustering

n <- nrow(crime_s)

sq.dentro <- rep(0, 6)

sq.dentro[1] <- (n - 1) * sum(sapply(crime.p, var))

for (i in 2:6)
     sq.dentro[i] <- sum(kmeans(crime.p,
                          centers = i)$withinss)

plot(1:6, sq.dentro, type = "b", xlab = "Número de grupos",
      ylab = "Soma de quadrados dentros dos grupos")

kmeans(crime.p, centers = 2)$centers * amp	# médias dos grupos

crime.acp <- prcomp(crime.p)

plot(crime.acp$x[, 1:2], pch = kmeans(crime.p, centers = 2)$cluster,
	xlab = "CP1", ylab = "CP2")
legend("topright", legend = c(" I ", " II "),
	 pch = 1:2, 
	 title = "Grupo", bty = "n")


		# +++++++++++++++++++  Pottery Data  +++++++++++++++++++++++++++++++
		# Dados de análise química em 45 potes de cerâmica romano-britânica,
		# em 3 regiões 
		#	
		# Variáveis:
		#	Al2O3: 
		#	Fe2O3: 
		#	MgO:	
		#	K2O:	
		#	TiO2:	
		#	MnO:	
		#	BaO:	
		#	kiln:	forno: 1 (Região 1),  2 e 3 (Região 2), 4 e 5 (Região 3)
		#
		# Fonte: Tubb et al. (1980)
		# Apresentado em Everitt e Hothorn (2011)
		# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Questão interessante:
#	Se os perfis químicos de cada pote sugerem diferentes tipos de potes e
#	se qualquer destes tipos estão relacionados ao forno ou à região

			#  Carregamento de Dados

source("chap6pottery.dat")
ceramica <- pottery.dat
kiln <- rep(c(1:5), c(21, 12, 2, 5, 5))
ceramica <- cbind(ceramica, kiln)
head(ceramica)

			# Distância Euclidiana

ceramica.dist <- dist(potes <- scale(ceramica[, colnames(ceramica) != "kiln"], 
                                    center = FALSE))

			# Inspeção gráfica da matriz de distâncias

library(lattice)
levelplot(as.matrix(ceramica.dist), xlab = "Pote no.",
           ylab = "Pote no.")

# grafico sem numeros dos potes
trellis.par.set(standard.theme(color = FALSE))
plot(levelplot(as.matrix(ceramica.dist), xlab = "Pote no.", ylab = "Pote No.",
      scales = list(x = list(draw = FALSE), y = list(draw = FALSE))))

			# Clustering

n <- nrow(potes)
				# Soma quadrados grupos

sq.dentro <- rep(0, 6)		
sq.dentro[1] <- (n - 1) * sum(sapply(potes, var))

for (i in 2:6)
     sq.dentro[i] <- sum(kmeans(potes,
                          centers = i)$withinss)

plot(1:6, sq.dentro, type = "b", xlab = "Número de grupos",
      ylab = "Soma de quadrados dentros dos grupos")

				# Componentes principais

potes.acp <- prcomp(potes)

plot(potes.acp$x[, 1:2], 
      pch = kmeans(potes, centers = 3)$cluster,
	xlab = "CP1", ylab = "CP2")

				# Tabela Cluster x Fornos

set.seed(29)
ceramica.cluster <- kmeans(potes, centers = 3)$cluster
xtabs(~ ceramica.cluster + kiln, data = ceramica)


		# +++++++++++++++++++  Thomsen Data  +++++++++++++++++++++++++++++++
		# O que gastroenterologistas dizem a seus pacientes?
		#	Questionário enviado a aproximadamente 600 gastroenterologistas
		#	em 27 países europeus.
		#	Estudo realizados antes da configuração atual do mapa europeu
		#
		# Objetivo: perguntar o que diriam a paciente com diagnóstico recente
		#	de câncer de colo e a seu marido/esposa, sobre o diagnóstico
		#
		# Questões: respostas Sim ou Não
		#	Q1: 	Você diria a seu paciente que ele tem câncer, caso ele 
		#		não tenha perguntado? 
		#	Q2:	Você diria a marido/esposa que o paciente tem câncer
		#		(na ausência do paciente) 
		#	Q3:	Você diria ao paciente que ele tem câncer, caso ele 
		#		peça pergunte diretamente a você sobre o diagnóstico?
		# (Durante cirurgia verifica-se muitas pequenas metástases no fígado)
		#	Q4:	Você diria ao paciente sobre as metástases (supondo
		#		que o paciente pergunte sobre os resultados da cirurgia)?	
		#	Q5:	Você diria ao paciente que sua condição é incurável?	
		#	Q6:	Você diria a esposa/marido que a cirurgia revelou 
		#		metástases?	
		#
		# Fonte: Thomsen, Wulff, Martin e Singer (1993)
		# Apresentado em Everitt e Hothorn (2011)
		# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

			# Carregamento do conjunto de dados
source("thomsen.txt")
head(thomsen)

			# Exploratória

	# Tabulação dados
thomsen.tab <- xtabs(Freq ~ pais + resposta + questao, data = thomsen)

	# matriz de proporção de respostas "Sim" às perguntas Q1-Q6
thomsenprop <- prop.table(thomsen.tab, c(1,3))[,"sim",]	

	# Apresentação gráfica dos dados
plot(1:(22 * 6), rep(-1, 22 * 6), 
      ylim = c(-nlevels(thomsen$pais), -1), type = "n",
      axes = FALSE, xlab = "", ylab = "")
for (q in 1:6) {   
	tmp <- thomsen.tab[,,q]
	xstart <- (q - 1) * 22 + 1
	y <- -rep(1:nrow(tmp), rowSums(tmp))
	x <- xstart + unlist(sapply(rowSums(tmp), function(i) 1:i))
	pch <- unlist(apply(tmp, 1, function(x) c(rep(19, x[2]), rep(1, x[1]))))
	points(x, y, pch = pch)
}
axis(2, at = -(1:nlevels(thomsen$pais)), labels = levels(thomsen$pais), 
	cex = 0.5, las = 2, tick = FALSE, line = -1)
mtext(text = paste("Quest. ", 1:6), 3, at = 22 * (0:5), adj = 0, cex = 0.9)

			# Clustering - Finite Mixture

library(mclust)
	# Objeto 'Mclust'
(thomsen.mc <- Mclust(thomsenprop))
# melhor modelo (ellipsoidal, equal volume (EVV) c/ 2 componentes

	# Plot valores do BIC para questionário
plot(thomsen.mc, thomsenprop, what = "BIC", col = "black", 
	xlab = "Número de componentes")

cl <- thomsen.mc$classification

nm <- unlist(sapply(1:3, function(i) names(cl[cl == i])))

ttab <- thomsen.tab[nm,,]

par(mar = c(2, 5, 2, 2))
plot(1:(22 * 6), rep(-1, 22 * 6), 
      ylim = c(-nlevels(thomsen$pais), -1), type = "n",
      axes = FALSE, xlab = "", ylab = "")
for (q in 1:6) {   
    tmp <- ttab[,,q]
    xstart <- (q - 1) * 22 + 1
    y <- -rep(1:nrow(tmp), rowSums(tmp))
    x <- xstart + unlist(sapply(rowSums(tmp), function(i) 1:i))
    pch <- unlist(apply(tmp, 1, function(x) c(rep(19, x[2]), rep(1, x[1]))))
    points(x, y, pch = pch)
}
axis(2, at = -(1:nlevels(thomsen$pais)), labels = dimnames(ttab)[[1]],
      las = 2, tick = FALSE, line = 0)
mtext(text = paste("Quest. ", 1:6), 3, at = 22 * (0:5), adj = 0)

abline(h = -cumsum(table(cl))[-3] - 0.5, col = "grey")

text(-c(4.5, 4.5), -cumsum(table(cl)) + table(cl)/2,
      label = paste("Cluster", 1:2), srt = 90, pos = 1, cex = 0.9)

		###################################################
		###  Displaying clustering solutions graphically
		###################################################

			# Neighbourhood Plot - Primeiro exemplo

library(flexclust)
library("mvtnorm")

				# Geração conjunto de dados bivariados

	#random number generator for the multivariate normal distribution 
set.seed(290875)
x <- rbind(rmvnorm(n = 20, mean = c(0, 0), 
                    sigma = diag(2)),
            rmvnorm(n = 20, mean = c(3, 3), 
                    sigma = 0.5 * diag(2)),
            rmvnorm(n = 20, mean = c(7, 6), 
                    sigma = 0.5 * (diag(2) + 0.25)))

	# Perform k-means clustering
k <- cclust(x, k = 5, save.data = TRUE)

	# Neighbourhood plot of k-means 5-cluster solution
plot(k, hull = FALSE, col = rep("black", 5), xlab = "x", ylab = "y")

		# Neighbourhood Plot - Exemplo 2 - Pottery Data

			# Carregamento dos dados - Cálculos prévios

source("chap6pottery.dat")
ceramica <- pottery.dat
kiln <- rep(c(1:5), c(21, 12, 2, 5, 5))
ceramica <- cbind(ceramica, kiln)
potes <- scale(ceramica[, colnames(ceramica) != "kiln"], center = FALSE)
potes.acp <- prcomp(potes)

	# Perform k-means clustering
k <- cclust(potes, k = 3, save.data = TRUE)

	# Neighbourhood plot of k-means 3-cluster solution
plot(k, project = prcomp(potes), hull = FALSE, col = rep("black", 3),
      xlab = "CP1", ylab = "CP2")     

		# Stripes Plot - Exemplo 3

set.seed(912345654)
x3 <- rbind(matrix(rnorm(100, sd = 0.5), ncol= 2 ),
           matrix(rnorm(100, mean =4, sd = 0.5), ncol = 2),
           matrix(rnorm(100, mean =7, sd = 0.5), ncol = 2),
           matrix(rnorm(100, mean =-1.0, sd = 0.7), ncol = 2),
           matrix(rnorm(100, mean =-4.0, sd = 1.0), ncol = 2))

	# Perform k-means clustering
c3.5 <- cclust(x3, 5, save.data = TRUE)

	# Stripes plot of k-means 5-cluster solution
stripes(c3.5, type = "second", col = 1, ylab = "Distância do centróide")

	# Neighbourhood plot of k-means 5-cluster solution
plot(c3.5, hull = FALSE, col =