## purcell_Transf&DF.R
# ------------------------------------------------------------------------------------------------------------------------
#  Part 1: Data preparation/transformation for DF-extremes analysis based on Purcell's means.gawk and prepare.gawk scripts
#  Author: Maciej Trzaskowski
#  Part 2: Maximum likelihood DF-extremes analysis based on Purcell S. & Sham PC, Behav Genet. 2003 May;33(3):271-8.
#  "A model-fitting implementation of the DeFries-Fulker model for selected twin data".
#  Author: Fruhling Rijsdijk
#  Date: 20.08.2013
#  Note: It is possible to run Part 1 as a separate script
# ------------------------------------------------------------------------------------------------------------------------


#  PART 1: Programming functions for transformation of data
# ------------------------------------------------------------------

#  Function:		df.std()

#  Purpose: to standardise data against zygosity specific proband and population means or to return descriptive statistics relevant to DF-extremes analysis

#  required arguments are:
#	-	x:		matrix or dataframe with three columns: zygosity, phenotype twin1 and phenotype tw2; NOTE: zygosity needs to be coded: MZ=1 and DZ=2
#	-	thr:		threshold - the cut off point indexing proband status
#	-	pr:		index whether the probands should be below or above the threshold; NOTE: defaults to "high", alternative option should be set to "low";
#	-	z:		defaults to FALSE expecting a value given to 'thr' as a raw value; if TRUE then 'thr' is defined in standard deviations;
#	-	means:	defaults to FALSE and retruns prepared dataset; if set to TRUE returns descriptive statistics instead; NOTE: if TRUE and assigned to an object it returns a list with all descriptives, the original data and the transformed data


.df.tr <- function(x, zyg, mz.mean, mz.pro, dz.mean, dz.pro){
 	xx <- as.data.frame(cbind(x,zyg))
 		yy <- ifelse( xx$zyg == 1, (xx$x-mz.mean)/(mz.pro-mz.mean), (xx$x-dz.mean)/(dz.pro-dz.mean))
 		return(yy)
 }
 
 
.zscore <- function(x){
				xx <- x[!is.na(x)]
				m <- mean(xx)
				v <- var(xx)
				z <- ( xx -  m ) / sqrt( v )
				x[!is.na(x)] <- z
				return(x)
		} 


 df.std <- function(x, thr, pr='high', z=FALSE, means=FALSE){
 	

	# check that x is a matrix or a data frame with three required columns
	if( is.matrix(x) || is.data.frame(x) && dim(x)[2] == 3){
		 	
 		colnames(x) = c('zyg', 'v1', 'v2')
 		xx = as.data.frame(na.omit(x))

		if(z==FALSE){
 	
 			if(pr == 'high'){
 				xx$p1 = ifelse( xx$v1 > thr, 1, 0)
 				xx$p2 = ifelse( xx$v2 > thr, 1, 0)
 			}
 			else{
 				xx$p1 = ifelse( xx$v1 < thr, 1, 0)
 				xx$p2 = ifelse( xx$v2 < thr, 1, 0)
 			}
		}
		if(z==TRUE){
 			if(pr == 'high'){
 				xx$p1 = ifelse( .zscore(xx$v1) > thr, 1, 0)
 				xx$p2 = ifelse( .zscore(xx$v2) > thr, 1, 0)
 			}
 			else{
 				xx$p1 = ifelse( .zscore(xx$v1) < thr, 1, 0)
 				xx$p2 = ifelse( .zscore(xx$v2) < thr, 1, 0)
 			}		
		}	

# ---------------------------------
# Grand means
	# mean for MZ twin1 and twin2
	mz1 <-subset(xx, zyg==1, 'v1')
	mz1n <- dim(mz1)[1]
	mz2 <- subset(xx, zyg==1, 'v2')
	mz2n <- dim(mz2)[1]
	mzM <- (sum(mz1)+sum(mz2))/(mz1n+mz2n)

	# mean for DZ twin1 and twin2
	dz1 <-subset(xx, zyg==2, 'v1')
	dz1n <- dim(dz1)[1]
	dz2 <- subset(xx, zyg==2, 'v2')
	dz2n <- dim(dz2)[1]
	dzM <- (sum(dz1)+sum(dz2))/(dz1n+dz2n)
	
	# total Mean
	totS <- sum(mz1)+sum(mz2)+sum(dz1)+sum(dz2)
	totN <- mz1n+mz2n+dz1n+dz2n
	totM <- totS/totN

# -----------------------------------	
# means for probands and co-twins

# ---------------------------------
# Double-enter data so that all probands are in one column and their co-twin in the other

	d1 <- xx
	d2 <- xx[,c(1,3,2,5,4)]
	names(d2) <- names(xx)
	d3 <- rbind(d1,d2)
	pro <- subset(d3, p1==1)
	
	# proband mean
	prM <- mean(pro$v1)
	coM <- mean(pro$v2)

	# Sample size by zygosity
	MZn <- dim(subset(pro,zyg==1))[1]
	DZn <- dim(subset(pro,zyg==2))[1]
	
	# Means by zygosity
	# Probands
	mzMp <- colMeans(subset(pro,zyg==1,'v1'))
	dzMp <- colMeans(subset(pro,zyg==2,'v1'))
	
	# Cotwins
	mzMc <- colMeans(subset(pro,zyg==1,'v2'))
	dzMc <- colMeans(subset(pro,zyg==2,'v2'))
	
	
	# MZ correlation
	rMZ <- (mzMc-totM)/(prM-totM)
	rDZ <- (dzMc-totM)/(prM-totM)
	h2g <- 2*(rMZ-rDZ)
		
	xx$vs1 <- .df.tr(xx$v1, xx$zyg, mzM, mzMp, dzM, dzMp)
	xx$vs2 <- .df.tr(xx$v2, xx$zyg, mzM, mzMp, dzM, dzMp)

		if(means==FALSE){	
			res <- xx[,c('zyg','vs1','vs2','p1','p2')]		
		}
		if(means==TRUE){
			if(pr=='high'){
				res <- list(
						'Total N ind'=totN,
						'Grand Mean'=totM,
						'MZ ind N'=(mz1n+mz2n),
						'MZ mean'=mzM,
						'DZ ind N'=(dz1n+dz2n),
						'DZ mean'=dzM,
						'Ind > thr'=(MZn+DZn),
						'Proband mean'=prM,
						'Cotwin mean'=coM,
						'MZ > thr'=MZn,
						'MZ proband mean'=mzMp,
						'MZ cotwin mean'=mzMc,
						'DZ > thr'=DZn,
						'DZ proband mean'=dzMp,
						'DZ cotwin mean'=dzMc,
						'Estimate of rMZ'=rMZ,
						'Estimate of rDZ'=rDZ,
						'Estimate of h2g'=h2g,
						x,
						xx
						)  						
					
				cat(
		 			"Total N ind. 			:", totN,
					"\nGrand Mean 			:", round(totM,2),
					"\n",
					"\nMZ ind N				:", (mz1n+mz2n),
					"\nMZ mean				:", round(mzM,2),
					"\n",
					"\nDZ ind N				:", (dz1n+dz2n),
					"\nDZ mean				:", round(dzM,2),
					"\n",
					"\nInd > thr			:", (MZn+DZn),
					"\nProband mean			:", round(prM,2),
					"\nCotwin mean			:", round(coM,2),
					"\n",
					"\nMZ > thr				:", MZn,
					"\nMZ proband mean		:", round(mzMp,2),
					"\nMZ cotwin mean			:", round(mzMc,2),
					"\n",
					"\nDZ > thr				:", DZn,
					"\nDZ proband mean		:", round(dzMp,2),
					"\nDZ cotwin mean			:", round(dzMc,2),
					"\n",
					"\nEstimate of rMZ		:", round(rMZ,2),
					"\nEstimate of rDZ		:", round(rDZ,2),
					"\nEstiamte of h2g		:", round(h2g,2),
					"\n"
					)
			}else{
				res <- list(
						'Total N ind'=totN,
						'Grand Mean'=totM,
						'MZ ind N'=(mz1n+mz2n),
						'MZ mean'=mzM,
						'DZ ind N'=(dz1n+dz2n),
						'DZ mean'=dzM,
						'Ind < thr'=(MZn+DZn),
						'Proband mean'=prM,
						'Cotwin mean'=coM,
						'MZ < thr'=MZn,
						'MZ proband mean'=mzMp,
						'MZ cotwin mean'=mzMc,
						'DZ < thr'=DZn,
						'DZ proband mean'=dzMp,
						'DZ cotwin mean'=dzMc,
						'Estimate of rMZ'=rMZ,
						'Estimate of rDZ'=rDZ,
						'Estimate of h2g'=h2g,
						x,
						xx
						)  				
				cat(
		 			"Total N ind. 			:", totN,
					"\nGrand Mean 			:", round(totM,2),
					"\n",
					"\nMZ ind N				:", (mz1n+mz2n),
					"\nMZ mean				:", round(mzM,2),
					"\n",
					"\nDZ ind N				:", (dz1n+dz2n),
					"\nDZ mean				:", round(dzM,2),
					"\n",
					"\nInd < thr			:", (MZn+DZn),
					"\nProband mean			:", round(prM,2),
					"\nCotwin mean			:", round(coM,2),
					"\n",
					"\nMZ < thr				:", MZn,
					"\nMZ proband mean		:", round(mzMp,2),
					"\nMZ cotwin mean			:", round(mzMc,2),
					"\n",
					"\nDZ < thr				:", DZn,
					"\nDZ proband mean		:", round(dzMp,2),
					"\nDZ cotwin mean			:", round(dzMc,2),
					"\n",
					"\nEstimate of rMZ		:", round(rMZ,2),
					"\nEstimate of rDZ		:", round(rDZ,2),
					"\nEstiamte of h2g		:", round(h2g,2),
					"\n"
					)
			}
		}					
	return(invisible(res))
	}else{
		stop("Your data is in unexpected format!")
	}
}


# ***********************************************************
# Data preparation/transformation for DF-extremes analysis 
# Run Function on original data file
# ***********************************************************

allVars<- c('id', 'zyg', 'v1', 'v2')
data <- read.table ('final.dat', header=F, sep="", na.strings="NA",col.names=allVars)
summary(data)
str(data)

selVars	<- c('zyg', 'v1', 'v2')
dat		<- subset(data, zyg<3, selVars)
summary(dat)
str(dat)

trData	<- df.std(dat, thr=1.8, pr='high', z=FALSE, means=F)
str(trData)

#  When means=F, the data frame 'trData' will have 5 col:
# 'zyg', the transformed scores 'vs1', 'vs2' and 'p1', 'p2', indicating
#  whether v1 and v2 were above (1) or below(0) the specified cutt-off point ('thr') 


# ******************************************************************************************************************* #
# PART 2: Running DF-extremes analysis using Shaun Purcell's ML model                                         
# Data file: Transformed Scores generated with df.std function																													  #
# ******************************************************************************************************************* #

require(psych)
require(OpenMx)

# Variables 
defs		<- c('p1', 'p2')
selVars	<- c('vs1', 'vs2', defs)
useVars	<- c('vs1', 'vs2')

mzData <- subset(trData, zyg==1, selVars)
dzData <- subset(trData, zyg==2, selVars)
describe(mzData)
describe(dzData)

# ___________________________________________________
# PREPARE GENETIC MODEL to compute group heritability
# ___________________________________________________

nv	<- 1

# Matrices to store a, c, v and and d values
gra    	<-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.3, lbound=0, ubound=1, labels=c("a"), name="A" ) 
grc    	<-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.1, lbound=0, ubound=1, labels=c("c"), name="C" )
resv   	<-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.3, labels=c("v"), name="V" )
difv   	<-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.1, labels=c("d"), name="D" )
ident		<-mxMatrix( type="Iden", nrow=nv, ncol=nv, name="I")
null		<-mxMatrix( type="Zero", nrow=nv, ncol=nv, name="Z")

# this is the 'box' to put the defintion variables in (indicated by 'data.')
defvars	<-mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c('data.p2','data.p1'), name="Y") 
	
# Algebra to generate Matrix to hold E 
gre		<-mxAlgebra( expression=I-(A+C), name="E" )

# # Algebra for expected Means Matrices in MZ & DZ twins
MZmean    <-mxAlgebra( expression= (A+C) %x% Y , name="expMeanMZ" )
DZmean    <-mxAlgebra( expression= (.5%x%A+C) %x% Y , name="expMeanDZ" )

# Algebra for expected Variance/Covariance Matrices in MZ & DZ twins
covMZ    <-mxAlgebra( expression= rbind(cbind(V,Z), cbind(Z,V)) +   rbind(cbind(D,Z), cbind(Z,D)) %*% vec2diag(Y) , name="expCovMZ" )
covDZ    <-mxAlgebra( expression= rbind(cbind(V,Z), cbind(Z,V)) +   rbind(cbind(D,Z), cbind(Z,D)) %*% vec2diag(Y) , name="expCovDZ" )

# Data objects for Multiple Groups
dataMZ   <-mxData( observed=mzData, type="raw" )
dataDZ   <-mxData( observed=dzData, type="raw" )

# Objective objects for Multiple Groups
objMZ	<-mxFIMLObjective( covariance="expCovMZ", means="expMeanMZ", dimnames=useVars )
objDZ	<-mxFIMLObjective( covariance="expCovDZ", means="expMeanDZ", dimnames=useVars )

# Combine Groups
pars		<-list(gra,grc,resv,difv, ident, null, gre)
modelMZ	<-mxModel( pars, defvars, MZmean, covMZ, dataMZ, objMZ, name="MZ" )
modelDZ	<-mxModel( pars, defvars, DZmean, covDZ, dataDZ, objDZ, name="DZ" )
minus2ll	<-mxAlgebra( expression=MZ.objective + DZ.objective, name="m2LL" )
obj		<-mxAlgebraObjective( "m2LL" )
ci		<-mxCI (c ('MZ.A[1,1]','MZ.C[1,1]','MZ.E[1,1]'   )  )
AceModel 	<-mxModel("ACE", pars, modelMZ, modelDZ, minus2ll, obj, ci)

# -------------------------------------------------------------------------------------------------------------------------------
# RUN Genetic Model

AceFit	<- mxRun(AceModel, intervals=T)
(AceSumm	<- summary(AceFit))

# OUTPUT specific parameters of interest of full model
AceFit$ACE.A@values
AceFit$ACE$A
AceFit$ACE$C
AceFit$ACE$E
AceFit$ACE$MZ.Y
AceFit$ACE$DZ.Y
AceFit$ACE$MZ.expMeanMZ
AceFit$ACE$DZ.expMeanDZ
AceFit$ACE$MZ.expCovMZ
AceFit$ACE$DZ.expCovDZ