## purcell_Transf&DF.R # ------------------------------------------------------------------------------------------------------------------------ # Part 1: Data preparation/transformation for DF-extremes analysis based on Purcell's means.gawk and prepare.gawk scripts # Author: Maciej Trzaskowski # Part 2: Maximum likelihood DF-extremes analysis based on Purcell S. & Sham PC, Behav Genet. 2003 May;33(3):271-8. # "A model-fitting implementation of the DeFries-Fulker model for selected twin data". # Author: Fruhling Rijsdijk # Date: 20.08.2013 # Note: It is possible to run Part 1 as a separate script # ------------------------------------------------------------------------------------------------------------------------ # PART 1: Programming functions for transformation of data # ------------------------------------------------------------------ # Function: df.std() # Purpose: to standardise data against zygosity specific proband and population means or to return descriptive statistics relevant to DF-extremes analysis # required arguments are: # - x: matrix or dataframe with three columns: zygosity, phenotype twin1 and phenotype tw2; NOTE: zygosity needs to be coded: MZ=1 and DZ=2 # - thr: threshold - the cut off point indexing proband status # - pr: index whether the probands should be below or above the threshold; NOTE: defaults to "high", alternative option should be set to "low"; # - z: defaults to FALSE expecting a value given to 'thr' as a raw value; if TRUE then 'thr' is defined in standard deviations; # - means: defaults to FALSE and retruns prepared dataset; if set to TRUE returns descriptive statistics instead; NOTE: if TRUE and assigned to an object it returns a list with all descriptives, the original data and the transformed data .df.tr <- function(x, zyg, mz.mean, mz.pro, dz.mean, dz.pro){ xx <- as.data.frame(cbind(x,zyg)) yy <- ifelse( xx$zyg == 1, (xx$x-mz.mean)/(mz.pro-mz.mean), (xx$x-dz.mean)/(dz.pro-dz.mean)) return(yy) } .zscore <- function(x){ xx <- x[!is.na(x)] m <- mean(xx) v <- var(xx) z <- ( xx - m ) / sqrt( v ) x[!is.na(x)] <- z return(x) } df.std <- function(x, thr, pr='high', z=FALSE, means=FALSE){ # check that x is a matrix or a data frame with three required columns if( is.matrix(x) || is.data.frame(x) && dim(x)[2] == 3){ colnames(x) = c('zyg', 'v1', 'v2') xx = as.data.frame(na.omit(x)) if(z==FALSE){ if(pr == 'high'){ xx$p1 = ifelse( xx$v1 > thr, 1, 0) xx$p2 = ifelse( xx$v2 > thr, 1, 0) } else{ xx$p1 = ifelse( xx$v1 < thr, 1, 0) xx$p2 = ifelse( xx$v2 < thr, 1, 0) } } if(z==TRUE){ if(pr == 'high'){ xx$p1 = ifelse( .zscore(xx$v1) > thr, 1, 0) xx$p2 = ifelse( .zscore(xx$v2) > thr, 1, 0) } else{ xx$p1 = ifelse( .zscore(xx$v1) < thr, 1, 0) xx$p2 = ifelse( .zscore(xx$v2) < thr, 1, 0) } } # --------------------------------- # Grand means # mean for MZ twin1 and twin2 mz1 <-subset(xx, zyg==1, 'v1') mz1n <- dim(mz1)[1] mz2 <- subset(xx, zyg==1, 'v2') mz2n <- dim(mz2)[1] mzM <- (sum(mz1)+sum(mz2))/(mz1n+mz2n) # mean for DZ twin1 and twin2 dz1 <-subset(xx, zyg==2, 'v1') dz1n <- dim(dz1)[1] dz2 <- subset(xx, zyg==2, 'v2') dz2n <- dim(dz2)[1] dzM <- (sum(dz1)+sum(dz2))/(dz1n+dz2n) # total Mean totS <- sum(mz1)+sum(mz2)+sum(dz1)+sum(dz2) totN <- mz1n+mz2n+dz1n+dz2n totM <- totS/totN # ----------------------------------- # means for probands and co-twins # --------------------------------- # Double-enter data so that all probands are in one column and their co-twin in the other d1 <- xx d2 <- xx[,c(1,3,2,5,4)] names(d2) <- names(xx) d3 <- rbind(d1,d2) pro <- subset(d3, p1==1) # proband mean prM <- mean(pro$v1) coM <- mean(pro$v2) # Sample size by zygosity MZn <- dim(subset(pro,zyg==1))[1] DZn <- dim(subset(pro,zyg==2))[1] # Means by zygosity # Probands mzMp <- colMeans(subset(pro,zyg==1,'v1')) dzMp <- colMeans(subset(pro,zyg==2,'v1')) # Cotwins mzMc <- colMeans(subset(pro,zyg==1,'v2')) dzMc <- colMeans(subset(pro,zyg==2,'v2')) # MZ correlation rMZ <- (mzMc-totM)/(prM-totM) rDZ <- (dzMc-totM)/(prM-totM) h2g <- 2*(rMZ-rDZ) xx$vs1 <- .df.tr(xx$v1, xx$zyg, mzM, mzMp, dzM, dzMp) xx$vs2 <- .df.tr(xx$v2, xx$zyg, mzM, mzMp, dzM, dzMp) if(means==FALSE){ res <- xx[,c('zyg','vs1','vs2','p1','p2')] } if(means==TRUE){ if(pr=='high'){ res <- list( 'Total N ind'=totN, 'Grand Mean'=totM, 'MZ ind N'=(mz1n+mz2n), 'MZ mean'=mzM, 'DZ ind N'=(dz1n+dz2n), 'DZ mean'=dzM, 'Ind > thr'=(MZn+DZn), 'Proband mean'=prM, 'Cotwin mean'=coM, 'MZ > thr'=MZn, 'MZ proband mean'=mzMp, 'MZ cotwin mean'=mzMc, 'DZ > thr'=DZn, 'DZ proband mean'=dzMp, 'DZ cotwin mean'=dzMc, 'Estimate of rMZ'=rMZ, 'Estimate of rDZ'=rDZ, 'Estimate of h2g'=h2g, x, xx ) cat( "Total N ind. :", totN, "\nGrand Mean :", round(totM,2), "\n", "\nMZ ind N :", (mz1n+mz2n), "\nMZ mean :", round(mzM,2), "\n", "\nDZ ind N :", (dz1n+dz2n), "\nDZ mean :", round(dzM,2), "\n", "\nInd > thr :", (MZn+DZn), "\nProband mean :", round(prM,2), "\nCotwin mean :", round(coM,2), "\n", "\nMZ > thr :", MZn, "\nMZ proband mean :", round(mzMp,2), "\nMZ cotwin mean :", round(mzMc,2), "\n", "\nDZ > thr :", DZn, "\nDZ proband mean :", round(dzMp,2), "\nDZ cotwin mean :", round(dzMc,2), "\n", "\nEstimate of rMZ :", round(rMZ,2), "\nEstimate of rDZ :", round(rDZ,2), "\nEstiamte of h2g :", round(h2g,2), "\n" ) }else{ res <- list( 'Total N ind'=totN, 'Grand Mean'=totM, 'MZ ind N'=(mz1n+mz2n), 'MZ mean'=mzM, 'DZ ind N'=(dz1n+dz2n), 'DZ mean'=dzM, 'Ind < thr'=(MZn+DZn), 'Proband mean'=prM, 'Cotwin mean'=coM, 'MZ < thr'=MZn, 'MZ proband mean'=mzMp, 'MZ cotwin mean'=mzMc, 'DZ < thr'=DZn, 'DZ proband mean'=dzMp, 'DZ cotwin mean'=dzMc, 'Estimate of rMZ'=rMZ, 'Estimate of rDZ'=rDZ, 'Estimate of h2g'=h2g, x, xx ) cat( "Total N ind. :", totN, "\nGrand Mean :", round(totM,2), "\n", "\nMZ ind N :", (mz1n+mz2n), "\nMZ mean :", round(mzM,2), "\n", "\nDZ ind N :", (dz1n+dz2n), "\nDZ mean :", round(dzM,2), "\n", "\nInd < thr :", (MZn+DZn), "\nProband mean :", round(prM,2), "\nCotwin mean :", round(coM,2), "\n", "\nMZ < thr :", MZn, "\nMZ proband mean :", round(mzMp,2), "\nMZ cotwin mean :", round(mzMc,2), "\n", "\nDZ < thr :", DZn, "\nDZ proband mean :", round(dzMp,2), "\nDZ cotwin mean :", round(dzMc,2), "\n", "\nEstimate of rMZ :", round(rMZ,2), "\nEstimate of rDZ :", round(rDZ,2), "\nEstiamte of h2g :", round(h2g,2), "\n" ) } } return(invisible(res)) }else{ stop("Your data is in unexpected format!") } } # *********************************************************** # Data preparation/transformation for DF-extremes analysis # Run Function on original data file # *********************************************************** allVars<- c('id', 'zyg', 'v1', 'v2') data <- read.table ('final.dat', header=F, sep="", na.strings="NA",col.names=allVars) summary(data) str(data) selVars <- c('zyg', 'v1', 'v2') dat <- subset(data, zyg<3, selVars) summary(dat) str(dat) trData <- df.std(dat, thr=1.8, pr='high', z=FALSE, means=F) str(trData) # When means=F, the data frame 'trData' will have 5 col: # 'zyg', the transformed scores 'vs1', 'vs2' and 'p1', 'p2', indicating # whether v1 and v2 were above (1) or below(0) the specified cutt-off point ('thr') # ******************************************************************************************************************* # # PART 2: Running DF-extremes analysis using Shaun Purcell's ML model # Data file: Transformed Scores generated with df.std function # # ******************************************************************************************************************* # require(psych) require(OpenMx) # Variables defs <- c('p1', 'p2') selVars <- c('vs1', 'vs2', defs) useVars <- c('vs1', 'vs2') mzData <- subset(trData, zyg==1, selVars) dzData <- subset(trData, zyg==2, selVars) describe(mzData) describe(dzData) # ___________________________________________________ # PREPARE GENETIC MODEL to compute group heritability # ___________________________________________________ nv <- 1 # Matrices to store a, c, v and and d values gra <-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.3, lbound=0, ubound=1, labels=c("a"), name="A" ) grc <-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.1, lbound=0, ubound=1, labels=c("c"), name="C" ) resv <-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.3, labels=c("v"), name="V" ) difv <-mxMatrix( type="Lower", nrow=nv, ncol=nv, free=TRUE, values=.1, labels=c("d"), name="D" ) ident <-mxMatrix( type="Iden", nrow=nv, ncol=nv, name="I") null <-mxMatrix( type="Zero", nrow=nv, ncol=nv, name="Z") # this is the 'box' to put the defintion variables in (indicated by 'data.') defvars <-mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c('data.p2','data.p1'), name="Y") # Algebra to generate Matrix to hold E gre <-mxAlgebra( expression=I-(A+C), name="E" ) # # Algebra for expected Means Matrices in MZ & DZ twins MZmean <-mxAlgebra( expression= (A+C) %x% Y , name="expMeanMZ" ) DZmean <-mxAlgebra( expression= (.5%x%A+C) %x% Y , name="expMeanDZ" ) # Algebra for expected Variance/Covariance Matrices in MZ & DZ twins covMZ <-mxAlgebra( expression= rbind(cbind(V,Z), cbind(Z,V)) + rbind(cbind(D,Z), cbind(Z,D)) %*% vec2diag(Y) , name="expCovMZ" ) covDZ <-mxAlgebra( expression= rbind(cbind(V,Z), cbind(Z,V)) + rbind(cbind(D,Z), cbind(Z,D)) %*% vec2diag(Y) , name="expCovDZ" ) # Data objects for Multiple Groups dataMZ <-mxData( observed=mzData, type="raw" ) dataDZ <-mxData( observed=dzData, type="raw" ) # Objective objects for Multiple Groups objMZ <-mxFIMLObjective( covariance="expCovMZ", means="expMeanMZ", dimnames=useVars ) objDZ <-mxFIMLObjective( covariance="expCovDZ", means="expMeanDZ", dimnames=useVars ) # Combine Groups pars <-list(gra,grc,resv,difv, ident, null, gre) modelMZ <-mxModel( pars, defvars, MZmean, covMZ, dataMZ, objMZ, name="MZ" ) modelDZ <-mxModel( pars, defvars, DZmean, covDZ, dataDZ, objDZ, name="DZ" ) minus2ll <-mxAlgebra( expression=MZ.objective + DZ.objective, name="m2LL" ) obj <-mxAlgebraObjective( "m2LL" ) ci <-mxCI (c ('MZ.A[1,1]','MZ.C[1,1]','MZ.E[1,1]' ) ) AceModel <-mxModel("ACE", pars, modelMZ, modelDZ, minus2ll, obj, ci) # ------------------------------------------------------------------------------------------------------------------------------- # RUN Genetic Model AceFit <- mxRun(AceModel, intervals=T) (AceSumm <- summary(AceFit)) # OUTPUT specific parameters of interest of full model AceFit$ACE.A@values AceFit$ACE$A AceFit$ACE$C AceFit$ACE$E AceFit$ACE$MZ.Y AceFit$ACE$DZ.Y AceFit$ACE$MZ.expMeanMZ AceFit$ACE$DZ.expMeanDZ AceFit$ACE$MZ.expCovMZ AceFit$ACE$DZ.expCovDZ