# Genetic correlation: two ways, two different estimates

I implemented two ways of calculating the genetic correlation between two phenotypes, estimated from twins (direct model of rg between A1 and A2, and Cholesky).

My two implementations are based on the path diagrams from H.H. Maes' slides 18 and 53 found at

https://vipbg.vcu.edu/media/course/HGEN619_2015/BivariateGeneticAnalysis2.pdf

I do not get the same estimates between the two methods (-0.26 vs 0.51). Below is the code with the reproducible fake data and results. The code is also attached.

If someone could help figuring out why the two methods differ, of help me find a bug in my code, I would appreciate. Many thanks for looking into this.

In the notation below, indices 1 and 2 refer to phenotype 1 and phenotype 2; indices x and y refer to twin 1 and twin 2.

Mathieu

####

```
```library(OpenMx)

#######

# creating a reproducible data set

library( mvtnorm )

set.seed(12345)

twincov<- 0.30

trcov<- 0.1

N<-200

sigmz<-matrix( c( 1, twincov, trcov, trcov,

twincov, 1, trcov, trcov,

trcov, trcov, 1, twincov,

trcov, trcov, twincov,1 ), ncol=4 )

sigdz <-matrix( c( 1, twincov/2, trcov, trcov,

twincov/2, 1, trcov, trcov,

trcov, trcov, 1, twincov/2,

trcov, trcov, twincov/2,1 ), ncol=4 )

tmz <- as.data.frame( rmvnorm(N , c(0,0,0,0) , sigmz ) )

names(tmz)<- c("phen1x", "phen1y", "phen2x", "phen2y")

tdz <- as.data.frame(rmvnorm(N , c(0,0,0,0) , sigdz ))

names(tdz)<- c("phen1x", "phen1y", "phen2x", "phen2y")

dataMZ <- mxData( observed=tmz , type="raw" )

dataDZ <- mxData( observed=tdz , type="raw" )

##############

# defining variables and path

# the following are common to the two methods of calculating rg

# and will be re-used

phenVars<- c("phen1x", "phen1y" ,"phen2x", "phen2y" )

# pheno 1

aceVars1<- c("A1x","C1x","E1x","A1y","C1y","E1y")

# pheno 2

aceVars2<- c("A2x","C2x","E2x","A2y","C2y","E2y")

latentVariances <- mxPath( from=c( aceVars1, aceVars2 ), arrows=2,

free=FALSE, values=1 )

# means of latent variables

latentMeans <- mxPath( from="one", to=c( aceVars1, aceVars2 ) , arrows=1,

free=FALSE, values=0 )

# means of observed variables

obsMeans1 <- mxPath( from="one", to=c("phen1x", "phen1y" ) , arrows=1,

free=TRUE, values=0.01 , labels="mean1" )

obsMeans2 <- mxPath( from="one", to=c("phen2x", "phen2y" ) , arrows=1,

free=TRUE, values=0.02 , labels="mean2" )

# path coefficients for twin 1, phen 1

path1x <- mxPath( from=c( "A1x","C1x","E1x" ), to="phen1x", arrows=1,

free=TRUE, values=c( .12,.22,.32) , label=c( "a1","c1","e1") )

# path coefficients for twin 2, phen 1

path1y <- mxPath( from=c( "A1y","C1y","E1y" ), to="phen1y", arrows=1,

free=TRUE, values=c( .12,.22,.32) , label=c( "a1","c1","e1") )

# path coefficients for twin 1, phen 2

path2x <- mxPath( from=c( "A2x","C2x","E2x" ), to="phen2x", arrows=1,

free=TRUE, values=c( .18,.28,.38) , label=c( "a2","c2","e2") )

# path coefficients for twin 2, phen 2

path2y <- mxPath( from=c( "A2y","C2y","E2y" ), to="phen2y", arrows=1,

free=TRUE, values= c( .18,.28,.38) , label=c( "a2","c2","e2") )

# covariance between C1x and C2y, etc

covC1 <- mxPath( from="C1x", to="C1y", arrows=2,

free=FALSE , values=1 )

covC2 <- mxPath( from="C2x", to="C2y", arrows=2,

free=FALSE , values=1 )

# cov between A1x, A1y etc . index x is twin 1 , y in twin 2

covA1mz <- mxPath( from="A1x", to="A1y", arrows=2,

free=FALSE, values=1 )

covA2mz <- mxPath( from="A2x", to="A2y", arrows=2,

free=FALSE, values=1 )

covA1dz <- mxPath( from="A1x", to="A1y", arrows=2,

free=FALSE, values=.5 )

covA2dz <- mxPath( from="A2x", to="A2y", arrows=2,

free=FALSE, values=.5 )

commonPaths <- list( latentVariances, latentMeans,obsMeans1, obsMeans2,

path1x, path1y, path2x, path2y , covC1, covC2 )

# END of paths common to both methods

######################################

########

# method one: directly modeling the genetic correlation between A1 and A2

# index 1 refers to phenotype 1, 2 to phenotype 2

# x refers to twin 1, y to twin 2

pathRGx <- mxPath( from="A1x", to="A2x" , arrows=2 , free=TRUE , value= .5, label="rg" )

pathRGy <- mxPath( from="A1y", to="A2y" , arrows=2 , free=TRUE , value= .5, label="rg" )

# solution seems highly dependent on starting values (see below).

# 0.5 was chosen as starting value because

# that's close to the estimate of method 2 below

# confidence intervals

ci.rg <- mxCI("rg")

# Defining the models the paths

paths.rg <- list( pathRGx, pathRGy )

modelMZ.rg <- mxModel(model="MZrg", type="RAM", manifestVars=phenVars,

latentVars=c(aceVars1, aceVars2 ) , commonPaths, paths.rg , covA1mz, covA2mz, dataMZ , ci.rg )

modelDZ.rg <- mxModel(model="DZrg", type="RAM", manifestVars=phenVars,

latentVars=c(aceVars1, aceVars2 ) , commonPaths, paths.rg , covA1dz, covA2dz, dataDZ )

multi.rg <- mxFitFunctionMultigroup(c("MZrg","DZrg" ) )

model.rg <- mxModel(model="ACErg", modelMZ.rg, modelDZ.rg, multi.rg )

# check identification

print( mxCheckIdentification( model.rg ) )

# the model is identified

# Run Model

fit.rg <- mxRun( model.rg , intervals=T )

print(fit.rg$output$confidenceIntervals )

# lbound estimate ubound

#rg NA -0.2639032 NA

# Note that the estimate is far from method 2 (cholesky, below). The sign is also different.

# Also, why can't the bounds of the CI be calculated?

# If "rg" gets a starting value of 0.25 instead of .5, we get

# an estimate out of the intuitive range: (???)

# lbound estimate ubound

#rg NA 12.27812 NA

########

# method 2 : cholesky

# from A1 to phen2

path21x<- mxPath( from="A1x", to="phen2x" , arrows=1 , free=TRUE , value=.15, label="a21" )

path21y<- mxPath( from="A1y", to="phen2y" , arrows=1 , free=TRUE , value=.15, label="a21" )

# confidence intervals

cimat.cho <- mxMatrix(type="Full",nrow=3,ncol=1,free=T ,values=c( .12,.18,.15 ) ,

labels=c( "a1","a2","a21" ),name="aaa")

cialg.cho <- mxAlgebra( aaa[1,1]*aaa[3,1]/sqrt( aaa[1,1]^2 * ( aaa[3,1]^2+aaa[2,1]^2 ) ), name="rg",

dimnames=list( NULL ,NULL ) )

ci.cho <- mxCI("rg")

# Defining the models the paths

paths.cho <- list( path21x, path21y )

modelMZ.cho <- mxModel(model="MZcho", type="RAM", manifestVars=phenVars,

latentVars=c(aceVars1, aceVars2 ) , commonPaths, paths.cho , covA1mz, covA2mz, dataMZ ,

cimat.cho,cialg.cho, ci.cho )

modelDZ.cho <- mxModel(model="DZcho", type="RAM", manifestVars=phenVars,

latentVars=c(aceVars1, aceVars2 ) , commonPaths, paths.cho , covA1dz, covA2dz, dataDZ )

multi.cho <- mxFitFunctionMultigroup(c("MZcho","DZcho" ) )

model.cho <- mxModel(model="ACEcho", modelMZ.cho, modelDZ.cho, multi.cho )

# check identification

print( mxCheckIdentification( model.cho ) )

# the model is indentified

# Run Model

fit.cho <- mxRun( model.cho , intervals=T )

print(fit.cho$output$confidenceIntervals )

`# lbound estimate ubound`

#MZcho.rg[1,1] 0.1515944 0.5169708 1

#

# Different estimate from above.

## bound the correlation

pathRGx <- mxPath( from="A1x", to="A2x" , arrows=2 , free=TRUE , value= .25, label="rg", lbound=-1, ubound=1 )

pathRGy <- mxPath( from="A1y", to="A2y" , arrows=2 , free=TRUE , value= .25, label="rg", lbound=-1, ubound=1 )

If I do that, I get the same point estimate with start values of 0.25 as with 0.5.

As for the rest of your post, I'll have to give it some more thought. I'm rather surprised that `fit.rg` has a worse -2logL than `fit.cho`. If it were the other way around, I'd have a clearer suspicion about what's going on.

BTW, you can see more information about the estimated confidence limits (such as why they're being reported as `NA`) if you pass `verbose=TRUE` to `summary()`, e.g. `summary(fit.rg,verbose=TRUE)`.

EDIT: Hang on. You're using the symbol "rg" to refer to two different things. In `model.rg`, 'rg' is the correlation between A1 and A2. But in `model.cho`, 'rg' is the genetic correlation between the two phenotypes. Those aren't the same thing.

Log in or register to post comments

## Thank you for your comments.

Course found at

https://vipbg.vcu.edu/media/course/HGEN619_2015/BivariateGeneticAnalysis2.pdf

Log in or register to post comments

## slide 53

Log in or register to post comments

In reply to slide 53 by lemire

## Hmm, yeah. I think you're

Log in or register to post comments

## This is an interesting case.

First, I should point out that in `model.rg`, the sign of 'rg' is not identified. Assuming you've bounded 'rg' to [-1,1], you can see what I mean if you do `summary(fit.rg,verbose=T)`, specifically:

CI details:

parameter side value fit diagnostic statusCode method a1 c1 e1 a2 c2 e2 rg mean1 mean2

1 rg lower -1 4507.172 active box constraint OK neale-miller-1997 -0.5443192 2.241322e-06 0.8267003 0.1700324 0.4037318 0.9087478 -1 -0.01174902 0.03194610

2 rg upper 1 4507.172 success OK neale-miller-1997 -0.5029672 2.557402e-01 0.8482156 -0.2021780 0.4658332 0.8661036 1 -0.02967843 0.04124539

The point estimate of 'rg' is negative, but at the solution for the upper confidence limit, 'rg' is positive but the sign of 'a2' has flipped to negative, thus keeping the genetic covariance positive. In `model.cho`, 'rg' doesn't have this problem. You can get saner CI results for the genetic correlation if you define the MZ model as:

modelMZ.rg <- mxModel(

model="MZrg", type="RAM", manifestVars=phenVars,

mxAlgebra(rbind(

cbind(a1^2,a1*rg*a2),

cbind(a1*rg*a2,a2^2)

),name="covA"),

mxAlgebra(cov2cor(covA),name="corA"),

mxCI("corA[1,2]"),

latentVars=c(aceVars1, aceVars2 ) , commonPaths, paths.rg , covA1mz, covA2mz, dataMZ , ci.rg )

Compare:

confidence intervals:

lbound estimate ubound note

MZrg.corA[1,2] -0.2228341 0.2639025 0.9999999

rg NA -0.2639025 1.0000000 !!!

Second, the Cholesky parameterization ensures that the covariance matrix of the latent variables remains positive definite. You can see that with `eigen(fit.cho$MZcho$expectation$UnfilteredExpCov[5:16,5:16])`. There are no negative eigenvalues, though the matrix is nearly singular. `model.rg` is not subject to that implicit constraint; see `eigen(fit.rg$MZrg$expectation$UnfilteredExpCov[5:16,5:16])`. The fact that `model.rg` lacks an implicit constraint which `model.cho` has is why I was surprised that `fit.rg` has the worse -2logL.

Log in or register to post comments

In reply to This is an interesting case. by AdminRobK

## .26 or .51?

It now makes intuitive sense to me that the sign of rg for correlated latent variables is not well defined.

So back to my original question, which estimate should I report? You pointed that the log-likelihood is smaller for cholesky, does that mean the other method got stuck in a local minimum? Are the log-lik comparable anyway? (at least because the number of parameters is the same)? If so, I can base my decision on that (hence cholesky)?

Log in or register to post comments

In reply to .26 or .51? by lemire

## report Cholesky

Perhaps, but I tried running it with `mxTryHard()` with `exhaustive=TRUE` yesterday, and didn't find a better solution.

Yeah, on those grounds, I would report the result from the Cholesky-parameterized model.

Log in or register to post comments

## impossible covariation

In addition the DZ correlations are only modified in two cells, which generates a weird all-C no-C pattern between the cells. Guessing this is not a positive definite target expected covariance matrix.

Try the built-in twinData (say wt and ht) and you'll see the equivalence of correlated factors with a correlation via cholesky paths.

A great paper on this topic of equivalent model transformations is the inimitable John Loehlin (1996).

Loehlin, J. C. (1996). The Cholesky approach: A cautionary note. Behavior Genetics, 26(1), 65-69.

http://ibg.colorado.edu/cdrom2012/demoor/MultivariateCholesky/Papers/Loehlin_BG_1996.pdf

Log in or register to post comments

In reply to impossible covariation by tbates

## data generation

Good catch, Tim.

Log in or register to post comments

In reply to impossible covariation by tbates

## my problem was based on real data

I first saw the problem of the different estimates using our real data that we are not willing to share publicly. I merely simulated data to get a reproducible, testable framework for my post.

Also, my code seems consistent with my choice of indices (1 and 2 for phenotypes, x and y for twins). So does the correlation matrices, unless there's something I am missing.

cor(tmz)

phen1x phen1y phen2x phen2y

phen1x 1.00000000 0.35936636 0.07024392 0.06686935

phen1y 0.35936636 1.00000000 0.10982890 0.07460252

phen2x 0.07024392 0.10982890 1.00000000 0.24500391

phen2y 0.06686935 0.07460252 0.24500391 1.00000000

cor(tdz)

phen1x phen1y phen2x phen2y

phen1x 1.0000000 0.15550815 0.1511272 0.14248904

phen1y 0.1555081 1.00000000 0.1119594 0.06931447

phen2x 0.1511272 0.11195939 1.0000000 0.22150121

phen2y 0.1424890 0.06931447 0.2215012 1.00000000

Log in or register to post comments

In reply to my problem was based on real data by lemire

## Hmm

Log in or register to post comments

## enlightening

Until next time,

Mathieu

Log in or register to post comments

In reply to enlightening by lemire

## You're welcome. Glad to be

I still think there's something weird going on in your dataset, though. How large is the sample?

Log in or register to post comments

In reply to You're welcome. Glad to be by AdminRobK

## sample size

> cor( mzData )

phen1x phen1y phen2x phen2y

phen1x 1.0000000 0.7675690 0.177232 0.2401964

phen1y 0.7675690 1.0000000 0.156969 0.2297162

phen2x 0.1772320 0.1569690 1.000000 0.4758010

phen2y 0.2401964 0.2297162 0.475801 1.0000000

>

> cor( dzData )

phen1x phen1y phen2x phen2y

phen1x 1.00000000 0.1336467 0.2357775 -0.01569437

phen1y 0.13364668 1.0000000 0.0265778 0.11114247

phen2x 0.23577754 0.0265778 1.0000000 0.27126851

phen2y -0.01569437 0.1111425 0.2712685 1.00000000

Log in or register to post comments