# --------------------------------------------------------------------- # Program: TW-IntroToR-20100210.R # Author: Matt Keller & Steve Boker # Date: Wed Feb 10 12:02:11 EST 2010 # # WELCOME TO THE WONDERFUL WORLD OF R! # # This is an introductory script for learning the very basics # of R. This header is a comment. Any text after a hash mark (#) # is ignored by R until the next carriage return. So, if you # put a hash mark at the beginning of a line, the whole line # is ignored. # # This script is formatted for a text window that is 72 columns wide. # # As we go through this file, we encourage you to write your # own comments so that you can come back to this file and # remind yourself of what was happening. # # We encourage you to make these notes in all of the scripts # in the workshop. It will thus be easier to come back later # and modify them for your own use. # # --------------------------------------------------------------------- # Revision History # -- Wed Feb 10 12:02:11 EST 2010 # Created TW-IntroToR-20100210.R. # Note: I always put a revision history in all of my scripts! # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Variables (I always have a section here describing my variables) # --------------------------------------------------------------------- # # --------------------------------------------------------------------- # --------------------------------------------------------------------- # PART 1: USING R AS A CALCULATOR # --------------------------------------------------------------------- # Arithmetic with two numbers 2+2 2-4 2*3 8/3 # If you're like me, you don't remember order of operations 9+12*3 # So use parentheses to be explicit about what you want! (9+12)*3 # 3 squared 3^2 # 9 to the 1/2 power 9^.5 # That is the same as what is returned by the square root function sqrt(9) # A more complicated bit of algebra (sqrt(9)-6^2)/4 # --------------------------------------------------------------------- # PART 2: ASSIGNMENT AND OBJECT CREATION # --------------------------------------------------------------------- # Let's make a variable called Fred Fred <- 2+2 # Congrats! You've just made your first variable. # We say, "Fred gets two plus two". The arrow (less than, dash) # "assigns" 2+2 to Fred. You'll use the "gets arrow" all the time! # Type Fred on a line by itself and press return and you find out # what is contained in Fred Fred # Caps matter. There is no object "fred". Your first error message! fred # Error messages are our friends :) They give us tough love. # These are all equivalent Fred + 3 4 + 3 Fred+3 4 + 3 # white space doesn't matter between things, # but it does matter within t h i n g s F r e d # One variable can be assigned to another. a <- Fred # Now 'a' is a copy of Fred. a # We read the next line as "A gets square root of 'a' times five" A <- sqrt(a)*5 A # This one is a little tricky. We can re-assign A. A <- A*a A # We can use scientific notation x <- 1e3 x # Here, we create a vector of length 13; notice what ":" does x2 <- 1:13 x2 # The "*" does element-wise multiplication. x2*3 # Since there are fewer elements in "3" than in "x2", "3" is # repeated: it is multiplied by each element in "x2". # This makes scalar multiplication easy. # Vectors can also be created by the "c()" function. # "c" stands for "concatenate". myvec <- c(8,13,2,1,6) myvec # --------------------------------------------------------------------- # PART 3: LOGICALS # --------------------------------------------------------------------- # Remember our old friend Fred? # The variable Fred contains the numeric value 4 Fred # Is Fred _really_ a numeric value? is.numeric(Fred) # Does Fred equal four? Fred == 4 # Does Fred equal five? Fred == 5 # Is Fred greater than 3? Fred > 3 # Is Fred less than 3? Fred < 3 # Is Fred between 3 and 10? Fred > 3 & Fred < 10 # Is Fred either greater than 5 or equal to 2? Fred > 5 | Fred == 2 # Which elements of x2 are greater than 5? x2 > 5 # Let's check by printing out x2 x2 # --------------------------------------------------------------------- # PART 4: FUNCTIONS # --------------------------------------------------------------------- # R is a very *functional* language. # Most of what you do in R is use functions. # We've already seen a couple of functions. # Let's look more in depth: # sqrt() is a "function" and the "argument" is 16. sqrt(16) # Functions take arguments, do something, and then return something. # Every time you see a word followed by parentheses, # you are seeing a function. # help() is a function that prints the help page for a function help(sqrt) # This does the same thing as help(sqrt) ?sqrt # If you don't know the name of a function, but you know what it # does, you can search for it using help.search() help.search("kurtosis") # This is the formal way to use functions. # We can explicitly name the argument x. sqrt(x=16) # Absolute value abs(-7) # The function c() [concatenate] makes vectors c(3,-2,-3,6,1) # We can use functions as arguments to other functions! Fun! abs(c(3,-2,-3,6,1)) # Some functions require no input. # This prints to screen every object we've created so far ls() # This function creates a vector of 50 pseudo random numbers # drawn from a normal distribution with mean = 0 and sd = 1 rnorm(50, mean=0, sd=1.0) # Create *a different* vector of 50 values using rnorm dat <- rnorm(50, mean=0, sd=1.0) dat # How long is dat? length(dat) # Who dat? is.vector(dat) # What is the mean of dat? mean(dat) # What is the median of dat? median(dat) # What is the variance of dat? var(dat) # What is the standard deviation of dat? sd(dat) # Or, you could calculate standard deviation from the variance. sqrt(var(dat)) range(dat) min(dat) max(dat) round(dat, 3) # Now, let's create a matrix MAT <- matrix(1:10, nrow=5, ncol=2) # The matrix function is ?matrix # Create *a different* matrix MAT2 <- matrix(1:10, nrow=5, ncol=2, byrow=TRUE) # The dimensions of MAT are 5 rows, 2 columns dim(MAT) # We can also "bind" vectors together to make matrices. vec1 <- 1:5 vec2 <- vec1 * 2 MAT3 <- cbind(vec1, vec2) MAT3 # The dimensions of MAT3 are 5 rows, 2 columns MAT4 <- rbind(vec1, vec2) MAT4 # The dimensions of MAT4 are 2 rows, 5 columns # --------------------------------------------------------------------- # Part 5: SUBSETTING & INDEXING # --------------------------------------------------------------------- x2 # This selects the third element of x2 by using an index. x2[3] # Select elements 3 through 5. x2[3:5] # Select elements 1, 6, 9, & 2 in that order. x2[c(1,6,9,2)] myvec # Select the 2nd & 3rd elements of myvec. myvec[2:3] # You can select using logical vectors. myvec[c(TRUE,TRUE,FALSE,FALSE,TRUE)] # Here is another way to do the same thing! my.select <- c(TRUE,TRUE,FALSE,FALSE,TRUE) myvec[my.select] # We call my.select a "selection vector". # A selection vector is usually the same length as the target vector. # Select only elements of myvec that are greater than 5 my.select2 <- myvec > 5 myvec[my.select2] # To index matrices, rows go first, then a comma, then columns MAT[1:3,1:2] # Putting nothing before or after the comma means ALL rows or columns. MAT[2:3,] MAT[,1:2] # You can reorder rows or columns MAT[c(3,5,1),1:2] # If you select a single row or column, you obtain a VECTOR # Vectors are not matrices: no dimensions, just length MAT[,1] # Which elements of the 2nd column of MAT are over 8? my.select3 <- MAT[,2] > 8 # Select just the rows where column 2 is over 8 MAT[my.select3,] # As evolution teaches us, selection is a powerful mechanism! # --------------------------------------------------------------------- # PROBLEM SET 1 # Put your work directly into this script, below the q's # --------------------------------------------------------------------- # # a) Create a vector of 100 normally distributed random variables. # Assign it to "Y" # # b) Create another vector, "Z", of 100 normally distributed random # numbers with mean = 100 and the sd = 15. # HINT: See the help function if you get stuck! # # c) Create another variable, "Combo.dist", that is the sum of Y and Z # # d) Put the vector "Combo.dist" into a matrix with 20 rows and # 5 columns. # Do so such that the numbers are put in BY ROW. # Call the matrix "My.Mat" # # e) Get a new matrix that only has the rows of My.Mat where the # first column of My.Mat is less than 100. # # --------------------------------------------------------------------- # --------------------------------------------------------------------- # PART 6: READING AND WRITING DATA ON THE DISK # --------------------------------------------------------------------- # First, we need to set the Working Directory for R. # You can either set the Working Directory in the GUI or in a script. # # The Working Directory is where R will look if you read a file. # The Working Directory is where R will write if you write a file. # The Working Directory is where R can "save its state" at the # end of a session. # # I recommend you create a new folder for each of your projects. # That way you can save everything about a project together. # Also, that way you can "save state" of R and it only applies to # one project. # To display the Working Directory getwd() # If we wanted to change Working Directory on a PC, # we'd use something like this. setwd("C:/NoExist") # Note that this is a forward slash and not a backslash. # Or on a Mac or Linux machine we would use something like setwd("~/NoExist") # where the tilde stands for your home directory. # For this workshop session, we will set the Working Directory: setwd("C:/XXXXXXX/XXXXXXXX/XXXXXXXX") # To list all files in your working directory list.files() # IMPORTANT POINT: # Do you see the file named ExampleData1.csv? # If not, please try again and then raise your hand if you are # still having trouble finding ExampleData1.csv # To see all the arguments for read.csv() and other functions that # read in tables of data: ?read.csv # In this session we will use "csv" data files because they can # be easily saved from programs like SPSS, Excel, etc. There # are lots of options for reading in data files. For instance, help.search("spss") # This tells us that there is an R package called "foreign" that has # a function called read.spss(). # We can load that package by using the following line require(foreign) # We will use read.csv() for this workshop. # The csv format is the most general format for data files I know. # And you can even open it up in a text editor. # Now we will read our first data file. ExampleData1 <- read.csv(file="ExampleData1.csv", header=TRUE) # We have created a "data.frame" called ExampleData1. is.data.frame(ExampleData1) # I always ask for a summary() of a dataframe after reading it in. summary(ExampleData1) # summary() is like a sausage grinder. # You can put almost anything into summary() and sausage will come out. summary(x2) summary(rnorm(1000, mean=2, sd=1)) summary(MAT3) # But summary() doesn't tell me all the descriptive statistics # I want to know about ExampleData1 # Let's load a package called "psych" with functions useful # in psychological measurement. require(psych) # Now let's get some better descriptive statistics about ExampleData1 describe(ExampleData1) # --------------------------------------------------------------------- # PART 7: CORRELATIONS, LINEAR REGRESSION, AND SELECTION # --------------------------------------------------------------------- # These data are in a wide format. # That is to say, there is one twin pair per line. # But both MZ and DZ twins are in the same file. # Let's calculate an overall correlation matrix. cor(ExampleData1) # Hmm... maybe we don't want to see the first and second columns # So, I will just select the 3rd through 6th columns. cor(ExampleData1[,3:6]) # And I usually like to print correlations to only 3 decimal places. round(cor(ExampleData1[,3:6]), 3) # I would like to select only the MZ twins and only the DZ twins. theMZs <- ExampleData1$Zygosity=="MZ" theDZs <- ExampleData1$Zygosity=="DZ" # That was our first use of the "$" operator. # ExampleData1$Zygosity extracts the vector named "Zygosity" from # the data.frame named "ExampleData1". # Now let's print the correlation matrix for the MZ twins. round(cor(ExampleData1[theMZs,3:6]), 3) # And the correlation matrix for the DZ twins. round(cor(ExampleData1[theDZs,3:6]), 3) # Next, we will run a univariate regression on the whole data.frame. # We will predict Y1 from X1. lmOut1 <- lm(Y1~X1, data=ExampleData1) summary(lmOut1) # Finally, we predict Y1 from X1 separately for the MZ & DZ twin 1. lmOut2 <- lm(Y1~X1, data=ExampleData1[theMZs,]) summary(lmOut2) lmOut3 <- lm(Y1~X1, data=ExampleData1[theDZs,]) summary(lmOut3) # This is FAR from a complete analysis of these data. # In order to go further, we need to take into account the # relationship between twin 1 and twin 2 in each pair. # We will need to use Structural Equation Modeling. # --------------------------------------------------------------------- # PROBLEM SET 2 # Put your work directly into this script, below the q's # --------------------------------------------------------------------- # # a) Read in the data in ExampleData2.csv # # b) Select all rows with TwinID equal to 1 and Zygosity equal to "MZ" # # c) Calculate a correlation matrix for those twins. # # d) Now calculate a correlation matrix for TwinID equals 2 and # Zygosity equal to "MZ". # # e) Try running a linear model with Y being predicted by X for # only rows with TwinID 2 and Zygosity "DZ". # # ---------------------------------------------------------------------