R introduction using the tidyverse

Bernd Klaus

November 29, 2016

What are we going to learn?

  1. Starting point: Raw data in various formats
  2. Import, reshape the data into required formats
  3. Perform computations on your data, plot it

Our first tool: R Markdown

Text formatting

*italic*  or _italic_
**bold**   __bold__

Headings

# 1st Level Header

## 2nd Level Header

Lists

*   Bulleted list item 1

*   Item 2

1.  Numbered list item 1

1.  Item 2. The numbers are incremented automatically in the output.
<http://example.com>

[linked phrase](http://example.com)

![optional caption text](path/to/img.png)

Tables

First Header  | Second Header
------------- | -------------
Content Cell  | Content Cell
Content Cell  | Content Cell

Simple arithmetic and vectors

x <- 6
y <- 4
z <- x + y
z
   [1] 10
x <- c(7.5, 8.2, 3.1, 5.6, 8.2)
head(x)
   [1] 7.5 8.2 3.1 5.6 8.2
x[c(1, 2, 4)]
   [1] 7.5 8.2 5.6
x[-(1:3)]
   [1] 5.6 8.2

Matrices in R

x <- c(5, 7, 9)
y <- c(6, 3, 4)
z <- cbind(x, y)
z
        x y
   [1,] 5 6
   [2,] 7 3
   [3,] 9 4
dim(z)
   [1] 3 2
z[c(1,2), ]
        x y
   [1,] 5 6
   [2,] 7 3
z[, -1]
   [1] 6 3 4
z[2, ]
   x y 
   7 3

Data frames (tibbles) and lists

pat <- read_csv("http://www-huber.embl.de/users/klaus/BasicR/Patients.csv")
   Parsed with column specification:
   cols(
     PatientId = col_character(),
     Height = col_double(),
     Weight = col_double(),
     Gender = col_character()
   )
pat
   # A tibble: 3 × 4
     PatientId Height Weight Gender
         <chr>  <dbl>  <dbl>  <chr>
   1        P1   1.65     75      f
   2        P2   1.90     NA      m
   3        P3   1.60     50      f

Accessing data in data frames

pat_tiny <- filter(pat, Height < 1.7)
select(pat_tiny, PatientId,  Height, Gender)
   # A tibble: 2 × 3
     PatientId Height Gender
         <chr>  <dbl>  <chr>
   1        P1   1.65      f
   2        P3   1.60      f

There are a couple of operators useful for comparisons:

pat[2, c("PatientId", "Height")]
   # A tibble: 1 × 2
     PatientId Height
         <chr>  <dbl>
   1        P2    1.9
pat[2, c(1, 2)]
   # A tibble: 1 × 2
     PatientId Height
         <chr>  <dbl>
   1        P2    1.9

Vectors with arbitrary contents: Lists

# A list may hold elements of different types and lengths, including other
# lists; here the last element has no name.
L <- list(one = 1, two = c(1, 2), five = seq(1, 4, length.out = 5),
          list(string = "Hello World"))
L
   $one
   [1] 1
   
   $two
   [1] 1 2
   
   $five
   [1] 1.00 1.75 2.50 3.25 4.00
   
   [[4]]
   [[4]]$string
   [1] "Hello World"

http://r4ds.had.co.nz/vectors.html#visualising-lists

names(L)
   [1] "one"  "two"  "five" ""
L$five + 10
   [1] 11.0 11.8 12.5 13.2 14.0
L[[3]] + 10
   [1] 11.0 11.8 12.5 13.2 14.0
L[["two"]]
   [1] 1 2

=> Data frames (tibbles) are lists of columns, so their columns can be accessed in the same way:

pat$Height
   [1] 1.65 1.90 1.60
pat[[2]]
   [1] 1.65 1.90 1.60
pat[["Gender"]]
   [1] "f" "m" "f"

Summary: data access in R

We prepare a simple vector to illustrate the access options again:

sample_vector <- c("Alice" = 5.4, "Bob" = 3.7, "Claire" = 8.8)
sample_vector
    Alice    Bob Claire 
      5.4    3.7    8.8

Access by index

sample_vector[1:2]
   Alice   Bob 
     5.4   3.7
sample_vector[-(1:2)]
   Claire 
      8.8

Access by boolean

sample_vector[c(TRUE, FALSE, TRUE)]
    Alice Claire 
      5.4    8.8
sample_vector[sample_vector < 6]
   Alice   Bob 
     5.4   3.7

Access by name

If names are present, such as column names (note that row names are not preserved in the tidyverse), you can access by name as well:

sample_vector[c("Alice", "Claire")]
    Alice Claire 
      5.4    8.8

Applying a function to elements of a data structure

load(url("http://www-huber.embl.de/users/klaus/BasicR/bodyfat.rda"))
bodyfat <- as_tibble(bodyfat)
bodyfat
   # A tibble: 252 × 15
      density percent.fat   age weight height neck.circum chest.circum
        <dbl>       <dbl> <int>  <dbl>  <dbl>       <dbl>        <dbl>
   1     1.07        12.3    23    154   67.8        36.2         93.1
   2     1.09         6.1    22    173   72.2        38.5         93.6
   3     1.04        25.3    22    154   66.2        34.0         95.8
   4     1.08        10.4    26    185   72.2        37.4        101.8
   5     1.03        28.7    24    184   71.2        34.4         97.3
   6     1.05        20.9    24    210   74.8        39.0        104.5
   7     1.05        19.2    26    181   69.8        36.4        105.1
   8     1.07        12.4    25    176   72.5        37.8         99.6
   9     1.09         4.1    25    191   74.0        38.1        100.9
   10    1.07        11.7    23    198   73.5        42.1         99.6
   # ... with 242 more rows, and 8 more variables: abdomen.circum <dbl>,
   #   hip.circum <dbl>, thigh.circum <dbl>, knee.circum <dbl>,
   #   ankle.circum <dbl>, bicep.circum <dbl>, forearm.circum <dbl>,
   #   wrist.circum <dbl>
head(map_dbl(bodyfat, mean))
       density percent.fat         age      weight      height neck.circum 
          1.06       19.15       44.88      178.92       70.15       37.99

Custom functions

# Schematic template for defining a function: replace the placeholder name,
# arguments, and body with your own. An argument given a default value
# (here `optional_argument`) may be omitted by the caller.
function_name <- function(argument_1, argument_2, 
                          optional_argument = default_value ) 
{
  # the value passed to return() becomes the result of the function call
  return(...)
}
#' Robust z-score of a numeric vector
#'
#' Centers `x` on its median and scales it by its median absolute deviation
#' as computed by `mad()`.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the median
#'   and the MAD? Defaults to `FALSE`, in which case any `NA` in `x` makes
#'   the whole result `NA`.
#' @return A numeric vector the same length as `x`.
robust_z <- function(x, na.rm = FALSE) {
  (x - median(x, na.rm = na.rm)) / mad(x, na.rm = na.rm)
}

map_df(bodyfat, robust_z)
   # A tibble: 252 × 15
      density percent.fat   age  weight  height neck.circum chest.circum
        <dbl>       <dbl> <dbl>   <dbl>   <dbl>       <dbl>        <dbl>
   1    0.763      -0.745 -1.69 -0.7746 -0.7588     -0.7588     -0.78193
   2    1.459      -1.414 -1.77 -0.1131  0.7588      0.2108     -0.72224
   3   -0.648       0.658 -1.77 -0.7833 -1.2647     -1.6862     -0.45961
   4    0.970      -0.950 -1.43  0.2872  0.7588     -0.2529      0.25666
   5   -1.003       1.025 -1.60  0.2698  0.4216     -1.5176     -0.28054
   6   -0.226       0.183 -1.60  1.1749  1.6019      0.4216      0.57899
   7    0.000       0.000 -1.43  0.1567 -0.0843     -0.6745      0.65061
   8    0.744      -0.734 -1.52 -0.0174  0.8431     -0.0843     -0.00597
   9    1.685      -1.630 -1.52  0.5048  1.3490      0.0422      0.14922
   10   0.831      -0.809 -1.69  0.7572  1.1804      1.7284     -0.00597
   # ... with 242 more rows, and 8 more variables: abdomen.circum <dbl>,
   #   hip.circum <dbl>, thigh.circum <dbl>, knee.circum <dbl>,
   #   ankle.circum <dbl>, bicep.circum <dbl>, forearm.circum <dbl>,
   #   wrist.circum <dbl>
map_df(bodyfat, ~  (.x - median(.x)) / mad(.x))
   # A tibble: 252 × 15
      density percent.fat   age  weight  height neck.circum chest.circum
        <dbl>       <dbl> <dbl>   <dbl>   <dbl>       <dbl>        <dbl>
   1    0.763      -0.745 -1.69 -0.7746 -0.7588     -0.7588     -0.78193
   2    1.459      -1.414 -1.77 -0.1131  0.7588      0.2108     -0.72224
   3   -0.648       0.658 -1.77 -0.7833 -1.2647     -1.6862     -0.45961
   4    0.970      -0.950 -1.43  0.2872  0.7588     -0.2529      0.25666
   5   -1.003       1.025 -1.60  0.2698  0.4216     -1.5176     -0.28054
   6   -0.226       0.183 -1.60  1.1749  1.6019      0.4216      0.57899
   7    0.000       0.000 -1.43  0.1567 -0.0843     -0.6745      0.65061
   8    0.744      -0.734 -1.52 -0.0174  0.8431     -0.0843     -0.00597
   9    1.685      -1.630 -1.52  0.5048  1.3490      0.0422      0.14922
   10   0.831      -0.809 -1.69  0.7572  1.1804      1.7284     -0.00597
   # ... with 242 more rows, and 8 more variables: abdomen.circum <dbl>,
   #   hip.circum <dbl>, thigh.circum <dbl>, knee.circum <dbl>,
   #   ankle.circum <dbl>, bicep.circum <dbl>, forearm.circum <dbl>,
   #   wrist.circum <dbl>

Transforming variables

pb_to_kg <- 1/2.2046
inch_to_m <- 0.0254

bodyfat <- mutate(bodyfat, height_m = height * inch_to_m,  
                           weight_kg = weight * pb_to_kg)

select(bodyfat, height,  height_m, weight, weight_kg)
   # A tibble: 252 × 4
      height height_m weight weight_kg
       <dbl>    <dbl>  <dbl>     <dbl>
   1    67.8     1.72    154      70.0
   2    72.2     1.84    173      78.6
   3    66.2     1.68    154      69.9
   4    72.2     1.84    185      83.8
   5    71.2     1.81    184      83.6
   6    74.8     1.90    210      95.4
   7    69.8     1.77    181      82.1
   8    72.5     1.84    176      79.8
   9    74.0     1.88    191      86.6
   10   73.5     1.87    198      89.9
   # ... with 242 more rows

Simple plotting in R: “qplot” of ggplot2

qplot(x, y = NULL, ..., data, facets = NULL, 
  xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
  xlab = deparse(substitute(x)), ylab = deparse(substitute(y)))

The arguments are:

A qplot example using the bodyfat data

bodyfat <- mutate(bodyfat, weight_binned = cut(weight_kg, 
                            breaks = c(0, 50, 75, 100, 175)))

qplot(abdomen.circum, percent.fat, 
      color = weight_binned, data = bodyfat) 

Mountain View

qplot(abdomen.circum, percent.fat, 
      color = weight_binned, data = bodyfat, 
      facets = ~weight_binned) 

Mountain View

Programming statements

R offers the typical options for flow control known from many other languages.

# `if` is an expression in R, so the value of the chosen branch can be
# assigned directly.
w <- 3
d <- if (w < 5) {
  2
} else {
  10
}
d
   [1] 2
h <- seq(from = 1, to = 8)
# Preallocate the result so the vector is not regrown on every iteration
s <- numeric(length(h))
# seq_along(h) ties the loop bounds to h instead of a hard-coded 1:8
for (i in seq_along(h)) {
  s[i] <- h[i] * 10
}
s
   [1] 10 20 30 40 50 60 70 80

Note, however, that you should typically use a map function for this purpose, as this leads to more readable code:

map_dbl(h, ~.x * 10)
   [1] 10 20 30 40 50 60 70 80