The goal of this analysis is to test whether I can predict the number of dreams I’ll remember and how happy those dreams will be using data collected in self-tracking apps. Specifically, I’ll use data from the SleepCycle app and dreams I wrote down in my phone’s notepad.

First dataset - Dreams
Data wrangling

#Set working directory
#NOTE: every file name used below (dreams.txt, dreams.csv, ...) is resolved relative to this folder
setwd("~/R_Datasets/Quantself")

#Load libraries
library(tidyr)
library(dplyr)
library(lubridate)
options(lubridate.verbose = FALSE)
library(ggplot2)
#Read dreams text file to a table and separate every new line into a new row
#other methods include the readr package, read_file function or stringr package, readChar function
#comment.char = "" turns off read.table's default "#" comment handling, which would otherwise
#silently cut off any dream line at the first "#" character
dreams <- read.table("dreams.txt", sep = "\n", quote = "", comment.char = "",
                     strip.white = TRUE, blank.lines.skip = TRUE)

#Split dates and dreams into separate columns using a regular expression
#'separate' is part of the tidyr package
#The pattern matches a leading month/day such as 1/5, 1/10 or 10/20. The digit class has to be
#[0-9]: with [1-9] a date like 1/10 is only matched up to "1/1" and 10/5 is not matched at all.
#remove = FALSE keeps the raw line (V1) next to the split columns for the manual clean-up step.
dfdreams <- separate(dreams, V1, c("Dream", "Date"), sep = "^[0-9]+/[0-9]+", remove = FALSE, fill = "right")

#finish in excel - fill down using this method https://exceljet.net/tips/how-to-quickly-fill-in-missing-data
write.csv(dfdreams, "dreams.csv")

#Read data back in
setwd("~/R_Datasets/Quantself")
dreams <- read.csv("dreams2.csv")

#Fix variable formats and derive helper columns in a single pass:
# Dream       - plain character text (needed for the sentiment analysis)
# Date        - parsed from month/day/year text
# Date_simple - "month/day" key used later to merge the two datasets
# nchar       - number of characters in each dream
dreams <- dreams %>%
  mutate(Dream = as.character(Dream),
         Date = mdy(Date),
         Date_simple = paste(month(Date), day(Date), sep = "/"),
         nchar = nchar(Dream))

#Last step - score every dream with the NRC lexicon and attach the scores as extra columns
library(syuzhet)
library(ggplot2)
sentiments <- get_nrc_sentiment(dreams$Dream)
dreamsplus <- cbind(dreams, sentiments)
glimpse(dreamsplus) #I started a new note to write down dreams at the start of 2016
## Observations: 201
## Variables: 14
## $ Date         (time) 2016-01-01, 2016-01-01, 2016-01-02, 2016-01-02, ...
## $ Dream        (chr) "At a dinner with some soccer and lax girls and t...
## $ Date_simple  (chr) "1/1", "1/1", "1/2", "1/2", "1/2", "1/3", "1/4", ...
## $ nchar        (int) 198, 272, 142, 80, 19, 175, 225, 174, 164, 78, 29...
## $ anger        (dbl) 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0...
## $ anticipation (dbl) 1, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 3, 0, 1, 1...
## $ disgust      (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0...
## $ fear         (dbl) 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0...
## $ joy          (dbl) 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0...
## $ sadness      (dbl) 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0...
## $ surprise     (dbl) 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0...
## $ trust        (dbl) 1, 0, 0, 1, 0, 2, 0, 1, 1, 0, 1, 0, 0, 2, 0, 0, 1...
## $ negative     (dbl) 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0...
## $ positive     (dbl) 3, 1, 0, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0, 2, 2, 0, 1...

Understanding through visualization

#Two candidate dependent variables (proxies for dream recall), one row per date:
# count - how many dreams were written down
# nchar - total characters written down
dreamcount <- summarize(group_by(dreamsplus, Date),
                        count = n(),
                        nchar = sum(nchar))

#Bars stack the per-date counts by weekday; the fill shows how much text was written on that date
ggplot(dreamcount,
       aes(x = wday(Date, label = TRUE, abbr = TRUE), y = count, fill = nchar)) +
  geom_col() +
  labs(x = "",
       y = "Number of Dreams",
       title = "How Does Dream Recall Vary by Weekday?") +
  scale_fill_continuous(trans = "reverse")

I remember the most dreams on Monday nights. Thursday, Friday, and Saturday nights have the lowest dream counts. This is especially telling since my data goes from January 1st 2016 (a Friday night) to last night (a Saturday night), so you would expect a slight boost to those weekdays. The number of characters I write down on a given morning ranges from 0 to ~1600. Long dreams appear pretty spread out throughout the week.

#Examine sentiment totals: one row per NRC category with its total across all dreams
sentiment_sums <- colSums(sentiments)
sentimentTotals <- data.frame(count = sentiment_sums,
                              sentiment = names(sentiment_sums),
                              stringsAsFactors = FALSE)
arrange(sentimentTotals, desc(count))
##    count    sentiment
## 1    219     positive
## 2    135        trust
## 3    126     negative
## 4    121 anticipation
## 5    101          joy
## 6     77         fear
## 7     69     surprise
## 8     63      sadness
## 9     49        anger
## 10    41      disgust
#Bars are ordered by total so the least common sentiment is on the left
ggplot(sentimentTotals,
       aes(x = reorder(sentiment, count), y = count, fill = sentiment)) +
  geom_col() +
  labs(y = "Count", title = "What Emotions Are My Dreams Made Up Of?") +
  theme(axis.title.x = element_blank())

‘Positive’ is by far the most common sentiment picked up by the syuzhet package. After a quick spot check against the dream text itself, it appears to be assigned quite liberally, so I’ll use ‘Joy’ as the proxy for a good dream instead.

Second dataset - Sleep
Data wrangling

##Data prep - Sleep Cycle
#Read sleep data. To export the csv file from the app, go to Settings -> Advanced -> Database
sleep <- read.csv("sleepdata.csv", sep = ";")
glimpse(sleep) #I haven't been using sleep cycle as long as I've been recording dreams
## Observations: 146
## Variables: 8
## $ Start            (fctr) 2016-01-19 23:33:50, 2016-01-21 00:00:32, 20...
## $ End              (fctr) 2016-01-20 06:12:08, 2016-01-21 07:08:09, 20...
## $ Sleep.quality    (fctr) 78%, 65%, 63%, 78%, 81%, 73%, 82%, 70%, 70%,...
## $ Time.in.bed      (fctr) 6:38, 7:07, 7:17, 7:03, 8:08, 7:38, 7:28, 6:...
## $ Wake.up          (lgl) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Sleep.Notes      (fctr) , , , , , , , , , , , , , , , , , , , , , , ...
## $ Heart.rate       (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Activity..steps. (int) 14825, 7793, 10790, 15560, 4314, 4561, 10576,...
#Clean everything up in one pipeline:
# Sleep.quality - strip the percentage sign and convert to numeric
# Start / End   - text to date-time
# Date          - End minus one day; sometimes I go to sleep after midnight, so this is the
#                 most accurate date to match up with dream date
# Date_simple   - "month/day" key shared with the dream data
# Time.in.bed   - "h:mm" to decimal hours i.e. 7.5 for 7 hours 30 minutes
#then rename columns for ease of use and remove unused columns
sleep <- sleep %>%
  mutate(Sleep.quality = as.numeric(sub("%", "", Sleep.quality)),
         Start = ymd_hms(Start),
         End = ymd_hms(End),
         Date = End - days(1),
         Date_simple = paste(month(Date), day(Date), sep = "/"),
         Time.in.bed = as.numeric(as.duration(hm(Time.in.bed))) / 3600) %>%
  rename(Duration = Time.in.bed,
         Quality = Sleep.quality,
         Activity = Activity..steps.) %>%
  select(-Wake.up, -Sleep.Notes)
glimpse(sleep)
## Observations: 146
## Variables: 8
## $ Start       (time) 2016-01-19 23:33:50, 2016-01-21 00:00:32, 2016-01...
## $ End         (time) 2016-01-20 06:12:08, 2016-01-21 07:08:09, 2016-01...
## $ Quality     (dbl) 78, 65, 63, 78, 81, 73, 82, 70, 70, 79, 62, 83, 78...
## $ Duration    (dbl) 6.633333, 7.116667, 7.283333, 7.050000, 8.133333, ...
## $ Heart.rate  (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Activity    (int) 14825, 7793, 10790, 15560, 4314, 4561, 10576, 8579...
## $ Date        (time) 2016-01-19 06:12:08, 2016-01-20 07:08:09, 2016-01...
## $ Date_simple (chr) "1/19", "1/20", "1/21", "1/22", "1/23", "1/24", "1...
#Exclude naps (only take observations greater than 2 hours)
sleep <- sleep %>% filter(Duration > 2)

Understanding through visualization

#Two small summary tables for the weekly schedule heatmaps: one row per weekday / hour-of-day
#combination with the number of nights that fall into it
#gotosleep is keyed on when the sleep session started
gotosleep <- sleep %>%
  mutate(weekday = wday(Start, label = TRUE),
         hour = hour(Start)) %>%
  group_by(weekday, hour) %>%
  summarize(count = n())

#wakeup is keyed on when the sleep session ended
wakeup <- sleep %>%
  mutate(weekday2 = wday(End, label = TRUE),
         hour2 = hour(End)) %>%
  group_by(weekday2, hour2) %>%
  summarize(count2 = n())

#create heatmaps (the y axis is reversed so hours read top to bottom)
ggstart <- ggplot(gotosleep, aes(x = weekday, y = hour, fill = count)) +
  geom_tile() +
  labs(title = "Going To Bed At Night", x = "Weekday", y = "Hour") +
  scale_fill_continuous(trans = "reverse") +
  scale_y_reverse()

ggend <- ggplot(wakeup, aes(x = weekday2, y = hour2, fill = count2)) +
  geom_tile() +
  labs(title = "Getting Up In The Morning", x = "Weekday", y = "Hour") +
  scale_fill_continuous(low = "yellow", high = "lightgrey", trans = "reverse") +
  scale_y_reverse()

#put the plots on top of each other (wake-up plot first)
library(gridExtra)
grid.arrange(ggend, ggstart)

The time I go to sleep varies more than the time I wake up, as evidenced by the different legends. This makes sense - I am a morning person and wake up early even when I go to sleep late, often followed by a nap in the afternoon.

Merging the data

#No dreams - nights in the sleep dataset whose date has no match in the dream dataset get a
#count, nchar, and joy of 0, so that I can use one table for analysis.
#Membership is tested with %in% and the zeros are written explicitly. match() is not used here
#because it returns the row position of the first match, which is neither a count nor a score
#and would leave misleading Count/nchar/Joy columns on the sleep table.
nodreams <- sleep %>%
  filter(!(Date_simple %in% dreams$Date_simple)) %>%
  mutate(Count = 0L, nchar = 0L, Joy = 0L) %>%
  select(Date_simple, Date, Quality, Duration, Heart.rate, Count, nchar, Joy)

#ndreams - Summarize the number of dreams and total characters for dates that appear in both datasets
#Only the sleep columns that are actually needed go into the merge, so the only clashing name is
#Date (suffixed to Date.x for dreams and Date.y for sleep).
#NOTE(review): Date_simple carries no year, so the same month/day from different years would be
#merged together - confirm the data stays within a single year.
#NOTE(review): if the sleep export contains more than one row for the same Date_simple, every
#dream of that date is repeated per sleep row and Count/nchar are inflated - verify the export.
sleep_nights <- select(sleep, Date_simple, Date, Quality, Duration, Heart.rate)
ndreams <- merge(dreamsplus, sleep_nights, by = "Date_simple") %>%
  filter(!is.na(Duration)) %>%
  group_by(Date_simple) %>%
  summarize(Date = mean(Date.y),
            Quality = mean(Quality),
            Duration = mean(Duration),
            Heart.rate = mean(Heart.rate),
            Count = n(),          #number of dreams recorded for the date
            nchar = sum(nchar),   #total characters written for the date
            Joy = sum(joy))       #total NRC 'joy' score for the date

#combine the subsets
data <- bind_rows(nodreams, ndreams)

#Add date-related variables and categorical Joy for final dataset
data$DayofWeek <- wday(data$Date)
data$DayofMonth <- day(data$Date)
data$Joy <- ifelse(data$Joy > 0, 1, 0) #1 = at least one joyful word that night, 0 = none

glimpse(data)
## Observations: 132
## Variables: 10
## $ Date_simple (chr) "1/21", "1/28", "1/28", "2/3", "2/5", "2/9", "2/10...
## $ Date        (time) 2016-01-21 06:37:49, 2016-01-28 06:07:21, 2016-01...
## $ Quality     (dbl) 63, 70, 70, 90, 69, 74, 64, 75, 70, 70, 75, 78, 51...
## $ Duration    (dbl) 7.283333, 6.233333, 6.233333, 8.266667, 5.816667, ...
## $ Heart.rate  (dbl) NA, NA, NA, 61, 75, 55, 66, 51, NA, NA, 46, 84, 96...
## $ Count       (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nchar       (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Joy         (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DayofWeek   (dbl) 5, 5, 5, 4, 6, 3, 4, 5, 6, 6, 2, 3, 4, 3, 4, 7, 1,...
## $ DayofMonth  (int) 21, 28, 28, 3, 5, 9, 10, 11, 12, 12, 15, 16, 17, 2...

Understanding through visualization - Correlation Plot

library(corrplot)

#Keep only nights with at least one recorded dream, then drop the key/date columns and Heart.rate
corrdata <- data %>%
  filter(Count > 0) %>%
  select(-Date_simple, -Date, -Heart.rate)

#Pairwise correlations of the remaining columns, clustered so related variables sit together
correlations <- cor(corrdata)
corrplot(correlations,
         order = "hclust",
         bg = "black",
         title = "How Are All of These Variables Correlated?",
         mar = c(2, 2, 2, 2))