#####################################################
### Google Analytics script:
###
### This script will create a bar graph averaging when
### during the week US visitors have come to your website
#####################################################
# Written by Emily Coderre, PhD, 2015-03-16
########################
# Getting your data
#
# From your google analytics account, go to:
# audience --> location country --> US --> metro area -->
# secondary dimension --> hour of day
# show rows 5000
# **select days in batches so that <5000 visits occur (eg, if doing 6
# weeks, do first 4 days then export then change to days 5-8
# then export, so on)
# Export data to excel, do a find and replace for the entire document and replace spaces with underscores, remove percent signs, also delete last row which has sums
# Go back to ** above and copy and paste the next batch to the excel
# spreadsheet
# Save as text tab delimited (new metro output 1.txt) file in working directory
## Clear workspace ##
rm(list=ls(all=TRUE))
## Load packages ##
library(date)
library(gplots)
## Change to directory ##
homedir = 'C:/Users/Tim/Dropbox/R analyses of google analytics data/2015-03-16/website'
setwd(homedir)
## load the data ##
alldata = read.table('new metro output 1_formatted.txt',header=TRUE)
############################################
### Split up Hour_of_Day to separate columns for month, day, hour, and day of week
daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')
for (r in 1:nrow(alldata)){
# split up date into characters
hourofday = as.character(alldata$Hour_of_Day[r])
a=strsplit(hourofday,NULL)
date=a[[1]]
# put month, day, and hour into separate columns
alldata$month[r] = paste(date[5:6],collapse = "")
alldata$day[r] = paste(date[7:8],collapse = "")
alldata$hour[r] = paste(date[9:10],collapse = "")
# convert to Julian date given month, date, year
jdate = mdy.date(as.numeric(as.character(alldata$month[r])),as.numeric(as.character(alldata$day[r])),2015)
# tells you day of week: 1=Sunday, 7=Saturday
weekday = date.mdy(jdate,weekday=TRUE)$weekday
alldata$dayofweek[r] = daysofweek[weekday]
}
# save as new text file
write.table(alldata,'alldata.txt',sep='\t')
#################################
## Load all data from file if not in memory already
alldata = read.table('alldata.txt',header=TRUE)
## Adjust for timezones ##
# Load timezone file
timezones = read.table('time zones.txt',header=TRUE)
colnames(timezones) = c('Metro_name','Timezone')
daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')
adjdata = vector()
# row loop for Metro names
for (m in 1:nrow(timezones)){
name = as.character(timezones[m,1])
shift = timezones[m,2]
# pull out data for that metro center
zonedata = alldata[alldata$Metro == name,]
if (nrow(zonedata) > 0){
zonedata$hour_adj = NA
# row loop for all hours
for (r in 1:nrow(zonedata)){
hour_adj = as.numeric(as.character(zonedata$hour[r])) + shift
# if adjusted hour straddles midnight, change date to the day before
if (hour_adj >= 0){
zonedata$hour_adj[r] = hour_adj
} else{
zonedata[r,]$hour_adj = hour_adj + 24 # change hour
zonedata[r,]$day = as.numeric(as.character(zonedata[r,]$day)) - 1 # change day
# if day straddles month, change month and day
if (zonedata[r,]$day == 0){
# change month
zonedata[r,]$month = as.numeric(as.character(zonedata[r,]$month)) - 1
# change day
if (zonedata[r,]$month == 2){numdays = 28}
if (zonedata[r,]$month == 4 |zonedata[r,]$month == 6 |zonedata[r,]$month == 9 |zonedata[r,]$month == 11){
numdays = 30
} else {
numdays = 31
}
zonedata[r,]$day = numdays
# change day of week
dayposition = which(daysofweek == zonedata[r,]$dayofweek)
# if day of week straddles Sunday
if (dayposition - 1 == 0){dayposition = 8}
newdayofweek = daysofweek[dayposition - 1]
zonedata[r,]$dayofweek = newdayofweek
}
}
}
}
adjdata = rbind(adjdata,zonedata)
}
######
## Calculate total number of visits for each site (over all weeks)
metrocenters = levels(adjdata$Metro)
allvisits = array(dim=c(length(metrocenters),2))
colnames(allvisits) = c('Metro','totalNumVisits')
for (m in 1:length(metrocenters)){
city = metrocenters[m]
allvisits[m,1] = city
allvisits[m,2] = sum(adjdata[adjdata$Metro == city,]$Sessions)
}
allvisits = as.data.frame(allvisits)
write.table(allvisits,'allvisitsperMetroCenter.txt',sep='\t')
########
## Drop Yuma (straddles time zones)
gooddata = na.exclude(as.data.frame(adjdata[adjdata$Metro != 'Yuma_AZ-El_Centro_CA',]))
## Take out the people who came on January 3 in California - got counted as January 4 on East Coast time but with time change actually came on 1/3 (before start of time period)
## change these values to match your data!!! ##
gooddata = gooddata[!(gooddata$month==1 & gooddata$day==3),]
############################
### Break up into weeks ###
# Time period 1/4/2015 - 3/14/2015 (10 weeks)
# First day was Jan 4 2015
# Last day was March 14, 2015
# Weeks will go Sunday:Saturday
# Counter will start from the first day of the time period and count sequentially until the last day (so 10 weeks = 70 days)
gooddata$datecounter = NA
counter = 1
# month counter
for (month in 1:12){
# February has 28 days (ignoring leap years for now)
if (month == 2){
numdays = 28
}
# Months with 30 days
if (month == 4 | month == 6 | month == 9 | month == 11){
numdays = 30
} else { # Otherwise 31 days
numdays = 31
}
# date counter
for (date in 1:numdays){
if (nrow(gooddata[gooddata$month == month & gooddata$day == date,]) > 0){
gooddata[gooddata$month == month & gooddata$day == date,]$datecounter = counter
counter = counter+1
}
}
}
### Add in week counter
# First day of week 1 = first day of the time period
gooddata$weekcounter = NA
numweeks = 10
# week loop
for (week in 1:numweeks){
lastday = week * 7
firstday = lastday-6
for (d in firstday:lastday){
if (nrow(gooddata[gooddata$datecounter == d,])>0){
gooddata[gooddata$datecounter == d,]$weekcounter = week
}
}
}
################
### Plotting ###
################
# function to change orientation of matrix for graphing
graphingtable <- function(data,nrow,ncol,rownames,colnames){
newtable = array(dim=c(nrow,ncol))
rownames(newtable) = rownames
colnames(newtable) = colnames
p=1
for (r in 1:nrow){
for (c in 1:ncol){
newtable[r,c] = data[p]
p=p+1
}
}
return(newtable)
}
######################################################
### Analyze percentage visitors in 6 4-hour blocks ###
# make giant week x day x hourblock matrix
numweeks = 10
avdata = array(dim=c(numweeks,7,6))
# week loop
for (week in 1:numweeks){
weekdata = gooddata[gooddata$weekcounter == week,]
# total number of visits for that week
weektotal = sum(weekdata$Sessions)
# day of week loop
daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')
for (d in 1:length(daysofweek)){
day = daysofweek[d]
daydata = weekdata[weekdata$dayofweek == day,]
### Break hours into blocks
# 4-hour blocks
block1 = 0:3
block2 = 4:7
block3 = 8:11
block4 = 12:15
block5 = 16:19
block6 = 20:23
# block loop
for (b in 1:6){
block = paste('block',b,sep='')
blockhours = eval(parse(text=block))
hours = daydata$hour_adj == blockhours[1] | daydata$hour_adj == blockhours[2] | daydata$hour_adj == blockhours[3] | daydata$hour_adj == blockhours[4]
hourblockdata = daydata[hours,]
# sum of visits in that hour block
hourblock_visits = sum(hourblockdata$Sessions)
# get percentage of visits for that hour compared to entire week
hourblock_percentage = (hourblock_visits/weektotal)*100
# put hourblock percentage into giant matrix
avdata[week,d,b] = hourblock_percentage
}
}
}
### Get average, SE and CI boundaries for all weeks ###
# For CI from t-distribution, use function qt(0.975, df=n-1), where 0.975 is equivalent to p<0.025 (2-tailed)
# For CI from normal distribution, use function qnorm(0.975) = 1.96, assumes normal distribution with mean = 0 and sd=1
weekavs = array(dim=c(7,6)) # means
weeksems = array(dim=c(7,6)) # standard errors
weekcis = array(dim=c(7,6)) # error for confidence intervals (95%)
numweeks = 10
# day loop
for (d in 1:7){
# hourblock loop
for (b in 1:6){
weekavs[d,b] = mean(avdata[,d,b]) # average over weeks
weeksems[d,b] = sd(avdata[,d,b])/sqrt(numweeks) # SE over weeks
weekcis[d,b] = qt(0.975,df=numweeks-1) * (sd(avdata[,d,b])/sqrt(numweeks)) # error level
}
}
### Plot 6-hour blocks for each day over entire week ###
data = graphingtable(weekavs,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek)
sems = graphingtable(weeksems,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek)
CIs = graphingtable(weekcis,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek)
ciUpper = data + CIs
ciLower = data - CIs
#colors = c('white','gray88','gray72','gray48','gray30','gray16')
colors = c('gray90','gray75','gray60','gray45','gray30','gray15')
barplot2(data,beside=TRUE,legend=TRUE,ylab = 'Percent of sessions',xlab = 'Day of week',ylim = c(0,6),plot.ci=TRUE,ci.u = ciUpper,ci.l = ciLower,col=colors)
###########################################
### Analyze percentage visitors per day ###
numweeks = 10
# make week x day matrix
avdata = array(dim=c(numweeks,7))
# week loop
for (week in 1:numweeks){
weekdata = gooddata[gooddata$weekcounter == week,]
# total number of visits for that week
weektotal = sum(weekdata$Sessions)
# day of week loop
daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')
for (d in 1:length(daysofweek)){
day = daysofweek[d]
daydata = weekdata[weekdata$dayofweek == day,]
# sum of visits in that day
day_visits = sum(daydata$Sessions)
# get percentage of visits for that day compared to entire week
day_percentage = (day_visits/weektotal)*100
# put hourblock percentage into giant matrix
avdata[week,d] = day_percentage
}
}
### Get average and SE over all weeks
weekavs = array(dim=c(1,7))
weeksems = array(dim=c(1,7))
weekcis = array(dim=c(1,7))
# day loop
for (d in 1:7){
weekavs[1,d] = mean(avdata[,d]) # average over weeks
weeksems[1,d] = sd(avdata[,d])/sqrt(numweeks) # SE over weeks
weekcis[1,d] = qt(0.975,df=numweeks-1) * (sd(avdata[,d])/sqrt(numweeks)) # error level
}
### Plot each day over entire week ###
colnames(weekavs) = daysofweek
ciUpper = weekavs + weekcis
ciLower = weekavs - weekcis
barplot2(weekavs,beside=TRUE,legend=TRUE,ylab = 'Percent of sessions',xlab = 'Day of week',ylim = c(0,20),plot.ci=TRUE,ci.u = ciUpper,ci.l = ciLower,col='gray')