##################################################### ### Google Analytics script: ### ### This script will create a bar graph averaging when ### during the week US visitors have come to your website ##################################################### # Written by Emily Coderre, PhD, 2015-03-16 ######################## # Getting your data # # From your google analytics account, go to: # audience --> location country --> US --> metro area --> # secondary dimension --> hour of day # show rows 5000 # **select days in batches so that <5000 visits occur (eg, if doing 6 # weeks, do first 4 days then export then change to days 5-8 # then export, so on) # Export data to excel, do a find and replace for the entire document and replace spaces with underscores, remove percent signs, also delete last row which has sums # Go back to ** above and copy and paste the next batch to the excel # spreadsheet # Save as text tab delimited (new metro output 1.txt) file in working directory ## Clear workspace ## rm(list=ls(all=TRUE)) ## Load packages ## library(date) library(gplots) ## Change to directory ## homedir = 'C:/Users/Tim/Dropbox/R analyses of google analytics data/2015-03-16/website' setwd(homedir) ## load the data ## alldata = read.table('new metro output 1_formatted.txt',header=TRUE) ############################################ ### Split up Hour_of_Day to separate columns for month, day, hour, and day of week daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday') for (r in 1:nrow(alldata)){ # split up date into characters hourofday = as.character(alldata$Hour_of_Day[r]) a=strsplit(hourofday,NULL) date=a[[1]] # put month, day, and hour into separate columns alldata$month[r] = paste(date[5:6],collapse = "") alldata$day[r] = paste(date[7:8],collapse = "") alldata$hour[r] = paste(date[9:10],collapse = "") # convert to Julian date given month, date, year jdate = mdy.date(as.numeric(as.character(alldata$month[r])),as.numeric(as.character(alldata$day[r])),2015) # tells you day of week: 1=Sunday, 7=Saturday weekday = date.mdy(jdate,weekday=TRUE)$weekday alldata$dayofweek[r] = daysofweek[weekday] } # save as new text file write.table(alldata,'alldata.txt',sep='\t') ################################# ## Load all data from file if not in memory already alldata = read.table('alldata.txt',header=TRUE) ## Adjust for timezones ## # Load timezone file timezones = read.table('time zones.txt',header=TRUE) colnames(timezones) = c('Metro_name','Timezone') daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday') adjdata = vector() # row loop for Metro names for (m in 1:nrow(timezones)){ name = as.character(timezones[m,1]) shift = timezones[m,2] # pull out data for that metro center zonedata = alldata[alldata$Metro == name,] if (nrow(zonedata) > 0){ zonedata$hour_adj = NA # row loop for all hours for (r in 1:nrow(zonedata)){ hour_adj = as.numeric(as.character(zonedata$hour[r])) + shift # if adjusted hour straddles midnight, change date to the day before if (hour_adj >= 0){ zonedata$hour_adj[r] = hour_adj } else{ zonedata[r,]$hour_adj = hour_adj + 24 # change hour zonedata[r,]$day = as.numeric(as.character(zonedata[r,]$day)) - 1 # change day # if day straddles month, change month and day if (zonedata[r,]$day == 0){ # change month zonedata[r,]$month = as.numeric(as.character(zonedata[r,]$month)) - 1 # change day if (zonedata[r,]$month == 2){numdays = 28} if (zonedata[r,]$month == 4 |zonedata[r,]$month == 6 |zonedata[r,]$month == 9 |zonedata[r,]$month == 11){ numdays = 30 } else { numdays = 31 } zonedata[r,]$day = numdays # change day of week dayposition = which(daysofweek == zonedata[r,]$dayofweek) # if day of week straddles Sunday if (dayposition - 1 == 0){dayposition = 8} newdayofweek = daysofweek[dayposition - 1] zonedata[r,]$dayofweek = newdayofweek } } } } adjdata = rbind(adjdata,zonedata) } ###### ## Calculate total number of visits for each site (over all weeks) metrocenters = levels(adjdata$Metro) allvisits = array(dim=c(length(metrocenters),2)) colnames(allvisits) = c('Metro','totalNumVisits') for (m in 1:length(metrocenters)){ city = metrocenters[m] allvisits[m,1] = city allvisits[m,2] = sum(adjdata[adjdata$Metro == city,]$Sessions) } allvisits = as.data.frame(allvisits) write.table(allvisits,'allvisitsperMetroCenter.txt',sep='\t') ######## ## Drop Yuma (straddles time zones) gooddata = na.exclude(as.data.frame(adjdata[adjdata$Metro != 'Yuma_AZ-El_Centro_CA',])) ## Take out the people who came on January 3 in California - got counted as January 4 on East Coast time but with time change actually came on 1/3 (before start of time period) ## change these values to match your data!!! ## gooddata = gooddata[!(gooddata$month==1 & gooddata$day==3),] ############################ ### Break up into weeks ### # Time period 1/4/2015 - 3/14/2015 (10 weeks) # First day was Jan 4 2015 # Last day was March 14, 2015 # Weeks will go Sunday:Saturday # Counter will start from the first day of the time period and count sequentially until the last day (so 10 weeks = 70 days) gooddata$datecounter = NA counter = 1 # month counter for (month in 1:12){ # February has 28 days (ignoring leap years for now) if (month == 2){ numdays = 28 } # Months with 30 days if (month == 4 | month == 6 | month == 9 | month == 11){ numdays = 30 } else { # Otherwise 31 days numdays = 31 } # date counter for (date in 1:numdays){ if (nrow(gooddata[gooddata$month == month & gooddata$day == date,]) > 0){ gooddata[gooddata$month == month & gooddata$day == date,]$datecounter = counter counter = counter+1 } } } ### Add in week counter # First day of week 1 = first day of the time period gooddata$weekcounter = NA numweeks = 10 # week loop for (week in 1:numweeks){ lastday = week * 7 firstday = lastday-6 for (d in firstday:lastday){ if (nrow(gooddata[gooddata$datecounter == d,])>0){ gooddata[gooddata$datecounter == d,]$weekcounter = week } } } ################ ### Plotting ### ################ # function to change orientation of matrix for graphing graphingtable <- function(data,nrow,ncol,rownames,colnames){ newtable = array(dim=c(nrow,ncol)) rownames(newtable) = rownames colnames(newtable) = colnames p=1 for (r in 1:nrow){ for (c in 1:ncol){ newtable[r,c] = data[p] p=p+1 } } return(newtable) } ###################################################### ### Analyze percentage visitors in 6 4-hour blocks ### # make giant week x day x hourblock matrix numweeks = 10 avdata = array(dim=c(numweeks,7,6)) # week loop for (week in 1:numweeks){ weekdata = gooddata[gooddata$weekcounter == week,] # total number of visits for that week weektotal = sum(weekdata$Sessions) # day of week loop daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday') for (d in 1:length(daysofweek)){ day = daysofweek[d] daydata = weekdata[weekdata$dayofweek == day,] ### Break hours into blocks # 4-hour blocks block1 = 0:3 block2 = 4:7 block3 = 8:11 block4 = 12:15 block5 = 16:19 block6 = 20:23 # block loop for (b in 1:6){ block = paste('block',b,sep='') blockhours = eval(parse(text=block)) hours = daydata$hour_adj == blockhours[1] | daydata$hour_adj == blockhours[2] | daydata$hour_adj == blockhours[3] | daydata$hour_adj == blockhours[4] hourblockdata = daydata[hours,] # sum of visits in that hour block hourblock_visits = sum(hourblockdata$Sessions) # get percentage of visits for that hour compared to entire week hourblock_percentage = (hourblock_visits/weektotal)*100 # put hourblock percentage into giant matrix avdata[week,d,b] = hourblock_percentage } } } ### Get average, SE and CI boundaries for all weeks ### # For CI from t-distribution, use function qt(0.975, df=n-1), where 0.975 is equivalent to p<0.025 (2-tailed) # For CI from normal distribution, use function qnorm(0.975) = 1.96, assumes normal distribution with mean = 0 and sd=1 weekavs = array(dim=c(7,6)) # means weeksems = array(dim=c(7,6)) # standard errors weekcis = array(dim=c(7,6)) # error for confidence intervals (95%) numweeks = 10 # day loop for (d in 1:7){ # hourblock loop for (b in 1:6){ weekavs[d,b] = mean(avdata[,d,b]) # average over weeks weeksems[d,b] = sd(avdata[,d,b])/sqrt(numweeks) # SE over weeks weekcis[d,b] = qt(0.975,df=numweeks-1) * (sd(avdata[,d,b])/sqrt(numweeks)) # error level } } ### Plot 6-hour blocks for each day over entire week ### data = graphingtable(weekavs,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek) sems = graphingtable(weeksems,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek) CIs = graphingtable(weekcis,6,7,c('12am - 4am','4am - 8am','8am - 12pm','12pm - 4pm','4pm - 8pm','8pm - 12am'),daysofweek) ciUpper = data + CIs ciLower = data - CIs #colors = c('white','gray88','gray72','gray48','gray30','gray16') colors = c('gray90','gray75','gray60','gray45','gray30','gray15') barplot2(data,beside=TRUE,legend=TRUE,ylab = 'Percent of sessions',xlab = 'Day of week',ylim = c(0,6),plot.ci=TRUE,ci.u = ciUpper,ci.l = ciLower,col=colors) ########################################### ### Analyze percentage visitors per day ### numweeks = 10 # make week x day matrix avdata = array(dim=c(numweeks,7)) # week loop for (week in 1:numweeks){ weekdata = gooddata[gooddata$weekcounter == week,] # total number of visits for that week weektotal = sum(weekdata$Sessions) # day of week loop daysofweek = c('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday') for (d in 1:length(daysofweek)){ day = daysofweek[d] daydata = weekdata[weekdata$dayofweek == day,] # sum of visits in that day day_visits = sum(daydata$Sessions) # get percentage of visits for that day compared to entire week day_percentage = (day_visits/weektotal)*100 # put hourblock percentage into giant matrix avdata[week,d] = day_percentage } } ### Get average and SE over all weeks weekavs = array(dim=c(1,7)) weeksems = array(dim=c(1,7)) weekcis = array(dim=c(1,7)) # day loop for (d in 1:7){ weekavs[1,d] = mean(avdata[,d]) # average over weeks weeksems[1,d] = sd(avdata[,d])/sqrt(numweeks) # SE over weeks weekcis[1,d] = qt(0.975,df=numweeks-1) * (sd(avdata[,d])/sqrt(numweeks)) # error level } ### Plot each day over entire week ### colnames(weekavs) = daysofweek ciUpper = weekavs + weekcis ciLower = weekavs - weekcis barplot2(weekavs,beside=TRUE,legend=TRUE,ylab = 'Percent of sessions',xlab = 'Day of week',ylim = c(0,20),plot.ci=TRUE,ci.u = ciUpper,ci.l = ciLower,col='gray')