library(tidyverse)
library(lubridate)
library(treemap)
library(scales) # % format in plot
#devtools::install_github("timelyportfolio/d3treeR")
library(d3treeR)
d1 <- read.csv("accidents_2005_to_2007.csv")%>%select(Accident_Severity,Number_of_Vehicles:Day_of_Week,Weather_Conditions:Year,Time)
d2 <- read.csv("accidents_2009_to_2011.csv")%>%select(Accident_Severity,Number_of_Vehicles:Day_of_Week,Weather_Conditions:Year,Time)
d3 <- read.csv("accidents_2012_to_2014.csv")%>%select(Accident_Severity,Number_of_Vehicles:Day_of_Week,Weather_Conditions:Year,Time)
data <- bind_rows(d1,d2,d3)
#Label
data$Day_of_Week=factor(data$Day_of_Week,levels=1:7,labels=c("Mon","Tue","Wed","Thu","Fri","Sat","Sun"))
# remove missing outcome info
data%>%rename(Police=Did_Police_Officer_Attend_Scene_of_Accident)%>%filter(Police!="")->data1
# fix format
data1$Year=as.character(data1$Year)
# hour categories
data1$hour=hour(hm(data1$Time))
data1$hour_cat <- cut(data1$hour,breaks = c(0, 3,6,9, 12,15, 18,21, 24), include.lowest = TRUE,labels=c("Midnight-3am","3-6am","6-9am","9-noon","noon-3pm","3-6pm","6-9pm","9-Midnight"))
# subset of data
data1%>%select(Year,Police) ->subdata
## get counts convert to percentage
subdata%>%filter(Police=="Yes")%>%group_by(Year)%>%summarise(show_yr=n())->shown_yr
subdata%>%filter(Police=="No")%>%group_by(Year)%>%summarise(noshow_yr=n())->noshown_yr
yrdata <- bind_cols(shown_yr,noshown_yr)%>%select(Year,show_yr,noshow_yr)
# calculate percentage:
yrdata%>%mutate(Yes_ratio=show_yr/(show_yr+noshow_yr))%>%mutate(No_ratio=1-Yes_ratio)->yrdata
yrdata%>%select(-show_yr,-noshow_yr)%>%gather("Police","Ratio",-Year)->yrdata2
yrdata2$Police=gsub("_ratio","",yrdata2$Police)
p2 <- ggplot(data=yrdata2,aes(x=Police,y=Ratio,fill=Police))+geom_bar(stat="identity",position="dodge")+facet_wrap(~Year) + xlab("")
##For line graphs, the data points must be grouped so that it knows which points to connect. In this case, it is simple – all points should be connected, so group=1. When more variables are used and multiple lines are drawn, the grouping for lines is usually done by variable.
ggplot(data=yrdata2%>%filter(Police=="Yes"),aes(x=as.factor(Year),y=Ratio,group=1))+geom_point(colour="red",size=5)+geom_line(colour="orange",size=2)+ylim(0.75,0.9)+xlab("Year")+ylab("% of Police Attendance")+theme_bw()+ggtitle("Police attendance rate over time") ->bar_grph
data1%>%select(Police,Day_of_Week)->data1a
ggplot(aes(x = Day_of_Week), data = data1a) +
geom_bar(stat = 'count', position = 'stack',aes(fill=Police)) +
xlab('Day of the Week') +
ylab('Police attendance total N') +
ggtitle('Number of Police Attendences by Day of the Week ') +
theme_bw()
## convert to percentage
data1a%>%filter(Police=="Yes")%>%group_by(Day_of_Week)%>%summarise(show=n())->shown
data1a%>%filter(Police=="No")%>%group_by(Day_of_Week)%>%summarise(noshow=n())->noshown
daydata <- bind_cols(shown,noshown)%>%select(Day_of_Week,show,noshow)%>%gather("Police","N",2:3)
#library--> scale
ggplot(daydata,aes(x = Day_of_Week, y = N,fill = Police)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format())+xlab("Day of the Week")+ylab("")+ggtitle("Attendance Rate by Day of the Week")+theme_bw()
Treemap: Treemaps are ideal for displaying large amounts of hierarchically structured (tree-structured) data. The space in the visualization is split up into rectangles that are sized and ordered by a quantitative variable.
data1%>%filter(Police=="Yes")%>%group_by(Day_of_Week,hour_cat)%>%summarise(showbyhour=n())->show_time
treemap(show_time, #Your data frame object
index=c("hour_cat","Day_of_Week"), #A list of your categorical variables
vSize="showbyhour", #This is your quantitative variable
type="index", #Type sets the organization and color scheme of your treemap
palette = "Set3", #Select your color palette from the RColorBrewer presets or make your own.
fontsize.labels=c(15,8),
title='Number of Police Attendances by Time & Day of the Week',
fontcolor.labels=c("white","black"),
fontface.labels=c(4,1),
bg.labels=c("transparent"),
align.labels=list(c("center", "center"), c("left", "bottom")),
overlap.labels=1,
inflate.labels=F)
data1%>%group_by(Day_of_Week,hour_cat)%>%summarise(accidentbyhour=n())->dataaccident
treemap(dataaccident, #Your data frame object
index=c("hour_cat","Day_of_Week"), #A list of your categorical variables
vSize="accidentbyhour", #This is your quantitative variable
type="index", #Type sets the organization and color scheme of your treemap
palette = "Set3", #Select your color palette from the RColorBrewer presets or make your own.
fontsize.labels=c(15,8),
title='Number of Accidents by Time & Day of the Week',
fontcolor.labels=c("white","black"),
fontface.labels=c(4,1),
bg.labels=c("transparent"),
align.labels=list(c("center", "center"), c("left", "bottom")),
overlap.labels=1,
inflate.labels=F)->tree_accident
# "rootname" becomes the title of the plot
inter=d3tree2( tree_accident , rootname = "Overall" )
inter