-
Notifications
You must be signed in to change notification settings - Fork 0
/
Doctors in ER.R
168 lines (137 loc) · 7.1 KB
/
Doctors in ER.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
library(dplyr)
library(lattice)
library(ggplot2)
library(MASS)
library(DHARMa)
library(lmtest)
library(AER)
library(countreg)
library(magrittr)
# To clarify, some of the code here was not included. In reality, I did so many more tests and checks, but if I were to include them here it would be too much
# Step 1: Checking everything: summaries, outliers, zero inflation, simple plots between variables
compdat <- read.delim("compdat.txt")
compdat <- compdat[!duplicated(compdat[, -1]), ]
compdat <- na.omit(compdat)
summary(compdat)
compdat[compdat$complaints > 5,1:6]
compdat[compdat$visits>2271 & compdat$hours>1469 & compdat$complaints > 5,1:6]
zeros<compdat[compdat$complaints==0,1:6]
sd(compdat$visits)>sd(compdat$revenue)
sd(compdat$complaints)>sd(compdat$hours)
#factoring categorical variables
compdat$residency<- as.factor(compdat$residency)
compdat$gender<-as.factor(compdat$gender)
count(compdat, 'residency')
count(compdat, 'gender')
#checking for outliers and doing simple graphs with and without interactions
plot(compdat$complaints ~ compdat$visits)
plot(compdat$complaints ~ compdat$residency)
plot(compdat$complaints ~ compdat$gender)
plot(compdat$complaints ~ compdat$hours)
plot(compdat$complaints ~ compdat$revenue)
c.lm <- lm(complaints~visits+revenue+hours+gender+residency, compdat)
c.lm2g <-lm(complaints~visits+revenue+hours+gender,compdat)
c.lm2r <- lm(complaints~visits+revenue+hours+residency,compdat)
c.lm3g <- lm(complaints~(visits+revenue+hours)*gender,compdat)
c.lm3r <- lm(complaints~(visits+revenue+hours)*residency,compdat)
c.lm3rg <- lm(complaints~(visits+revenue+hours)*residency+gender,compdat)
c.lm3gr <- lm(complaints~(visits+revenue+hours)*gender+residency,compdat)
par(mfrow=c(1,1))
plot(c.lm)
histogram(~complaints | gender, compdat, breaks = 0:18 - 0.5,
main = "Complaints")
#did not remove outliers, for after further inspection,
#it became obvious that the data is zero inflated and that it would make the already limited dataset less reliable if I were to remove a value that was not 0
#Step 2: Checking for potential models such as poisson, quasi and negative binomial. Dispersion as well
#GLM Poisson model(not yet zero inflated)
compdat.p <- glm(complaints~visits+revenue+hours+gender+residency | visits+revenue+hours+gender+residency,
data = compdat,
family = poisson)
summary(compdat.p)
compdat.p$deviance/compdat$p$df.residual
#GLM Quasipoisson model (not yet zero inflated, there won't be one for quasi)
compdat.q <- glm(complaints~visits+revenue+hours+gender+residency | visits+revenue+hours+gender+residency,
data = compdat,
family = quasipoisson)
compdat.q$deviance/compdat.q$df.residual
summary(compdat.q)
#GLM Negative binomial model (not yet zero inflated)
compdat.nb <- glm.nb(complaints~visits+revenue+hours+gender+residency | visits+revenue+hours+gender+residency,data = compdat, control = glm.control(maxit = 50))
compdat.nb$deviance/compdat.nb$df.residual
summary(compdat.nb)
#Executing the codes above, dispersion was detected using the deviance to residual value.
#We can see that both the poisson models are overdispersed, and since we can't do much with quasi (no AIC, etc),
#we assume that quasi is not a suitable model for this data.
dispersiontest(compdat.p)
nb.dharm <- simulateResiduals(compdat.nb, refit=T, n=99)
plotSimulatedResiduals(compdat.nb)
testOverdispersion(nb.dharm)
rootogram(compdat.p)
box()
rootogram(compdat.nb)
box()
AIC(compdat.p,compdat.nb)
#Comparing the AIC's of the poisson and negative binomial models, it shows that the negative binomial model has a lower AIC,
#thus suggesting that negative binomial is a much better model than the poisson.
#Step 3: Zero inflation models, with and without interactions. Doing tests such as likelihood tests and AIC tests to see which model is the best fit
#Since it is zero-inflated, it is only fair we go for zero-inflated models of Poisson ad Negative Binomial
#the codes included here are only the ones that were used in the final report.
compdat.zip1 <- zeroinfl(complaints~revenue+gender+residency | visits+revenue+hours+gender+residency,
dist = 'poisson', data = compdat)
summary(compdat.zip1)
compdat.zip2 <- zeroinfl(complaints~visits+revenue+hours| gender,
dist = 'poisson', data = compdat)
compdat.zinb <- zeroinfl(complaints~hours+revenue+gender+residency| revenue+gender+residency,
dist = 'negbin', data = compdat)
summary(compdat.zinb)
AIC(compdat.p,compdat.nb,compdat.zip1,compdat.zinb)
rootogram(compdat.zip1)
box()
rootogram(compdat.zinb)
box()
#Step 4: Reducing the model to get the final model
compdat.zinb1 <- zeroinfl(complaints~visits+revenue+hours | gender,
dist = 'negbin', data = compdat)
summary(compdat.zinb1)
rootogram(compdat.zinb1)
box()
compdat.zinb6 <- zeroinfl(complaints~hours*residency+gender:hours+residency:gender| visits+gender+visits:hender,
dist = 'negbin', data = compdat)
summary(compdat.zinb6)
rootogram(compdat.zinb6)
box()
compdat.zinb7 <- zeroinfl(complaints~hours*residency+gender*residency| visits*gender,
dist = 'negbin', data = compdat)
options(warnings = -1)
par(mfrow = c(1,2))
plot(residuals(compdat.zinb1) ~ fitted.values(compdat.zinb1,
type = "pearson"),
xlab = "Fitted values", ylab = "Pearson Residuals",
main = "With Interactions")
plot(residuals(compdat.zinb1, type = "pearson"),
ylab = "Pearson residuals")
par(mfrow=c(1,1))
options(warnings = 0)
plot(residuals(compdat.zinb7) ~ fitted.values(compdat.zinb7,
type = "pearson"),
xlab = "Fitted values", ylab = "Pearson Residuals",
main = "With Interactions")
plot(residuals(compdat.zinb7, type = "pearson"),
ylab = "Pearson residuals")
par(mfrow=c(1,1))
options(warnings = 0)
anova(compdat.zinb7, compdat.zinb1, test = 'LRT')
anova <- aov (complaints~ hours*residency, data = compdat)
summary(anova)
stepAIC(compdat.zip2, direction = "backward")
stepAIC(compdat.zip2, direction = "forward")
stepAIC(compdat.zip2, direction = "both")
stepAIC(compdat.zinb7, direction = "backward")
stepAIC(compdat.zinb7, direction = "forward")
stepAIC(compdat.zinb7, direction = "both")
AIC(compdat.zip2,compdat.zinb7)
lrtest(compdat.zinb1,compdat.zinb8)
#Checking AIC shows that the model without interactions has a lower AIC, but that is expected considering the fact that it is simpler.
#Since it may be conflicting to use a model without interactions than one with, we want to check for the goodness of fit of both models.
#Running the likelihood test shows that the p value is less than 0.05, so we will not reject the null hypothesis and take the simpler model:
#zinb1, the model without interactions, as our final model.