-
Notifications
You must be signed in to change notification settings - Fork 2
/
HW 1 Mitchell Layton R Code.R
299 lines (206 loc) · 14.1 KB
/
HW 1 Mitchell Layton R Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#make sure to install packages in order to run
library(tidyverse)
library(ggmosaic)
library(plyr)
library(ggthemes)
library(grid)
#STA 141A -- Homework Assignment 1
# R Scripts:
# 1) Using dplyr package we can use either the count() OR n_distinct() function.
#Read in the file into a variable I names college13
college13 <- readRDS("E:/R Files/college_scorecard_13.rds")
total_observations <- sum(is.na(college13) == TRUE) + sum(is.na(college13) == FALSE)
cat("Total Observations: ", total_observations)
#total_colleges <- count(college13, college13$ope_id)
# OR USE
total_colleges <- sum(is.na(college13$ope_id) == FALSE)
cat("Total Colleges Recorded: ", total_colleges)
# 2)
total_features = ncol(college13)
cat("Total Features: ", total_features)
#For more information regarding the data.frame we use str() functions
data_info = str(college13)
# For finding out how many of the vectors in our data.frame() are
# categorical, discrete, or any other features we use the following
class_table = table(sapply(college13, class))
class_table
# This shows a contigency table using our data frame which selects each
# count of the combination of factor levels
#Set each index of our atomic vectors of the contigency table to display our results
chr_count = class_table[1]
factor_count = class_table[2]
int_count = class_table[3]
logi_count = class_table[4]
num_count = class_table[5]
cat("There are",(chr_count + logi_count + factor_count), "categorical vectors in the data frame.")
cat("Given that a discrete variable is a numberic variable where observations can take on
a value based on counts from a set of distinct whole values,
there is really only 1 discrete vector in the data frame which is 'Branches'.")
# 3)
# Total NA in data set
total_NA = sum(is.na(college13) == TRUE)
cat("There are",total_NA,"missing values in the data set")
# To find the column vector in the data frame with the largest number of NA's we use:
missing_values = max(sapply(college13, function(y) sum(is.na(y))))
most_NA = data.frame(missing_values, row.names = 'avg_sat')
cat("Here is the feature with most missing values:")
most_NA
# Explanation of code: Starting with the "sapply" which applies a particular function
# to every column vector of our college13. We then specify the function(y) and create it.
# This function finds the missing values (NA) and sums up the amount for each object.
# Then, using the max() function we find the largest number of NA.
# Last, I created a 1x1 data.frame() to display the avg_sat and its corresponding missing values.
table_NA_plot = sort(table(sapply(college13, function(y) sum(is.na(y)))))
table_NA_plot
mosaicplot(table_NA_plot, main = "Mosaic Plot: Missing Values (NA)", sub = "Occurances of NA's (Ascending)", off = 35, color = TRUE, cex.axis = 1.05)
# 4)
# More public or private colleges recorded:
type_of_college = table(college13$ownership)
type_of_college
# Proportion of highes degree awarded code:
ggplot(data = college13) +
geom_mosaic(aes(x = product(college13$highest_degree, college13$ownership), fill = factor(college13$highest_degree)), na.rm = TRUE, offset = 0.012) +
labs(x = "Ownership", y = "Proportion of Degree Awarded (# of specific degree ÷ total degrees)", title = "Proportions of Highest Degree Awarded") +
guides(fill = guide_legend(title = "Degrees Offered", reverse = TRUE)) +
theme(plot.title = element_text(size = rel(2))) +
theme(axis.title.y = element_text(size = rel(1.25))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.text.x=element_text(angle=25, hjust= .5)) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))
# REFERENCES: https://cran.r-project.org/web/packages/ggmosaic/vignettes/ggmosaic.htm AND R for Data Science.
# Explanation of code: We start by using the ggplot() with its large amount of graphical packages.
# From ggplot() we use the recently added ggmosaic package. From here the syntax is to call the ggplot function
# specifying the data we want to use, in this case college13. Then we set the particular <GEOM_FUNCTION> we desire in
# this case mosaic(). Then we specify the mapping arguement aes() to specify what variables to map.
# In our mapping funciton we specify the x object to be a product (a wrapper for a list)
# of our highest degree data and the ownership. The fill() fills in missing observations from the last non-missing value.
# Then we handle the spacing of the x factors with offset(). Lastly, we handle all adjustments to our plot using theme()
# The theme package offers so many adjustable factors and you just need to specify what and with which parameters.
# 5)
# Average undergraduate population:
avg_undergraduate_pop = round(mean(college13$undergrad_pop, na.rm = TRUE))
cat("The mean undergraduate population is:",avg_undergraduate_pop)
med_undergraduate_pop = round(median(college13$undergrad_pop, na.rm = TRUE))
cat("The median undergraduate population is:",med_undergraduate_pop)
# Decile:
decile_undergrad_pop = quantile(college13$undergrad_pop, prob = seq(.1:1, by = .1), na.rm = TRUE)
(decile_undergrad_pop)
# 6)
# According to, https://www.census.gov/newsroom/press-releases/2014/cb14-232.html the 5 most populous states are:
# (In order) California, Texas, Florida, New York, Illinois
ca = filter(college13, state %in% c("CA"))
CA = cbind(quantile(ca$tuition, na.rm = TRUE))
tx = filter(college13, state %in% c("TX"))
TX = quantile(tx$tuition, na.rm = TRUE)
fl = filter(college13, state %in% c("FL"))
FL = quantile(fl$tuition, na.rm = TRUE)
ny = filter(college13, state %in% c("NY"))
NY = quantile(ny$tuition, na.rm = TRUE)
il = filter(college13, state %in% c("IL"))
IL = quantile(il$tuition, na.rm = TRUE)
quantile_5_states = data.frame(cbind(CA,TX,FL,NY,IL))
quantile_5_states
# Index our data frame to just filter based on top 5 states
populous_states_data = filter(college13, state %in% c("CA","TX","FL","NY","IL"))
# Then we explicity set the factors (the 5 states) and their levels to be in the order we want.
# In this case I order them from left to right from the most populous to the least populous.
populous_states_data$state <- factor(populous_states_data$state, levels = c("CA","TX","FL","NY","IL"))
# Since our first feature is tuition (numerical) and second feature is state (categorical)
# lets use either a box plot:
# references: http://t-redactyl.io/blog/2016/04/creating-plots-in-r-using-ggplot2-part-10-boxplots.html
tuition_boxplot = ggplot(data = populous_states_data) +
geom_boxplot(aes(x = populous_states_data$state, y = populous_states_data$tuition), na.rm = TRUE, outlier.color = "red", ) +
labs(x = "5 Most Populous States", y = "Tuition ($)") + theme_bw() + ggtitle("Tuition Boxplot for 5 Most Populous States") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(size = rel(2))) +
theme(axis.title.y = element_text(size = rel(1.25))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.text.x=element_text(angle=25, hjust= .625)) +
theme(axis.title.x = element_text(size = rel(1.45))) +
theme(axis.text.y = element_text(size = rel(1.45))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))
# The themes I used from top to bottomt were to, center adjust the ggtitle() using plot.title and shifting the element text.
# Then I increased all 3 titles text size, as well as tilting the x axis title for nice visuals
# Furthermore, I increased the ticks sick and length.
tuition_boxplot
#7)
# Going to create a graph with x = spending per student (by colleges), y = student 10-year earnings for each ownership factor level.
#step 1: filter 3 instances with respect to ownership for entire data set.
public_data = filter(college13, ownership %in% "Public")
nonprofit_data = filter(college13, ownership %in% "Nonprofit")
private_data = filter(college13, ownership %in% "For Profit")
#PUBLIC
spend_per_stud_public = public_data$spend_per_student
avg_10yr_public = public_data$avg_10yr_salary
scatter.smooth(x = spend_per_stud_public, y = avg_10yr_public, xlab = "Spending per Student (Public Colleges)", ylab = "Average 10 Year Salary (Public)", main = "Public Avg 10 Yr Salary vs. Public Colleges Spending per Student")
#NONPROFIT
spend_per_stud_nonprofit = nonprofit_data$spend_per_student
avg_10yr_nonprofit = nonprofit_data$avg_10yr_salary
scatter.smooth(x = spend_per_stud_nonprofit, y = avg_10yr_nonprofit, xlab = "Spending per Student (Nonprofit Colleges)", ylab = "Average 10 Year Salary (Nonprofit)", main = "Nonprofit Avg 10 Yr Salary vs. Nonprofit Colleges Spending per Student")
#FOR PROFIT
spend_per_stud_private = private_data$spend_per_student
avg_10yr_private = private_data$avg_10yr_salary
scatter.smooth(x = spend_per_stud_private, y = avg_10yr_private, xlab = "Spending per Student (Private Colleges)", ylab = "Average 10 Year Salary (Private)", main = "Private Avg 10 Yr Salary vs. Private Colleges Spending per Student")
college13_spend_per_stud = college13$spend_per_student
scatter.smooth(x = college13_spend_per_stud, y = college13$avg_10yr_salary, xlab = "Spending per Student", ylab = "Average 10 Year Salary", main = "Avg 10 Yr Salary vs. Colleges Spending per Student")
#8)
college13 = college13 %>% mutate(
ratio = (college13$med_10yr_salary/college13$net_cost)
)
ratio_max = max(college13$ratio, na.rm = TRUE)
ratio_max
ratio_max = filter(college13, ratio_max == ratio)
ratio_max = ratio_max$name
cat("The largest ratio of median 10-year salary to cost is: ",ratio_max)
#9)
# Want to to detemine colleges that are most racially diverse. We will do so by checking the % of each race within each college and identifying
# the lowest standard deviations which means the data will be closest together meaning more diversity.
white = college13$race_white
black = college13$race_black
asian = college13$race_asian
hispanic = college13$race_hispanic
native = college13$race_native
pacific = college13$race_pacific
other = college13$race_other
race_data = data.frame(white,black,asian,hispanic,native,pacific,other)
#to add the variance as an additional column in the data frame
college13$variance = apply(race_data,1,function(row) var(as.vector(row[1:7])))
#Now we add the standard deviation variable into college13, then sort it and get if of the zeros
college13$sd = sqrt(college13$variance)
sorted = sort(college13$sd)
sorted = sorted[sorted > 0]
# Then we select top 5 uniersities (which I thought was a good idea to grab the top 5 from one of the classmates)
top_5_diverse = sorted[1:5]
# Filter our data based on the top_5_diverse standard deviation data points in our college13 data to give us all accompanying data for those 5.
diversity_1= filter(college13, sd %in% top_5_diverse)
diversity_2 = diversity_1 %>% select(name,race_white,race_black,race_hispanic,race_asian,race_native,race_other,sd) %>% arrange(sd)
diversity_2
#10)
require(data.table)
UC = college13 %>% filter(name %like% 'University of California')
UC %>% select(name, undergrad_pop, cost, spend_per_student, retention, completion, med_10yr_salary, sd)
require(data.table)
state_col = college13 %>% filter(name %like% 'State')
state_col %>% select(name, undergrad_pop, cost, spend_per_student, retention, completion, med_10yr_salary, sd)
retention_uc = mean(UC$retention, na.rm = TRUE)
retention_uc
retention_state = mean(state_col$retention, na.rm = TRUE)
retention_state
med10yr_uc = mean(UC$med_10yr_salary, na.rm = TRUE)
med10yr_uc
med10yr_state = mean(state_col$med_10yr_salary, na.rm = TRUE)
med10yr_state
sd_uc = mean(UC$sd, na.rm = TRUE)
sd_uc
sd_state = mean(state_col$sd, na.rm = TRUE)
sd_state
comp_uc = mean(UC$completion, na.rm = TRUE)
comp_uc
comp_state = mean(state_col$completion, na.rm = TRUE)
comp_state